### 2) Model Training

### Import Libraries

In [None]:
# Import Libraries
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
from google.cloud.aiplatform import pipeline_jobs
from typing import NamedTuple
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op
import pandas as pd
import numpy as np


### Import Data

In [None]:

# Import Dataset and Save in GCS
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="import_data.yaml",
)
def import_data(
            dataset_id: str,
            file_bucket: str,
    ) -> NamedTuple(
        "Outputs",
        [
         ("save_path", str),
         ("col_list", list)
        ]
    ):
    """Load the scikit-learn breast cancer dataset and write it to GCS as CSV.

    Args:
        dataset_id (str): used as both the folder name and the file-name prefix
            of the CSV inside the bucket.
        file_bucket (str): name of the GCS bucket (without the ``gs://`` prefix).

    Returns:
        save_path (str): ``gs://`` URI of the CSV that was written.
        col_list (list): names of every column except ``target``.
    """
    # Imports live inside the function because a KFP component must be self-contained.
    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    # Load the full dataset (features + labels).
    data = load_breast_cancer()

    # Assemble one frame with the feature columns followed by the label column.
    df = pd.DataFrame(data=data.data, columns=data.feature_names)
    df['target'] = pd.Series(data.target)

    # Write the frame to cloud storage.
    # index=False: the consumers in this notebook call pd.read_csv(save_path)
    # without index_col, so a written index would come back as an extra
    # "Unnamed: 0" column and be treated as a feature that is not in col_list.
    # The default RangeIndex is recreated on read, so row labels are unchanged.
    save_path = f'gs://{file_bucket}/{dataset_id}/{dataset_id}_data.csv'
    df.to_csv(save_path, index=False)

    print(f'{dataset_id}_data.csv saved in {save_path}')

    col_list = [col for col in df.columns if col != "target"]

    return (save_path, col_list)


### Model Training

In [None]:

# Load Dataset and Train Model
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="train_model.yaml",
)
def model_training(
            dataset_id: str,
            file_bucket: str,
            save_path: str,
            model: Output[Model],
            metrics: Output[Metrics],
            metricsc: Output[ClassificationMetrics],
            col_list: list
    ) -> NamedTuple(
        "Outputs",
        [
            ("accuracy", float),  # Return parameters
            ("f1_score", float),
            ("roc_auc", float),
            ("X_y_val_index", list),
            ("model_location", str)
        ],
    ):
    """Train an XGBoost classifier on the saved CSV and publish it to GCS.

    The data is split 70/30 into train/test, and 20% of the test portion is
    then held out as a validation set. The validation features and labels are
    written to ``gs://{file_bucket}/{dataset_id}/``. Evaluation metrics are
    computed on the remaining test rows.

    Args:
        dataset_id (str): folder / file-name prefix for the validation CSVs.
        file_bucket (str): GCS bucket name (without ``gs://``).
        save_path (str): URI of the CSV to train on; must contain a ``target`` column.
        model (Output[Model]): output artifact; its ``uri`` is set to the pickled model in GCS.
        metrics (Output[Metrics]): receives scalar evaluation metrics.
        metricsc (Output[ClassificationMetrics]): receives the ROC curve and confusion matrix.
        col_list (list): feature names stored alongside the model in the pickle.

    Returns:
        accuracy (float), f1_score (float), roc_auc (float): test-set metrics.
        X_y_val_index (list): row labels of the held-out validation rows.
        model_location (str): object path of the pickled model inside ``file_bucket``.
    """
    # Imports live inside the function because a KFP component must be self-contained.
    import time
    from datetime import datetime
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    import pickle
    from google.cloud import storage
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                                 recall_score, f1_score, roc_curve, confusion_matrix)

    # Read the csv that was saved by the upstream component.
    df = pd.read_csv(save_path)

    # X and y
    y = np.squeeze(df['target'].values)
    X = df.drop(columns='target')

    # Create the training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

    # Reserve some of the test samples for final validation.
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=123)

    # Export X_val and y_val to GCS for scoring. y_val carries the same row
    # labels as X_val so the two files can be joined on their index column.
    X_val.to_csv(f'gs://{file_bucket}/{dataset_id}/{dataset_id}_X_val.csv')
    y_val_df = pd.DataFrame({'target': y_val}, index=X_val.index)
    y_val_df.to_csv(f'gs://{file_bucket}/{dataset_id}/{dataset_id}_y_val.csv')

    # Instantiate the XGB classifier.
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=8,
        min_child_weight=1,
        max_delta_step=1,
        colsample_bytree=0.9,
        subsample=0.9,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        eval_metric='auc',
        base_score=0.5
    )

    # Fit the classifier to the training set.
    xgb_model.fit(X_train, y_train)

    # Predict on the test split.
    y_pred = xgb_model.predict(X_test)
    y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

    # Evaluation metrics. The F1 result is bound to `f1` so that the imported
    # `f1_score` function is not overwritten by its own return value.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    f1 = f1_score(y_test, y_pred)
    print("F1 Score:", f1)

    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print("ROC AUC Score:", roc_auc)

    # Log eval metrics.
    metrics.log_metric("Model", "XGBClassifier")
    metrics.log_metric("Size", df.shape[0])
    metrics.log_metric("Accuracy", accuracy)
    metrics.log_metric("AUC", roc_auc)
    metrics.log_metric("Precision", precision)
    metrics.log_metric("Recall", recall)
    metrics.log_metric("F1_Score", f1)

    # Compute fpr, tpr, thresholds for the ROC curve.
    fpr, tpr, thresholds = roc_curve(
        y_true=y_test, y_score=y_pred_proba, pos_label=True
    )

    # Log classification metrics.
    metricsc.log_roc_curve(fpr.tolist(), tpr.tolist(), thresholds.tolist())
    metricsc.log_confusion_matrix(['Malignant', 'Benign'], confusion_matrix(y_test, y_pred).tolist())

    # Bundle the model with its creation time and feature list.
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    model_artifacts = {
        'create_time': create_time,
        'model': xgb_model,
        'col_list': col_list,
    }

    # Serialize the bundle exactly once.
    with open('model_artifacts.pkl', 'wb') as pkl_file:
        pickle.dump(model_artifacts, pkl_file)

    # Create a GCS bucket handle.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # Folder where the models are saved; create an empty placeholder object if missing.
    model_path = 'breast_cancer_models/'
    blob = bucket.blob(model_path)
    if not blob.exists(storage_client):
        blob.upload_from_string('')

    # Name the object after the creation time and upload the pickle.
    model_name = 'breast_cancer_models_{}'.format(create_time)
    model_location = f'{model_path}{model_name}'
    blob = bucket.blob(model_location)
    blob.upload_from_filename('model_artifacts.pkl')

    print(f"Model artifacts loaded to GCS Bucket: {model_location}")

    model.uri = f'gs://{file_bucket}/{model_location}'

    # NOTE(review): the purpose of this two-minute pause is not evident from the
    # code; it is kept to preserve existing timing. Confirm whether it is needed.
    time.sleep(120)

    # Cast to built-in types so the outputs serialize cleanly.
    return (float(accuracy), float(f1), float(roc_auc), X_val.index.tolist(), model_location)
    

### Model Evaluation

In [None]:
# Evaluate the model performance
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="model_evaluation.yaml",
)
def model_evaluation(
            accuracy: float,
            f1_score: float,
            roc_auc: float,
            accuracy_threshold: float,
            f1_score_threshold: float,
            roc_auc_threshold: float
            ) -> NamedTuple(
                "Output", [("result", str)]
            ):
    """Gate the model on its evaluation metrics.

    Args:
        accuracy, f1_score, roc_auc (float): metrics to check.
        accuracy_threshold, f1_score_threshold, roc_auc_threshold (float):
            minimum acceptable value for the matching metric.

    Returns:
        result (str): "Fail" if any metric is strictly below its threshold,
        otherwise "Pass".
    """
    # Pair each metric with the threshold it must reach.
    metric_checks = (
        (accuracy, accuracy_threshold),
        (f1_score, f1_score_threshold),
        (roc_auc, roc_auc_threshold),
    )

    # A single metric below its threshold is enough to fail the gate.
    below_threshold = any(value < minimum for value, minimum in metric_checks)

    return ("Fail",) if below_threshold else ("Pass",)
    

### Upload model to Model Registry

In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2.dsl import (Model, Input, Output, component)

# Component for uploading model to Vertex Model Registry
@component(
# Uploads model
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="model-upload.yaml",
)
def upload_model_to_mr(
    project_id: str,
    model: Input[Model],
    vertex_model: Output[Model],
    region: str,
    model_name: str,
    prediction_image: str,
    col_list: list,
    result: str
):
    """
    Upload model to Vertex Model Registry when the evaluation gate passed.

    If a model with the same display name already exists, the upload is
    registered as a new default version of the most recently updated one;
    otherwise a new model is created. Errors from the registry are allowed to
    propagate instead of being swallowed, so a failed version upload no longer
    silently creates an unrelated new model.

    Args:
        project_id (str): project id for where this pipeline is being run
        model (Input[Model]): model passed in from training component. Must have path specified in model.uri
        region (str): region of the Model Registry
        model_name (str): display name of the model to be stored
        prediction_image (str): prediction image uri
        col_list (list): columns in the serving data; passed to the serving
            container as the COL_LIST environment variable (stringified)
        result (str): outcome of the evaluation step; the upload only happens
            when this is exactly "Pass"
    Returns:
        vertex_model (Output[Model]): Model saved in Vertex AI
    """

    from google.cloud import aiplatform
    import os
    from datetime import datetime

    aiplatform.init(project=project_id, location=region)

    if result != "Pass":
        print("Training performance is not satisfactory. Upload to the Model Registry revoked.")
        return

    # Arguments shared by both the "new model" and "new version" uploads.
    upload_kwargs = {
        "display_name": model_name,
        # The registry is given the folder that contains the artifact, not the file itself.
        "artifact_uri": os.path.dirname(model.uri),
        "serving_container_image_uri": prediction_image,
        "serving_container_environment_variables": {"COL_LIST": str(col_list)},  # remove for posting
    }

    # Custom images must declare their routes and port. These settings were
    # previously computed but never passed to the upload call.
    # Google pre-built images keep the SDK defaults.
    if prediction_image.startswith('northamerica-northeast1-docker'):
        upload_kwargs.update(
            serving_container_health_route="/ping",
            serving_container_predict_route="/predict",
            serving_container_ports=[7080],
        )

    # Check for existing models with this display name, oldest update first.
    existing_models = aiplatform.Model.list(
        filter=f'display_name={model_name}',
        order_by="update_time",
        location=region)

    if existing_models:
        # Model exists: upload as a new default version of the most recently updated one.
        upload_kwargs.update(
            parent_model=existing_models[-1].resource_name,
            is_default_version=True,
        )

    uploaded_model = aiplatform.Model.upload(**upload_kwargs)

    vertex_model.uri = uploaded_model.resource_name
    # NOTE(review): these are set as plain attributes on the artifact object, not
    # in vertex_model.metadata; confirm whether they are meant to be persisted.
    vertex_model.version_create_time = datetime.now()
    vertex_model.version_description = "breast cancer model"
        
    

### Load Model from Model Registry

In [None]:
from kfp.v2.dsl import (Artifact, Output, Input, HTML, component)

# Load Custom Model Component: load in most recent version of your model to run batch predictions with
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="load_model.yaml"
)
# this component returns a model artifact that will be passed on to Batch Predictions
# NOTE(review): a later cell in this notebook defines `load_model` again; on a
# top-to-bottom run the later definition replaces this one.
def load_model(
                project_id: str,
                region: str,
                model_name: str,
                model: Output[Artifact]):
    """Point an output artifact at the most recently updated registry model.

    Args:
        project_id (str): project that hosts the Model Registry.
        region (str): region of the Model Registry.
        model_name (str): display name to look up.
        model (Output[Artifact]): output artifact; its ``uri`` and
            ``metadata['resourceName']`` are set to the model's resource name.

    Raises:
        ValueError: if no model with that display name exists in the region.
    """
    from google.cloud import aiplatform

    # Use the project that was passed in rather than whatever the environment defaults to.
    aiplatform.init(project=project_id, location=region)

    # Ordered by update time, so the last entry is the most recently updated model.
    matching_models = aiplatform.Model.list(
                                    filter=f'display_name={model_name}',
                                    order_by="update_time",
                                    location=region)
    if not matching_models:
        raise ValueError(
            f"No model with display_name={model_name!r} found in region {region!r}")

    model_uid = matching_models[-1].resource_name
    model.uri = model_uid
    model.metadata['resourceName'] = model_uid
    

In [None]:
from kfp.v2.dsl import (Artifact, Output, Input, HTML, component)
from dataclasses import dataclass

@dataclass
class ModelOutput:
    """Holder for a model URI.

    Kept for backward compatibility with any code that imports it; the
    ``load_model`` component below no longer uses it as its return type.
    """
    model_uri: str

# Load Custom Model Component: load in most recent version of your model to run batch predictions with
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="load_model.yaml"
)
# this component returns a model artifact that will be passed on to Batch Predictions
def load_model(
                project_id: str,
                region: str,
                model_name: str,
                model: Output[Artifact]
                ) -> NamedTuple("Outputs", [("model_uri", str)]):
    """Point an output artifact at the most recently updated registry model.

    The return value is declared as a NamedTuple, matching the other
    components in this notebook. The previous dataclass return annotation
    referred to a class defined outside the component function, which is not
    part of the function's own source.

    Args:
        project_id (str): project that hosts the Model Registry.
        region (str): region of the Model Registry.
        model_name (str): display name to look up.
        model (Output[Artifact]): output artifact; its ``uri`` and
            ``metadata['resourceName']`` are set to the model's resource name.

    Returns:
        model_uri (str): the same resource name that was written to ``model.uri``.

    Raises:
        ValueError: if no model with that display name exists in the region.
    """
    from google.cloud import aiplatform

    # Use the project that was passed in rather than whatever the environment defaults to.
    aiplatform.init(project=project_id, location=region)

    # Ordered by update time, so the last entry is the most recently updated model.
    matching_models = aiplatform.Model.list(
                                    filter=f'display_name={model_name}',
                                    order_by="update_time",
                                    location=region)
    if not matching_models:
        raise ValueError(
            f"No model with display_name={model_name!r} found in region {region!r}")

    model_uid = matching_models[-1].resource_name
    model.uri = model_uid
    model.metadata['resourceName'] = model_uid

    return (str(model_uid),)


### Batch Prediction - 1

In [None]:
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component, HTML)
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="batch_prediction.yaml",
)
def batch_prediction(
        project_id: str,
        dataset_id: str,
        file_bucket: str,
        val_index: list,
        save_path: str,
        model: Input[Model],
        metrics: Output[Metrics],
        metricsc: Output[ClassificationMetrics],
):
    """Score the held-out validation rows with the latest pickled model in GCS.

    Args:
        project_id (str): not used by this component.
        dataset_id (str): not used by this component.
        file_bucket (str): GCS bucket that holds the pickled models and
            receives ``breast_cancer/model_validation.csv``.
        val_index (list): row labels of the validation rows within the CSV at ``save_path``.
        save_path (str): URI of the full dataset CSV; must contain a ``target`` column.
        model (Input[Model]): upstream model artifact; only its ``uri`` is printed.
        metrics (Output[Metrics]): receives scalar evaluation metrics.
        metricsc (Output[ClassificationMetrics]): declared but not written to.

    Raises:
        FileNotFoundError: if no pickled model is found under the expected prefix.
    """
    import time
    import pandas as pd
    import numpy as np
    import pickle
    from google.cloud import storage
    from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

    # Read the csv that was saved by the data-import component.
    df = pd.read_csv(save_path)

    # X and y
    X = df.drop(columns='target')
    y = df['target']

    # val_index holds row labels, so select features and labels the same way (.loc).
    X_val = X.loc[val_index]
    y_val = np.squeeze(y.loc[val_index].values)

    # NOTE(review): the purpose of the pauses in this component is not evident
    # from the code; they are kept to preserve existing timing.
    time.sleep(10)

    print(str(model.uri))

    model_path = 'breast_cancer_models/'

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    blobs = storage_client.list_blobs(file_bucket, prefix='{}breast_cancer_models'.format(model_path))

    model_names = [blob.name for blob in blobs]
    if not model_names:
        raise FileNotFoundError(
            f"No model artifacts found in gs://{file_bucket}/{model_path}")

    # Object names end in a "%Y-%m-%d %H:%M:%S" timestamp written by the
    # training component, so the greatest name is the newest model.
    blob = bucket.blob(max(model_names))
    blob_in = blob.download_as_string()
    # NOTE(review): pickle.loads can execute arbitrary code; this is only safe
    # while the bucket contents are trusted.
    model_dict = pickle.loads(blob_in)
    model_xgb = model_dict['model']
    print('...... model loaded')
    time.sleep(10)

    # Score the validation rows.
    y_pred = model_xgb.predict(X_val)
    y_pred_proba = model_xgb.predict_proba(X_val)[:, 1]

    # One row per validation sample. The labels go in 'y_val'; previously they
    # were written to an extra 'y_test' column and 'y_val' was left empty.
    result = pd.DataFrame({
        'index': X_val.index.to_list(),
        'y_pred_proba': y_pred_proba,
        'y_pred': y_pred,
        'y_val': y_val,
    })

    result.to_csv('gs://{}/breast_cancer/model_validation.csv'.format(file_bucket), index=True)

    # Evaluation metrics. The F1 result is bound to `f1` so that the imported
    # `f1_score` function is not overwritten by its own return value.
    accuracy = accuracy_score(y_val, y_pred)
    print("Accuracy:", accuracy)

    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)

    f1 = f1_score(y_val, y_pred)
    print("F1 Score:", f1)

    roc_auc = roc_auc_score(y_val, y_pred_proba)
    print("ROC AUC Score:", roc_auc)

    # Log eval metrics.
    metrics.log_metric("Model", "XGBClassifier")
    metrics.log_metric("Size", X_val.shape[0])
    metrics.log_metric("Accuracy", accuracy)
    metrics.log_metric("AUC", roc_auc)
    metrics.log_metric("Precision", precision)
    metrics.log_metric("Recall", recall)
    metrics.log_metric("F1_Score", f1)

    time.sleep(60)
    print(f"Batch prediction for {X_val.shape[0]} samples completed")
    


### Batch Prediction - 2

In [None]:
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component, HTML)
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="batch_prediction.yaml",
)
def batch_prediction(
        region: str,
        project_id: str,
        dataset_id: str,
        file_bucket: str,
        val_index: list,
        save_path: str,
        model_name: str,
        model: Output[Artifact],
        metrics: Output[Metrics],
        metricsc: Output[ClassificationMetrics],
):
    """Look up the latest registry model, then score the validation rows.

    The registry lookup only populates the ``model`` output artifact. The
    predictions themselves come from the newest pickled model stored in GCS.

    Args:
        region (str): region of the Model Registry.
        project_id (str): project that hosts the Model Registry.
        dataset_id (str): not used by this component.
        file_bucket (str): GCS bucket that holds the pickled models and
            receives ``breast_cancer/model_validation.csv``.
        val_index (list): row labels of the validation rows within the CSV at ``save_path``.
        save_path (str): URI of the full dataset CSV; must contain a ``target`` column.
        model_name (str): display name to look up in the Model Registry.
        model (Output[Artifact]): output artifact; its ``uri`` and
            ``metadata['resourceName']`` are set to the model's resource name.
        metrics (Output[Metrics]): receives scalar evaluation metrics.
        metricsc (Output[ClassificationMetrics]): declared but not written to.

    Raises:
        ValueError: if no registry model has the given display name.
        FileNotFoundError: if no pickled model is found under the expected prefix.
    """
    import time
    import pandas as pd
    import numpy as np
    import pickle
    from google.cloud import storage
    from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
    from google.cloud import aiplatform

    # Use the project that was passed in rather than whatever the environment defaults to.
    aiplatform.init(project=project_id, location=region)

    # Ordered by update time, so the last entry is the most recently updated model.
    matching_models = aiplatform.Model.list(
                                    filter=f'display_name={model_name}',
                                    order_by="update_time",
                                    location=region)
    if not matching_models:
        raise ValueError(
            f"No model with display_name={model_name!r} found in region {region!r}")

    model_uid = matching_models[-1].resource_name
    model.uri = model_uid
    model.metadata['resourceName'] = model_uid

    # Read the csv that was saved by the data-import component.
    df = pd.read_csv(save_path)

    # X and y
    X = df.drop(columns='target')
    y = df['target']

    # val_index holds row labels, so select features and labels the same way (.loc).
    X_val = X.loc[val_index]
    y_val = np.squeeze(y.loc[val_index].values)

    # NOTE(review): the purpose of the pauses in this component is not evident
    # from the code; they are kept to preserve existing timing.
    time.sleep(10)

    model_path = 'breast_cancer_models/'

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    blobs = storage_client.list_blobs(file_bucket, prefix='{}breast_cancer_models'.format(model_path))

    model_names = [blob.name for blob in blobs]
    if not model_names:
        raise FileNotFoundError(
            f"No model artifacts found in gs://{file_bucket}/{model_path}")

    # Object names end in a "%Y-%m-%d %H:%M:%S" timestamp written by the
    # training component, so the greatest name is the newest model.
    blob = bucket.blob(max(model_names))
    blob_in = blob.download_as_string()
    # NOTE(review): pickle.loads can execute arbitrary code; this is only safe
    # while the bucket contents are trusted.
    model_dict = pickle.loads(blob_in)
    model_xgb = model_dict['model']
    print('...... model loaded')
    time.sleep(10)

    # Score the validation rows.
    y_pred = model_xgb.predict(X_val)
    y_pred_proba = model_xgb.predict_proba(X_val)[:, 1]

    # One row per validation sample. The labels go in 'y_val'; previously they
    # were written to an extra 'y_test' column and 'y_val' was left empty.
    result = pd.DataFrame({
        'index': X_val.index.to_list(),
        'y_pred_proba': y_pred_proba,
        'y_pred': y_pred,
        'y_val': y_val,
    })

    result.to_csv('gs://{}/breast_cancer/model_validation.csv'.format(file_bucket), index=True)

    # Evaluation metrics. The F1 result is bound to `f1` so that the imported
    # `f1_score` function is not overwritten by its own return value.
    accuracy = accuracy_score(y_val, y_pred)
    print("Accuracy:", accuracy)

    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)

    f1 = f1_score(y_val, y_pred)
    print("F1 Score:", f1)

    roc_auc = roc_auc_score(y_val, y_pred_proba)
    print("ROC AUC Score:", roc_auc)

    # Log eval metrics.
    metrics.log_metric("Model", "XGBClassifier")
    metrics.log_metric("Size", X_val.shape[0])
    metrics.log_metric("Accuracy", accuracy)
    metrics.log_metric("AUC", roc_auc)
    metrics.log_metric("Precision", precision)
    metrics.log_metric("Recall", recall)
    metrics.log_metric("F1_Score", f1)

    time.sleep(60)
    print(f"Batch prediction for {X_val.shape[0]} samples completed")
    


In [None]:
# import time
# import pandas as pd
# import numpy as np
# import pickle
# from datetime import date
# from dateutil.relativedelta import relativedelta
# from google.cloud import bigquery
# from google.cloud import storage

# file_bucket = FILE_BUCKET
# dataset_id = DATASET_ID 

# save_path = f'gs://{file_bucket}/{dataset_id}/{dataset_id}_data.csv'
# val_index = [22, 138, 192, 190, 260, 498, 157, 309, 454, 166, 202, 488, 33, 480, 205, 345, 334, 175, 520, 399, 511, 24, 400, 49, 74, 230, 557, 327, 43, 436, 456, 287, 209, 410, 326]

# # Read csv that was saved in 'import_data' component
# df = pd.read_csv(save_path)  

# # X and y
# X = df.drop(columns='target')
# y = df['target']

# X_val = X.loc[val_index] 
# y_val = np.squeeze(y.iloc[val_index].values) 

# result = pd.DataFrame(columns=['index', 'y_pred_proba', 'y_pred', 'y_val'])
# result['index'] = X_val.index
# result['index'] = result['index'].astype(int)


# result

In [None]:
#tag cell with parameters
# Project and storage locations. All three bucket names refer to the
# project's "-default" bucket.
PROJECT_ID = 'divg-josh-pr-d1cc3a'
BUCKET_NAME = f'{PROJECT_ID}-default'
RESOURCE_BUCKET = BUCKET_NAME
FILE_BUCKET = BUCKET_NAME

# Dataset and model identifiers.
DATASET_ID = 'breast_cancer'
MODEL_ID = '5070'
MODEL_NAME = 'breast_cancer'

# Where the pipeline runs and which image serves predictions.
REGION = 'northamerica-northeast1'
PREDICTION_IMAGE = 'northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest'

# batch predictions parameters
_DATASET_FOLDER_URI = 'gs://{}/{}/'.format(FILE_BUCKET, DATASET_ID)
GCS_SOURCE_INPUT_URI = _DATASET_FOLDER_URI + '{}_X_val.csv'.format(DATASET_ID)
GCS_DESTINATION_OUTPUT_URI = _DATASET_FOLDER_URI
BATCH_PREDICTIONS_DISPLAY_NAME = 'breast_cancer_batch_predictions'
INSTANCES_FORMAT = PREDICTIONS_FORMAT = 'csv'
MACHINE_TYPE = 'n1-standard-2'


In [None]:
# library imports
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
@dsl.pipeline(
    name='breast-cancer-pipeline',
    description='breast-cancer-pipeline'
    )
def pipeline(
        dataset_id: str = DATASET_ID,
        file_bucket: str = FILE_BUCKET,
        region: str = REGION
    ):
    """Import data, train, gate on metrics, register the model, then batch-predict.

    Args:
        dataset_id (str): folder / file-name prefix used by the data and training steps.
        file_bucket (str): GCS bucket used by the data and training steps.
        region (str): region used for the Model Registry and the batch prediction job.
    """
    # ----- create training set --------
    import_data_op = import_data(dataset_id=dataset_id,
                          file_bucket=file_bucket)

    # ----- train and evaluate --------
    model_training_op = model_training(dataset_id=dataset_id,
                          file_bucket=file_bucket,
                          save_path=import_data_op.outputs['save_path'],
                          col_list=import_data_op.outputs["col_list"])

    model_evaluation_op = model_evaluation(
                          accuracy=model_training_op.outputs["accuracy"],
                          f1_score=model_training_op.outputs["f1_score"],
                          roc_auc=model_training_op.outputs["roc_auc"],
                          accuracy_threshold=0.95,
                          f1_score_threshold=0.95,
                          roc_auc_threshold=0.95
                          )

    # ----- register the model (only uploads when the gate returned "Pass") --------
    upload_model_to_mr_op = upload_model_to_mr(
                        project_id=PROJECT_ID,
                        region=region,
                        model=model_training_op.outputs["model"],
                        model_name=MODEL_NAME,
                        prediction_image=PREDICTION_IMAGE,
                        col_list=import_data_op.outputs["col_list"],
                        result=model_evaluation_op.outputs['result'])

    # ----- batch predictions with the latest registered model --------
    load_model_op = load_model(
                        project_id=PROJECT_ID,
                        region=region,
                        model_name=MODEL_NAME)

    # NOTE(review): 20-30 replicas looks large for a single validation CSV;
    # confirm the sizing is intentional.
    batch_predict_task = batch_prediction_op(
                                            project=PROJECT_ID,
                                            location=region,
                                            # Reference the artifact by name: `.output` is only
                                            # usable when a task has exactly one output.
                                            model=load_model_op.outputs["model"],
                                            job_display_name=BATCH_PREDICTIONS_DISPLAY_NAME,
                                            # The batch predict op takes a list of source URIs.
                                            gcs_source_uris=[GCS_SOURCE_INPUT_URI],
                                            gcs_destination_output_uri_prefix=GCS_DESTINATION_OUTPUT_URI,
                                            instances_format=INSTANCES_FORMAT,
                                            predictions_format=PREDICTIONS_FORMAT,
                                            machine_type=MACHINE_TYPE,
                                            starting_replica_count=20,
                                            max_replica_count=30
                                        )

    # Explicit ordering. load_model has no data dependency on the upload step,
    # so without .after() it could run before the new version is registered.
    model_training_op.after(import_data_op)
    model_evaluation_op.after(model_training_op)
    upload_model_to_mr_op.after(model_evaluation_op)
    load_model_op.after(upload_model_to_mr_op)
    batch_predict_task.after(load_model_op)
    


In [None]:
import google.oauth2.credentials
import json

import subprocess

# Fetch a short-lived access token from the gcloud CLI. check=True makes a
# failed gcloud call raise here instead of passing an error line on as a token.
token = subprocess.run(
    ["gcloud", "auth", "print-access-token"],
    check=True,
    capture_output=True,
    text=True,
).stdout.splitlines()
if not token or not token[0].strip():
    raise RuntimeError("gcloud auth print-access-token returned no token")
CREDENTIALS = google.oauth2.credentials.Credentials(token[0].strip())

# Compile the pipeline function into a job spec file.
compiler.Compiler().compile(
   pipeline_func=pipeline, package_path="pipeline.json"
)

# Create the Vertex AI pipeline job from the compiled spec.
job = pipeline_jobs.PipelineJob(
   display_name='breast-cancer-pipeline',
   template_path="pipeline.json",
   credentials = CREDENTIALS,
   pipeline_root = f"gs://{FILE_BUCKET}",
   location=REGION,
   enable_caching=False # I encourage you to enable caching when testing as it will reduce resource use
)

# Submit the job.
job.run()