In [1]:
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# from jedi import settings
# settings.case_insensitive_completion = True

In [2]:
# # Install ai platform and kfp
# USER_FLAG = "--user"
# !pip3 install {USER_FLAG} google-cloud-aiplatform==1.3.0 --upgrade
# !pip3 install {USER_FLAG} kfp --upgrade
# !pip install google_cloud_pipeline_components

In [3]:
# !pip install kfp --upgrade

In [4]:
# !gcloud services enable compute.googleapis.com         \
#                        containerregistry.googleapis.com  \
#                        aiplatform.googleapis.com  \
#                        cloudbuild.googleapis.com \
#                        cloudfunctions.googleapis.com

In [5]:
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [6]:
# pip flag interpolated by the commented-out install cell above; no other visible cell reads it.
USER_FLAG = "--user"
# Run `!gcloud auth login` first if this environment is not authenticated yet.

In [7]:
# Get the project name from the active gcloud configuration (stderr is discarded).
# The IPython `!` capture returns a list of output lines; the first line is the project id.
shell_output=!gcloud config get-value project 2> /dev/null
PROJECT_ID=shell_output[0]
PROJECT_ID

'gpa-poc-001'

In [8]:
# Base Cloud Storage location for pipeline artifacts (a gs:// URI that already includes the `artifacts` folder).
BUCKET_NAME="gs://gpa-churn/artifacts"

# Root path handed to @dsl.pipeline as `pipeline_root`; nothing is created here, the bucket must already exist.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline-vertexai/"
PIPELINE_ROOT

'gs://gpa-churn/artifacts/pipeline-vertexai/'

In [9]:
# Location used when the PipelineJob is submitted (see the `location=REGION` argument further down).
# NOTE(review): the pipeline's own `region` parameter defaults to "us-central1", so the endpoint and
# monitoring job are created in a different region than the pipeline run - confirm this is intended.
REGION="southamerica-east1"
REGION

'southamerica-east1'

---

## Creating pipeline components

In [10]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="get_preprocessed_data.yaml"
)
def get_preprocessed_data(
    Xtrain_: Output[Dataset],
    Xval_: Output[Dataset],
    ytrain_: Output[Dataset],
    yval_: Output[Dataset],
    prefix:str='gs://gpa-churn/data/processed/input/'
    ):
    """Load the preprocessed parquet files from Cloud Storage and split them.

    The objects listed under ``prefix`` are read with pandas and concatenated.
    The ``cod_cliente`` column and duplicated rows are dropped, then the data
    is split 85/15 into training and validation sets with a fixed seed.

    Args:
        Xtrain_: Output artifact; training features go to its path + '.parquet'.
        Xval_: Output artifact; validation features go to its path + '.parquet'.
        ytrain_: Output artifact; training target goes to its path + '.parquet'.
        yval_: Output artifact; validation target goes to its path + '.parquet'.
        prefix: ``gs://<bucket>/<folder>/`` location of the input parquet files.
            Both the bucket and the folder are taken from this value.

    Raises:
        ValueError: If ``prefix`` is not a gs:// URI or no input object is left
            to read.
    """
    import gc
    import pandas as pd
    from google.cloud import storage
    from sklearn.model_selection import train_test_split

    # Split 'gs://<bucket>/<object prefix>' so that listing AND reading use the
    # same bucket/folder (previously reads were pinned to 'gs://gpa-churn/').
    scheme = 'gs://'
    if not prefix.startswith(scheme):
        raise ValueError(f'prefix must be a gs:// URI, got: {prefix}')
    bucket, _, object_prefix = prefix[len(scheme):].partition('/')

    storage_client = storage.Client()
    obj_list = [
        blob.name
        for blob in storage_client.list_blobs(bucket, prefix=object_prefix)
    ]
    # NOTE(review): the first listed object is skipped, as in the original code.
    # Presumably it is the zero-byte "folder" placeholder object - TODO confirm;
    # if no placeholder exists this silently drops one real data file.
    obj_list = obj_list[1:]

    df_list = []
    for obj in obj_list:
        uri = f'gs://{bucket}/{obj}'
        df_list.append(pd.read_parquet(uri))
        print(f'added {uri}')

    if not df_list:
        raise ValueError(f'no input objects found under {prefix}')

    df = pd.concat(df_list, axis=0)
    df.drop(columns=['cod_cliente'], inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    target = 'target'
    features = [column for column in df.columns if column != target]

    Xtrain, Xval, ytrain, yval = train_test_split(
        df[features],
        df[[target]],  # kept as a one-column DataFrame so it can be written to parquet
        test_size=0.15,
        random_state=501
        )

    # The full frame is no longer needed once the splits exist; free it early.
    del df
    gc.collect()

    for split in (Xtrain, Xval, ytrain, yval):
        split.reset_index(drop=True, inplace=True)
    print('Successfully read training data')
    print('shapes:')
    print(f'xtrain:{Xtrain.shape}, ytrain:{ytrain.shape}')
    print(f'xval:{Xval.shape}, yval:{yval.shape}')

    Xtrain.to_parquet(Xtrain_.path + '.parquet', index=False, compression='gzip')
    Xval.to_parquet(Xval_.path + '.parquet', index=False, compression='gzip')
    ytrain.to_parquet(ytrain_.path + '.parquet', index=False, compression='gzip')
    yval.to_parquet(yval_.path + '.parquet', index=False, compression='gzip')

In [11]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="feature_engineering_sequence.yaml"
)
def feature_engineering_sequence(
    Xtrain_: Input[Dataset],
    Xval_: Input[Dataset],
    Xtrain_fe: Output[Dataset],
    Xval_fe: Output[Dataset],
    fe_pipeline_: Output[Model]
    ):
    """Fit the feature-engineering pipeline on the training set and apply it.

    A scikit-learn ``Pipeline`` made of steps from ``src.pipeline_modules`` is
    fitted on the training features and then applied to the validation
    features. The fitted pipeline is saved with joblib and both transformed
    frames are written as gzip-compressed parquet.

    Args:
        Xtrain_: Input artifact; training features are read from its path + '.parquet'.
        Xval_: Input artifact; validation features are read from its path + '.parquet'.
        Xtrain_fe: Output artifact for the transformed training features.
        Xval_fe: Output artifact for the transformed validation features.
        fe_pipeline_: Output artifact; the fitted pipeline goes to its path + '.joblib'.
    """
    import os
    import sys
    import pytz
    import joblib
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from sklearn.pipeline import Pipeline

    # The project package lives under /usr/app inside the base image.
    for extra_path in ('/usr/app/', '/usr/app/src'):
        sys.path.append(extra_path)
    import src.pipeline_modules as pipeline_modules

    train_features = pd.read_parquet(Xtrain_.path + ".parquet")
    val_features = pd.read_parquet(Xval_.path + ".parquet")

    # Every base column also exists with an 'agg_l3m_' prefix, in the same order.
    base_numerical_columns = [
        'val_venda_bruta_cupom',
        'qtd_item_venda',
        'flg_vend_meu_desct',
        'valor_desconto',
        'flag_dev',
        *[f'tipo_promo_{idx}' for idx in range(6)],
        *[f'categoria_{idx}' for idx in range(8)],
        'departamento_0',
        'compras_mes',
    ]
    numerical_columns = base_numerical_columns + [
        f'agg_l3m_{name}' for name in base_numerical_columns
    ]

    outlier_columns_mean = ['pib_percapita', 'idade', 'delta_de_cadastro', 'delta_de_stix']

    base_yeojohnson_columns = [
        'val_venda_bruta_cupom',
        'qtd_item_venda',
        'flg_vend_meu_desct',
        'valor_desconto',
        'compras_mes',
    ]
    yeojohnson_columns = (
        base_yeojohnson_columns
        + [f'agg_l3m_{name}' for name in base_yeojohnson_columns]
        + ['pib_percapita', 'idade', 'delta_de_cadastro']
    )

    # Step order matters: each transformer sees the output of the previous one.
    steps = [
        ('drop_temporary_columns', pipeline_modules.drop_temporary_columns()),
        ('drop_with_low_variance',
         pipeline_modules.drop_numerical_with_variance(columns=numerical_columns)),
        ('encode_sex_column', pipeline_modules.encode_sex_column()),
        ('group_rare_regions',
         pipeline_modules.group_rare_categorical(columns=['region'], threshold=0.002)),
        ('encode_regions', pipeline_modules.encode_categorical(columns=['region'])),
        ('handle_outliers_max',
         pipeline_modules.outlier_handling(
             columns=numerical_columns, method='gauss', band=2.8, action='max')),
        ('handle_outliers_mean',
         pipeline_modules.outlier_handling(
             columns=outlier_columns_mean, method='gauss', band=2.5, action='mean')),
        ('handle_negative_values',
         pipeline_modules.handle_negative_values(columns=numerical_columns)),
        ('fill_missing_numerical_zero',
         pipeline_modules.fill_na_values_with_zero(
             columns=['ind_email', 'cadastro_stix', 'delta_de_cadastro', 'delta_de_stix'])),
        # NOTE(review): this step is named "..._mean" but uses the zero-filling
        # transformer - confirm whether a mean-filling transformer was intended.
        ('fill_missing_numerical_mean',
         pipeline_modules.fill_na_values_with_zero(columns=['pib_percapita', 'idade'])),
        ('transform_yeojohnson',
         pipeline_modules.data_transformation(
             columns=yeojohnson_columns, method='yeojohnson')),
    ]
    fe_pipeline = Pipeline(steps)

    # Fit on the training set only, then reuse the fitted steps on validation.
    train_transformed = fe_pipeline.fit_transform(train_features)
    val_transformed = fe_pipeline.transform(val_features)

    # Persist the fitted pipeline next to the transformed datasets.
    with open(fe_pipeline_.path + '.joblib', 'wb') as handle:
        joblib.dump(fe_pipeline, handle)

    train_transformed.to_parquet(Xtrain_fe.path + '.parquet', index=False, compression='gzip')
    val_transformed.to_parquet(Xval_fe.path + '.parquet', index=False, compression='gzip')

In [12]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="feature_selection_sequence.yaml"
)
def feature_selection_sequence(
    Xtrain_fe: Input[Dataset],
    Xval_fe: Input[Dataset],
    ytrain_: Input[Dataset],
    yval_: Input[Dataset],
    Xtrain_fs: Output[Dataset],
    Xval_fs: Output[Dataset],
    fs_pipeline_: Output[Model],
    baseline_df_: Output[Dataset]
    ):
    """Fit the feature-selection pipeline and build the monitoring baseline.

    A one-step scikit-learn ``Pipeline`` (``select_with_correlation`` from
    ``src.pipeline_modules``) is fitted on the training data and applied to the
    validation features. The selected training features are concatenated
    column-wise with the training target to form the baseline frame.

    Args:
        Xtrain_fe: Input artifact with the engineered training features.
        Xval_fe: Input artifact with the engineered validation features.
        ytrain_: Input artifact with the training target.
        yval_: Input artifact with the validation target (read but not used further).
        Xtrain_fs: Output artifact for the selected training features.
        Xval_fs: Output artifact for the selected validation features.
        fs_pipeline_: Output artifact; the fitted pipeline goes to its path + '.joblib'.
        baseline_df_: Output artifact for the baseline (features plus target).
    """
    import os
    import sys
    import pytz
    import joblib
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from sklearn.pipeline import Pipeline

    # The project package lives under /usr/app inside the base image.
    for extra_path in ('/usr/app/', '/usr/app/src'):
        sys.path.append(extra_path)
    import src.utils as utils
    import src.pipeline_modules as pipeline_modules

    def read_artifact(artifact):
        # Datasets are stored at the artifact path with a '.parquet' suffix.
        return pd.read_parquet(artifact.path + ".parquet")

    def write_artifact(frame, artifact):
        frame.to_parquet(artifact.path + '.parquet', index=False, compression='gzip')

    train_features = read_artifact(Xtrain_fe)
    val_features = read_artifact(Xval_fe)
    train_target = read_artifact(ytrain_)
    val_target = read_artifact(yval_)

    # Fit the selector on the training set, then reuse it on validation.
    correlation_selector = pipeline_modules.select_with_correlation(
        threshold=0.82,
        method='recursive',
        objective='classification')
    fs_pipeline = Pipeline([('select_with_correlation', correlation_selector)])

    train_selected = fs_pipeline.fit_transform(train_features, train_target)
    val_selected = fs_pipeline.transform(val_features)

    # Baseline for model monitoring: selected training features plus target.
    baseline_df = pd.concat([train_selected, train_target], axis=1)

    # Persist the fitted selection pipeline.
    with open(fs_pipeline_.path + '.joblib', 'wb') as handle:
        joblib.dump(fs_pipeline, handle)

    # Persist the datasets produced by this step.
    write_artifact(train_selected, Xtrain_fs)
    write_artifact(val_selected, Xval_fs)
    write_artifact(baseline_df, baseline_df_)

In [13]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="train_model.yaml"
)
def train_model(
    Xtrain_fs: Input[Dataset],
    Xval_fs: Input[Dataset],
    ytrain_: Input[Dataset],
    yval_: Input[Dataset],
    model_: Output[Model],
    metrics_: Output[Dataset],
    bucket:str='gpa-churn',
    artifact_path:str='artifacts/training_pipeline/xgb/'
    ):
    """Train the XGBoost churn classifier and record validation metrics.

    The booster is trained for 20 rounds on the selected training features.
    ROC AUC, precision, recall and F1 are computed on the validation set
    (probabilities above 0.54 count as positive). The metrics are written as a
    one-row parquet file. The model is saved as ``model.bst`` both to Cloud
    Storage and to the ``model_`` artifact.

    Args:
        Xtrain_fs: Input artifact with the selected training features.
        Xval_fs: Input artifact with the selected validation features.
        ytrain_: Input artifact with the training target (single column).
        yval_: Input artifact with the validation target (single column).
        model_: Output artifact; the booster goes to its path + '.bst'.
        metrics_: Output artifact; the metrics frame goes to its path + '.parquet'.
        bucket: Name of the Cloud Storage bucket that receives ``model.bst``.
        artifact_path: Object prefix inside ``bucket`` for ``model.bst``.

    Raises:
        ValueError: If the training target contains no positive (1) labels.
    """
    import pandas as pd
    import xgboost as xgb
    from google.cloud import storage
    from sklearn.metrics import precision_score, recall_score, f1_score

    Xtrain = pd.read_parquet(Xtrain_fs.path + ".parquet")
    Xval = pd.read_parquet(Xval_fs.path + ".parquet")
    ytrain = pd.read_parquet(ytrain_.path + ".parquet")
    yval = pd.read_parquet(yval_.path + ".parquet")

    dtrain = xgb.DMatrix(Xtrain, ytrain)
    dval = xgb.DMatrix(Xval, yval)

    # Class-imbalance weight = negatives / positives. It is computed from the
    # label column directly rather than by indexing DataFrame.value_counts().
    labels = ytrain.iloc[:, 0]
    negatives = int((labels == 0).sum())
    positives = int((labels == 1).sum())
    if positives == 0:
        raise ValueError(
            'training target has no positive (1) labels; cannot compute scale_pos_weight')
    scale_pos_weight = negatives / positives

    # 'scale_pos_weight' used to appear twice in this dict; only the computed
    # value (the later entry) ever took effect, so the constant 1.0 was removed.
    # NOTE(review): 'num_iterations', 'n_estimators' and 'feature_fraction' do
    # not look like native xgb.train parameters - confirm they have any effect.
    # The number of boosting rounds is the third argument of xgb.train below.
    params = {
        'objective':'binary:logistic',
        'gamma': 1,
        'verbosity': 0,
        'eta': 0.32924394564404313,
        'colsample_bytree': 0.6997715470767337,
        'num_iterations': 259.98061008076706,
        'lambda': 9.840799645070883,
        'n_estimators': 372,
        'max_depth': 5,
        'feature_fraction': 0,
        'eval_metric':'auc',
        'scale_pos_weight': scale_pos_weight
    }

    bst = xgb.train(params, dtrain, 20)

    # evaluate model performance
    #-------------------------------------------------------
    predicted_scores = bst.predict(dval)
    predicted_yval = [1 if score > 0.54 else 0 for score in predicted_scores]
    # eval_set returns a text line; the AUC is the number after the first ':'.
    metrics_dict = {
        'roc':float(bst.eval_set([(dval, '0')]).split(':')[1]),
        'precision':precision_score(yval, predicted_yval),
        'recall':recall_score(yval, predicted_yval),
        'f1':f1_score(yval, predicted_yval)
    }
    metrics = pd.DataFrame(metrics_dict, index=[0])

    # save model performance metrics
    #-------------------------------------------------------
    metrics.to_parquet(metrics_.path + '.parquet', index=False, compression='gzip')

    # save model artifacts locally
    #-------------------------------------------------------
    local_model_file = 'model.bst'
    bst.save_model(local_model_file)

    # upload local model to cloud storage
    #-------------------------------------------------------
    # The bucket object gets its own name so the `bucket` parameter is not shadowed.
    storage_client = storage.Client()
    gcs_bucket = storage_client.get_bucket(bucket)
    blob = gcs_bucket.blob(artifact_path + local_model_file)
    blob.upload_from_filename(local_model_file)

    # save model as pipeline artifact
    #-------------------------------------------------------
    model_.metadata['framework'] = 'xgb'
    bst.save_model(model_.path + '.bst')

In [14]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="evaluate_model.yaml"
)
def evaluate_model(
    metrics_: Input[Dataset],
    roc_threshold:float=0.80
    ) -> NamedTuple('output', [('deploy', str)]):
    """Decide whether the trained model is good enough to deploy.

    Args:
        metrics_: Input artifact; the metrics frame is read from its path +
            '.parquet' and must contain a 'roc' column.
        roc_threshold: Minimum ROC value (inclusive) required for deployment.

    Returns:
        A one-element tuple holding the string "true" when the first 'roc'
        value is at least ``roc_threshold`` and "false" otherwise.
    """
    import pandas as pd

    metrics = pd.read_parquet(metrics_.path + '.parquet')
    observed_roc = float(metrics['roc'].iloc[0])
    meets_threshold = observed_roc >= float(roc_threshold)

    # The flag is a string because the pipeline condition compares it to 'true'.
    return ("true" if meets_threshold else "false",)

In [15]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="deploy_endpoint.yaml"
)
def deploy_endpoint(
    model_: Input[Model],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
    endpoint_information_: Output[Model],
    project_id:str='gpa-poc-001',
    model_label:str='churn',
    region:str="us-central1",
    container_uri:str="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-5:latest",
    artifact_uri:str='gs://gpa-churn/artifacts/training_pipeline/xgb/'
    ):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    The most recently created endpoint named ``<model_label>-endpoint`` is
    reused, or a new one is created. The model found at ``artifact_uri`` is
    uploaded and deployed with 100% of the traffic. The endpoint's project
    number and id are written to a JSON file for downstream steps.

    Args:
        model_: Trained model artifact. It is not read here (the model files
            come from ``artifact_uri``); it ties this step to the training step.
        vertex_endpoint: Output artifact whose URI is set to the endpoint resource name.
        vertex_model: Output artifact whose URI is set to the uploaded model resource name.
        endpoint_information_: Output artifact; a JSON file with the keys
            'project_number' and 'endpoint' goes to its path + '.json'.
        project_id: Project used for the endpoint and the model.
        model_label: Prefix for the endpoint and model display names.
        region: Vertex AI location used for the endpoint and the model.
        container_uri: Serving container image.
        artifact_uri: Cloud Storage folder that holds the model file.
    """
    import json
    from google.cloud import aiplatform

    endpoint_name = f'{model_label}-endpoint'
    display_name = f'{model_label}-xgb'
    model_name = display_name

    # Create endpoint
    #-------------------------------------------------------
    endpoints = aiplatform.Endpoint.list(
        filter='display_name="{}"'.format(endpoint_name),
        order_by='create_time desc',
        project=project_id,
        location=region,
        )

    if len(endpoints) > 0:
        endpoint = endpoints[0]  # most recently created
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=endpoint_name, project=project_id, location=region)

    # Import a model programmatically
    #-------------------------------------------------------
    # project/location are passed explicitly so the model is created in the
    # same project and region as the endpoint it is deployed to.
    model_upload = aiplatform.Model.upload(
        display_name=display_name,
        artifact_uri=artifact_uri,
        serving_container_image_uri=container_uri,
        serving_container_health_route=f"/v1/models/{model_name}",
        serving_container_predict_route=f"/v1/models/{model_name}:predict",
        serving_container_environment_variables={
            "model_name": model_name,
        },
        project=project_id,
        location=region,
        )

    # Model.deploy returns the Endpoint that now serves the model.
    deployed_endpoint = model_upload.deploy(
        machine_type="n1-standard-4",
        endpoint=endpoint,
        traffic_split={"0": 100},
        deployed_model_display_name=display_name,
        )

    # Save data to the output params
    #-------------------------------------------------------
    endpoint_resource_name = deployed_endpoint.resource_name
    vertex_endpoint.uri = endpoint_resource_name
    vertex_model.uri = model_upload.resource_name

    # Save endpoint.resource_name for prediction reference
    #-------------------------------------------------------
    # The second path segment is stored as the project number and the last one
    # as the endpoint id.
    resource_parts = endpoint_resource_name.split('/')
    endpoint_information_dict = {
        'project_number':str(resource_parts[1]),
        'endpoint':str(resource_parts[-1])
    }
    with open(endpoint_information_.path+'.json', 'w') as file:
        json.dump(endpoint_information_dict, file)

In [16]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb-gcloud@sha256:0985f7e13d3d1f234462e7e0b32f8e2563c0ca312006a4361e593e27a43bce5c",
    output_component_file="deploy_model_monitoring_job.yaml"
)
def deploy_model_monitoring_job(
    baseline_df_: Input[Dataset],
    endpoint_information_: Input[Model],
    model_monitor_information_: Output[Model],
    region:str='us-central1',
    baseline_path:str='gs://gpa-churn/artifacts/training_pipeline/baseline/',
    project_id:str='gpa-poc-001'
    ):
    """Create or update the model monitoring job for the deployed endpoint.

    The baseline frame is exported as ``data.csv`` under ``baseline_path``.
    A 0.3 threshold is set for every feature (all columns except 'target').
    If ``gcloud ... list`` prints nothing a job is created, otherwise the
    first listed job is updated. The job id, project and region are written
    to a JSON file for downstream steps.

    Args:
        baseline_df_: Input artifact with the baseline frame (features plus 'target').
        endpoint_information_: Input artifact; a JSON file with an 'endpoint'
            key is read from its path + '.json'.
        model_monitor_information_: Output artifact; a JSON file with the keys
            'project', 'region' and 'model_monitor_id' goes to its path + '.json'.
        region: Location of the monitoring job.
        baseline_path: gs:// folder (with trailing slash) that receives ``data.csv``.
        project_id: Project passed to every gcloud call.

    Raises:
        RuntimeError: If no monitoring job id can be found in the gcloud listing.
    """
    import json
    import subprocess
    import pandas as pd

    gcloud_jobs = ['gcloud', 'beta', 'ai', 'model-monitoring-jobs']
    list_command = gcloud_jobs + ['list', f'--region={region}', f'--project={project_id}']

    def list_monitoring_jobs():
        # Only stdout is captured, matching the previous os.popen based call.
        completed = subprocess.run(
            list_command, stdout=subprocess.PIPE, universal_newlines=True)
        return completed.stdout

    def extract_job_id(listing):
        # NOTE(review): the id is taken from the first 'job-<id>' fragment of
        # the listing, so with several jobs only the first is considered -
        # confirm this matches the endpoint being monitored.
        fragments = listing.split('job-')
        if len(fragments) < 2:
            raise RuntimeError(
                f'no model monitoring job id found in the listing for region={region}')
        return fragments[1].split('/')[0].split('\n')[0]

    def run_gcloud(arguments):
        # Arguments go as a list (no shell), so values need no quoting.
        # A failure is reported but does not abort the step on its own.
        exit_status = subprocess.run(arguments).returncode
        if exit_status != 0:
            print(f'WARNING: gcloud exited with status {exit_status}: ' + ' '.join(arguments))
        return exit_status

    # reading inputs
    #-------------------------------------------------------
    baseline_df = pd.read_parquet(baseline_df_.path+'.parquet')
    with open(endpoint_information_.path+'.json', 'r') as file:
        endpoint_information = json.load(file)
    endpoint = endpoint_information['endpoint']

    # saving baseline for model monitor in cloud storage
    #-------------------------------------------------------
    baseline_uri = baseline_path+'data.csv'
    baseline_df.to_csv(baseline_uri, index=False)

    # creating feature_thresh_description from feature_list
    #-------------------------------------------------------
    feature_list = [name for name in baseline_df.columns if name != 'target']
    feature_thresh_description = ','.join(f'{name}=0.3' for name in feature_list)

    # Flags shared by the create and update commands. --gcs-uris now points at
    # the file written above instead of a hard-coded bucket path.
    monitoring_flags = [
        '--emails=italo.avila@tenbu.com.br',
        f'--endpoint={endpoint}',
        '--prediction-sampling-rate=0.5',
        '--monitoring-frequency=1',
        '--target-field=target',
        '--training-sampling-rate=1.0',
        '--data-format=csv',
        f'--gcs-uris={baseline_uri}',
        f'--feature-thresholds={feature_thresh_description}',
    ]

    # deploy or update model monitoring job
    #-------------------------------------------------------
    existing_jobs = list_monitoring_jobs()
    if existing_jobs != '':
        mmjob_name = extract_job_id(existing_jobs)
        # The previous command contained literal parentheses and '-- project',
        # which made it invalid; the job id is now a plain positional argument.
        # NOTE(review): the flag set is unchanged - confirm that `update`
        # accepts every flag that `create` does (e.g. --endpoint, --target-field).
        run_gcloud(
            gcloud_jobs
            + ['update', mmjob_name, f'--region={region}', f'--project={project_id}']
            + monitoring_flags)
    else:
        print(f'No model monitoring jobs deployed in region={region}')
        run_gcloud(
            gcloud_jobs
            + ['create', '--display-name=churn-model-monitor',
               f'--project={project_id}', f'--region={region}']
            + monitoring_flags)

    # save model monitor information
    #-------------------------------------------------------
    # List again so a freshly created job is picked up; this raises a clear
    # error when the create call above did not produce a job.
    mmjob_name = extract_job_id(list_monitoring_jobs())
    model_monitor_information = {
        'project':project_id,
        'region':region,
        'model_monitor_id':mmjob_name
    }
    with open(model_monitor_information_.path+'.json', 'w') as file:
        json.dump(model_monitor_information, file)

In [17]:
@component(
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="save_consolidated_artifacts.yaml"
)
def save_consolidated_artifacts(
    model_: Input[Model],
    fe_pipeline_: Input[Model],
    fs_pipeline_: Input[Model],
    endpoint_information_: Input[Model],
    model_monitor_information_: Input[Model],
    metrics_: Input[Dataset],
    bucket:str='gpa-churn',
    consolidated_artifacts_path:str='artifacts/training_pipeline/production/'
    ):
    """Copy every artifact of the run into one Cloud Storage folder.

    The booster, both fitted pipelines, the endpoint and monitoring JSON
    documents and the metrics frame are loaded from their pipeline artifacts.
    Each is then written under ``consolidated_artifacts_path`` in ``bucket``
    with a fixed file name.

    Args:
        model_: Input artifact; the booster is loaded from its path + '.bst'.
        fe_pipeline_: Input artifact; loaded from its path + '.joblib'.
        fs_pipeline_: Input artifact; loaded from its path + '.joblib'.
        endpoint_information_: Input artifact; JSON read from its path + '.json'.
        model_monitor_information_: Input artifact; JSON read from its path + '.json'.
        metrics_: Input artifact; metrics read from its path + '.parquet'.
        bucket: Name of the destination bucket.
        consolidated_artifacts_path: Object prefix (with trailing slash) inside ``bucket``.
    """
    import os
    import gc
    import sys
    import json
    import joblib
    import numpy as np
    import pandas as pd
    import xgboost as xgb
    from datetime import datetime
    from google.cloud import storage

    # The project package lives under /usr/app inside the base image.
    for extra_path in ('/usr/app/', '/usr/app/src'):
        sys.path.append(extra_path)
    import src.utils as utils
    import src.pipeline_modules as pipeline_modules

    def read_json(artifact):
        with open(artifact.path+'.json', 'r') as handle:
            return json.load(handle)

    # loading artifacts
    #-------------------------------------------------------
    bst = xgb.Booster()
    bst.load_model(model_.path+'.bst')
    fe_pipeline = joblib.load(fe_pipeline_.path+'.joblib')
    fs_pipeline = joblib.load(fs_pipeline_.path+'.joblib')
    endpoint_information = read_json(endpoint_information_)
    model_monitor_information = read_json(model_monitor_information_)
    metrics = pd.read_parquet(metrics_.path+'.parquet')

    # getting bucket
    #-------------------------------------------------------
    storage_client = storage.Client()
    bucket_ = storage_client.get_bucket(bucket)

    def upload_local_file(local_name):
        # The object name is the destination prefix followed by the file name.
        destination = f'{consolidated_artifacts_path}{local_name}'
        bucket_.blob(destination).upload_from_filename(local_name)

    # saving pipelines in artifact path
    #-------------------------------------------------------
    fitted_pipelines = (
        ('fe_pipeline.joblib', fe_pipeline),
        ('fs_pipeline.joblib', fs_pipeline),
    )
    for local_name, fitted_pipeline in fitted_pipelines:
        joblib.dump(fitted_pipeline, local_name)
        upload_local_file(local_name)

    # saving endpoint and model monitoring information in artifact path
    #-------------------------------------------------------
    json_documents = (
        ('endpoint_information.json', endpoint_information),
        ('model_monitor_information.json', model_monitor_information),
    )
    for local_name, document in json_documents:
        with open(local_name, 'w') as handle:
            json.dump(document, handle)
        upload_local_file(local_name)

    # upload local model to cloud storage
    #-------------------------------------------------------
    model_file_name = 'model.bst'
    bst.save_model(model_file_name)
    upload_local_file(model_file_name)

    # saving metrics
    #-------------------------------------------------------
    # The metrics frame is written straight to its gs:// URI by pandas.
    metrics_uri = f'gs://{bucket}/{consolidated_artifacts_path}' + 'metrics.parquet'
    metrics.to_parquet(metrics_uri, index=False, compression='gzip')

---

In [18]:
# Naming used by the pipeline definition and the job submission below.
from datetime import datetime

# Timestamp of this notebook run, formatted as YYYYmmddHHMMSS.
# NOTE(review): `timestamp` is not referenced by any other visible cell.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Base name given to @dsl.pipeline and used as the PipelineJob display name.
pipeline_label = 'pipeline-churn-'

In [19]:
# End-to-end churn training pipeline:
# data load -> feature engineering -> feature selection -> training -> evaluation,
# followed (only when the evaluation step returns 'true') by endpoint deployment,
# model monitoring setup and consolidation of all artifacts.
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name=pipeline_label)

def pipeline(
    bucket:str='gpa-churn',
    artifact_path:str='artifacts/training_pipeline/xgb/',
    project_id:str='gpa-poc-001',
    region:str="us-central1", 
    model_label:str='churn',
    roc_threshold: float=0.8,
    model_serving_image:str="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-5:latest",
    consolidated_artifacts_path:str='artifacts/training_pipeline/production/'
    ):
    
    # Artifact inputs are passed positionally, in the order the component declares them.

    # Read the preprocessed parquet files and split them into train/validation sets.
    data_op = get_preprocessed_data(
        prefix=f'gs://{bucket}/data/processed/input/'
        )
    
    # Fit the feature-engineering pipeline on the training features, apply it to validation.
    fe_pipe_op = feature_engineering_sequence(
        data_op.outputs['Xtrain_'],
        data_op.outputs['Xval_']
        )
    
    # Fit the feature-selection pipeline; this step also emits the monitoring baseline.
    fs_pipe_op = feature_selection_sequence(
        fe_pipe_op.outputs['Xtrain_fe'],
        fe_pipe_op.outputs['Xval_fe'],
        data_op.outputs['ytrain_'],
        data_op.outputs['yval_'],
        )
    
    # Train the booster and upload model.bst to gs://<bucket>/<artifact_path>.
    train_model_op = train_model(
        fs_pipe_op.outputs['Xtrain_fs'],
        fs_pipe_op.outputs['Xval_fs'],
        data_op.outputs['ytrain_'],
        data_op.outputs['yval_'],
        bucket=bucket,
        artifact_path=artifact_path
        )
    
    # Returns the string 'true' or 'false' depending on the ROC threshold.
    model_evaluation_op = evaluate_model(
        train_model_op.outputs['metrics_'],
        roc_threshold=roc_threshold
        )
    
    # Everything below runs only when the evaluation step approved the model.
    with dsl.Condition(
        model_evaluation_op.outputs['deploy']=='true',
        name="deploy-endpoint",
        ):
        
        # artifact_uri must match the folder train_model uploaded model.bst to.
        deploy_model_op = deploy_endpoint(
            train_model_op.outputs['model_'],
            project_id=project_id,
            model_label=model_label,
            region=region,
            container_uri=model_serving_image,
            artifact_uri=f'gs://{bucket}/{artifact_path}'
            )
        
        # Uses the endpoint id written by deploy_endpoint and the baseline from feature selection.
        deploy_monitor_op = deploy_model_monitoring_job(
            fs_pipe_op.outputs['baseline_df_'],
            deploy_model_op.outputs['endpoint_information_'],
            region=region,
            baseline_path=f'gs://{bucket}/artifacts/training_pipeline/baseline/'
            )
        
        # Copy the model, both fitted pipelines, endpoint/monitor info and metrics to one folder.
        save_consolidated_artifacts(
            train_model_op.outputs['model_'],
            fe_pipe_op.outputs['fe_pipeline_'],
            fs_pipe_op.outputs['fs_pipeline_'],
            deploy_model_op.outputs['endpoint_information_'],
            deploy_monitor_op.outputs['model_monitor_information_'],
            train_model_op.outputs['metrics_'],
            bucket=bucket,
            consolidated_artifacts_path=consolidated_artifacts_path
            )

In [20]:
# Compile the pipeline function into the job spec file that the PipelineJob below loads.
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='ml_pipeline.json')



---

In [21]:
# Build the Vertex AI job from the compiled spec. The job is submitted in REGION; no
# parameter_values are passed, so every pipeline parameter keeps its default.
start_pipeline = pipeline_jobs.PipelineJob(
    display_name=pipeline_label,
    template_path="ml_pipeline.json",
    enable_caching=True,
    location=REGION,
)

In [22]:
# Submit the job; judging by the captured output below, this call keeps printing the job state while it runs.
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-churn-20220609142225
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-churn-20220609142225')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/pipeline-churn-20220609142225?project=437364709834
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-churn-20220609142225 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-churn-20220609142225 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-churn-20220609142225 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/l