In [1]:
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# from jedi import settings
# settings.case_insensitive_completion = True

In [2]:
# # Install ai platform and kfp
# USER_FLAG = "--user"
# !pip3 install {USER_FLAG} google-cloud-aiplatform==1.3.0 --upgrade
# !pip3 install {USER_FLAG} kfp --upgrade
# !pip install google_cloud_pipeline_components

In [3]:
# !pip install kfp --upgrade

In [4]:
# !gcloud services enable compute.googleapis.com         \
#                        containerregistry.googleapis.com  \
#                        aiplatform.googleapis.com  \
#                        cloudbuild.googleapis.com \
#                        cloudfunctions.googleapis.com

In [5]:
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [6]:
# pip flag interpolated by the commented-out install cell near the top of the notebook;
# no active cell visible in this notebook reads it.
USER_FLAG = "--user"
# Run `!gcloud auth login` first if this environment is not already authenticated.

In [7]:
# Get the project name: capture the output of `gcloud config get-value project`
# (stderr discarded) and keep its first line.
shell_output=!gcloud config get-value project 2> /dev/null
PROJECT_ID=shell_output[0]
PROJECT_ID

'gpa-poc-001'

In [8]:
# GCS location for pipeline artifacts. Despite the name, this is a bucket URI plus an
# "artifacts" prefix rather than a bare bucket name.
BUCKET_NAME="gs://gpa-churn/artifacts"

# Pipeline root passed to @dsl.pipeline below. This cell only builds the path string;
# it does not create the bucket.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline-vertexai/"
PIPELINE_ROOT

'gs://gpa-churn/artifacts/pipeline-vertexai/'

In [9]:
# Region passed as `location` when the PipelineJob is created further down.
REGION="southamerica-east1"
REGION

'southamerica-east1'

In [10]:
# pip requirements intended for `packages_to_install` on the components below.
# Currently unused: each @component comments that argument out and runs from a
# prebuilt `base_image` instead.
requirement_list = [
    "pandas==1.3.5",
    "xgboost==1.5",
    "scikit-learn",
    "pickle-mixin",
    "numpy",
    "jupyterlab==3.1.12",
    "ipywidgets>=7.6",
    "matplotlib==3.3.4",
    "jupyter-dash",
    "plotly==5.3.1",
    "pytest==6.2.2",
    "seaborn==0.11.1",
    "glob2==0.7",
    "SQLAlchemy==1.3.24",
    "lightgbm==3.2.0",
    "tabulate==0.8.9",
    "shap==0.39.0",
    "optuna==2.6.0",
    "dython==0.6.4",
    "minepy==1.2.5",
    "pyarrow==3.0.0",
    "kmodes==0.11.0",
    "dash==1.19.0",
    "dash-daq==0.5.0",
    "nltk",
    "unidecode",
    "fsspec",
    "gcsfs",
    "joblib",
    "great-expectations==0.13.17",
    "google-cloud-storage",
]

---

## Creating pipeline components

In [11]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="get_preprocessed_data.yaml"
)
def get_preprocessed_data(
    Xtrain_: Output[Dataset],
    Xval_: Output[Dataset],
    ytrain_: Output[Dataset],
    yval_: Output[Dataset],
    prefix:str='gs://gpa-churn/data/processed/input/'
    ):
    """Load the processed train/validation sets, balance the training classes and write the splits.

    The training frame is undersampled so that class 0 has at most as many rows
    as class 1, then shuffled. The validation frame is left untouched.

    Args:
        Xtrain_: Output artifact; training features are written to ``Xtrain_.path + '.parquet'``.
        Xval_: Output artifact; validation features are written to ``Xval_.path + '.parquet'``.
        ytrain_: Output artifact; training target column is written to ``ytrain_.path + '.parquet'``.
        yval_: Output artifact; validation target column is written to ``yval_.path + '.parquet'``.
        prefix: Currently unused (the input locations are hard-coded below); kept so
            existing callers that pass it keep working.
    """
    import pandas as pd

    target = 'target'
    # Fixed seed so the undersampling and shuffle are reproducible between runs
    # (the pipeline job is submitted with caching enabled).
    seed = 42

    df_train = pd.read_parquet('gs://gpa-churn/data/processed/train/dataset.parquet')
    df_val = pd.read_parquet('gs://gpa-churn/data/processed/validation/dataset.parquet')

    positives = df_train[df_train[target] == 1]
    negatives = df_train[df_train[target] == 0]

    # Cap the sample size: DataFrame.sample raises ValueError when asked for more
    # rows than exist (without replacement), which would happen if class 0 were
    # the minority class.
    n_samples = min(len(positives), len(negatives))
    negatives_sampled = negatives.sample(n_samples, random_state=seed)

    df_train = (
        pd.concat([negatives_sampled, positives], axis=0)
        .sample(frac=1, random_state=seed)  # shuffle rows
        .reset_index(drop=True)
    )

    features = [col for col in df_train.columns if col != target]

    Xtrain = df_train[features]
    Xval = df_val[features]
    ytrain = df_train[[target]]
    yval = df_val[[target]]

    # The '.parquet' suffix must match what train_model appends when reading.
    Xtrain.to_parquet(Xtrain_.path + '.parquet', index=False, compression='gzip')
    Xval.to_parquet(Xval_.path + '.parquet', index=False, compression='gzip')
    ytrain.to_parquet(ytrain_.path + '.parquet', index=False, compression='gzip')
    yval.to_parquet(yval_.path + '.parquet', index=False, compression='gzip')

In [12]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="train_model.yaml"
)
def train_model(
    Xtrain_: Input[Dataset],
    Xval_: Input[Dataset],
    ytrain_: Input[Dataset],
    yval_: Input[Dataset],
    model_: Output[Model]
    ):
    """Train an XGBoost booster on the prepared splits and publish the model file.

    The booster is saved to ``model_.path + '.bst'`` and the same file is uploaded to
    ``gs://gpa-churn/artifacts/training_pipeline/xgb/model.bst``.

    Args:
        Xtrain_: Training features; read from ``Xtrain_.path + '.parquet'``.
        Xval_: Validation features; read from ``Xval_.path + '.parquet'``.
        ytrain_: Training target (single column); read from ``ytrain_.path + '.parquet'``.
        yval_: Validation target (single column); read from ``yval_.path + '.parquet'``.
        model_: Output model artifact; ``metadata['framework']`` is set to ``'xgb'``.
    """
    import sys
    import pandas as pd
    import xgboost as xgb
    from google.cloud import storage

    # NOTE(review): these project imports are not referenced anywhere in this
    # function; kept in case importing them has side effects -- confirm and drop.
    sys.path.append('/usr/app/')
    sys.path.append('/usr/app/src')
    import src.utils as utils
    import src.pipeline_modules as pipeline_modules
    from src.guara.modeling.supervised_modelz import SupervisedModelz

    Xtrain = pd.read_parquet(Xtrain_.path + ".parquet")
    Xval = pd.read_parquet(Xval_.path + ".parquet")
    ytrain = pd.read_parquet(ytrain_.path + ".parquet")
    yval = pd.read_parquet(yval_.path + ".parquet")

    dtrain = xgb.DMatrix(Xtrain, ytrain)
    dval = xgb.DMatrix(Xval, yval)

    # Negative/positive ratio taken from the label column directly, falling back
    # to 1.0 when there are no positive rows so the division cannot fail.
    labels = ytrain.iloc[:, 0]
    n_positive = int((labels == 1).sum())
    n_negative = int((labels == 0).sum())
    scale_pos_weight = n_negative / n_positive if n_positive else 1.0

    # 'scale_pos_weight' previously appeared twice in this dict (the literal 1.0
    # was silently overridden); only the computed value is kept.
    # NOTE(review): 'num_iterations', 'n_estimators' and 'feature_fraction' do not
    # look like native xgb.train booster parameters (the round count actually used
    # is the positional 20 below) -- confirm and remove.
    params = {
        'gamma': 1,
        'verbosity': 0,
        'eta': 0.32924394564404313,
        'colsample_bytree': 0.6997715470767337,
        'num_iterations': 259.98061008076706,
        'lambda': 9.840799645070883,
        'n_estimators': 372,
        'max_depth': 5,
        'feature_fraction': 0,
        'scale_pos_weight': scale_pos_weight,
    }

    # The validation set is supplied through `evals`, not as an 'eval_set' entry
    # in `params` (which is not a booster parameter).
    bst = xgb.train(params, dtrain, 20, evals=[(dval, 'validation')])

    model_.metadata['framework'] = 'xgb'
    model_path = model_.path + '.bst'
    bst.save_model(model_path)

    # Publish the saved file to the fixed location deploy_endpoint uses as its artifact_uri.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket('gpa-churn')
    blob = bucket.blob('artifacts/training_pipeline/xgb/model.bst')
    blob.upload_from_filename(model_path)

In [13]:
@component(
    #packages_to_install=['pandas', 'numpy'],
    base_image="gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727",
    output_component_file="sim_deploy.yaml"
)
def deploy_endpoint(
    model_: Input[Model],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
    ):
    """Upload the trained model to Vertex AI and deploy it to an endpoint.

    Reuses the most recently created endpoint named ``churn-endpoint-simplepipe``
    if one exists, otherwise creates it.

    Args:
        model_: Trained model artifact. It is not read in the body; the model files are
            taken from the fixed ``ARTIFACT_URI`` below. Declaring it as an input makes
            this step run after the training step.
        vertex_endpoint: Output artifact; its ``uri`` is set to the endpoint resource name.
        vertex_model: Output artifact; its ``uri`` is set to the uploaded model resource name.
    """
    from google.cloud import aiplatform

    PROJECT_ID='gpa-poc-001'
    ENDPOINT_NAME = 'churn-endpoint-simplepipe'
    # NOTE(review): the pipeline job itself is submitted in "southamerica-east1";
    # confirm that deploying in a different region is intended.
    REGION="us-central1"
    DISPLAY_NAME = 'churn-model-simplepipe'
    MODEL_NAME = 'churn-xgb-simplepipe'
    CONTAINER_URI = "us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-5:latest"
    ARTIFACT_URI = 'gs://gpa-churn/artifacts/training_pipeline/xgb/'

    endpoints = aiplatform.Endpoint.list(
        filter='display_name="{}"'.format(ENDPOINT_NAME),
        order_by='create_time desc',
        project=PROJECT_ID,
        location=REGION,
    )

    if endpoints:
        endpoint = endpoints[0]  # most recently created
    else:
        endpoint = aiplatform.Endpoint.create(
            display_name=ENDPOINT_NAME, project=PROJECT_ID, location=REGION
        )

    # Import the model programmatically. Project and location are pinned so the
    # model is registered alongside the endpoint instead of relying on SDK defaults.
    model_upload = aiplatform.Model.upload(
        display_name=DISPLAY_NAME,
        artifact_uri=ARTIFACT_URI,
        serving_container_image_uri=CONTAINER_URI,
        serving_container_health_route=f"/v1/models/{MODEL_NAME}",
        serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
        serving_container_environment_variables={
            "MODEL_NAME": MODEL_NAME,
        },
        project=PROJECT_ID,
        location=REGION,
    )

    # Model.deploy returns the Endpoint, not the Model; `endpoint` is already
    # bound above, so the return value is not needed.
    model_upload.deploy(
        machine_type="n1-standard-4",
        endpoint=endpoint,
        traffic_split={"0": 100},
        deployed_model_display_name=DISPLAY_NAME,
    )

    # Save data to the output params: each artifact gets its own resource name.
    vertex_endpoint.uri = endpoint.resource_name
    vertex_model.uri = model_upload.resource_name

---

In [14]:
# Build a timestamped display name (YYYYMMDDHHMMSS, local time).
from datetime import datetime

TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
DISPLAY_NAME = "pipeline-test-" + TIMESTAMP

In [15]:
@dsl.pipeline(
    name="pipeline-train-churn",
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    project: str = 'gpa-poc-001',
    region: str = "southamerica-east1",
    serving_container_image_uri: str = "gcr.io/gpa-poc-001/churn-base-image-src-xgb@sha256:61db16ec13bba7d8023fff61329c6c28a7eb119f8f837fce4c09258776c16727"
    ):
    # Three chained steps: prepare the splits -> train the model -> deploy it.
    # The pipeline parameters (project, region, serving_container_image_uri) are
    # declared but not consumed by any step below.

    # Step 1: load and split the data.
    preprocess_task = get_preprocessed_data(
        prefix='gs://gpa-churn/data/processed/input/'
    )
    splits = preprocess_task.outputs

    # Step 2: train on the four split artifacts (passed positionally, in the
    # order train_model declares them).
    training_task = train_model(
        splits['Xtrain_'],
        splits['Xval_'],
        splits['ytrain_'],
        splits['yval_'],
    )

    # Step 3: deploy; consuming the model output orders this step after training.
    deploy_endpoint(training_task.outputs['model_'])

In [16]:
# Compile the pipeline function into the JSON spec file that the PipelineJob loads.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='ml_pipeline_test.json',
)



---

In [17]:
# Create the Vertex AI pipeline job from the compiled spec (not submitted yet).
start_pipeline = pipeline_jobs.PipelineJob(
    template_path="ml_pipeline_test.json",
    display_name="churn-test-pipeline",
    location=REGION,
    enable_caching=True,
)

In [18]:
# Submit the job; the output below shows its state being reported repeatedly while it runs.
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220603015119
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220603015119')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/pipeline-train-churn-20220603015119?project=437364709834
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220603015119 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220603015119 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220603015119 current state:
PipelineState.PIPELINE_STATE_RUNNING