In [1]:
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install - user
# from jedi import settings
# settings.case_insensitive_completion = True

In [2]:
# # Install ai platform and kfp
# USER_FLAG = "--user"
# !pip3 install {USER_FLAG} google-cloud-aiplatform==1.3.0 --upgrade
# !pip3 install {USER_FLAG} kfp --upgrade
# !pip install google_cloud_pipeline_components

In [3]:
# !pip install kfp --upgrade

In [4]:
# !gcloud services enable compute.googleapis.com         \
#                        containerregistry.googleapis.com  \
#                        aiplatform.googleapis.com  \
#                        cloudbuild.googleapis.com \
#                        cloudfunctions.googleapis.com

In [5]:
from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

from kfp.v2 import compiler
from google.cloud import bigquery
from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [6]:
USER_FLAG = "--user"
#!gcloud auth login if needed

In [7]:
# Get projet name
shell_output=!gcloud config get-value project 2> /dev/null
PROJECT_ID=shell_output[0]
PROJECT_ID

'gpa-poc-001'

In [8]:
# Set bucket name
BUCKET_NAME="gs://gpa-churn/artifacts"

# Create bucket
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline-vertexai/"
PIPELINE_ROOT

'gs://gpa-churn/artifacts/pipeline-vertexai/'

In [9]:
REGION="southamerica-east1"
REGION

'southamerica-east1'

In [10]:
requirement_list = [
    "pandas==1.3.5",
    "scikit-learn",
    "pickle-mixin",
    "numpy",
    "jupyterlab==3.1.12",
    "ipywidgets>=7.6",
    "matplotlib==3.3.4",
    "jupyter-dash",
    "plotly==5.3.1",
    "pytest==6.2.2",
    "seaborn==0.11.1",
    "glob2==0.7",
    "SQLAlchemy==1.3.24",
    "lightgbm==3.2.0",
    "tabulate==0.8.9",
    "shap==0.39.0",
    "optuna==2.6.0",
    "dython==0.6.4",
    "minepy==1.2.5",
    "pyarrow==3.0.0",
    "kmodes==0.11.0",
    "dash==1.19.0",
    "dash-daq==0.5.0",
    "nltk",
    "unidecode",
    "fsspec",
    "gcsfs",
    "joblib",
    "great-expectations==0.13.17",
    "google-cloud-storage",
]

---

## Creating pipeline components

In [11]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="get_preprocessed_data.yaml"
)

def get_preprocessed_data(
    Xtrain_: Output[Dataset],
    Xval_: Output[Dataset],
    ytrain_: Output[Dataset],
    yval_: Output[Dataset],
    prefix:str='gs://gpa-churn/data/processed/input/'
    ):
    
    import os
    import gc
    import sys
    import numpy as np
    import pandas as pd
    from google.cloud import storage
    from sklearn.model_selection import train_test_split
    
    bucket = prefix.split('/')[2]
    storage_client = storage.Client()
    obj_list = storage_client.list_blobs(bucket)
    obj_list = [i.name for i in obj_list if 'data/processed/input/' in i.name]
    obj_list = obj_list[1:]
    df_list = []
    for obj in obj_list:
        local_df = pd.read_parquet('gs://gpa-churn/'+obj)
        df_list.append(local_df)
        print(f'added {prefix}{obj}')

    df = pd.concat(df_list, axis=0)
    df.drop(columns=['cod_cliente'], inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    target = 'target'
    features = list(df.columns)
    features = [i for i in features if i != target]

    Xtrain, Xval, ytrain, yval = train_test_split(
        df[features], 
        df[[target]],
        test_size=0.15, 
        random_state=501
        )
    
    del df
    gc.collect()

    Xtrain.reset_index(drop=True, inplace=True)
    Xval.reset_index(drop=True, inplace=True)
    ytrain.reset_index(drop=True, inplace=True)
    yval.reset_index(drop=True, inplace=True)
    print('Successfully read training data.')
    print('shapes:')
    print(f'xtrain:{Xtrain.shape}, ytrain:{ytrain.shape}')
    print(f'xval:{Xval.shape}, yval:{yval.shape}')
    
    Xtrain.to_parquet(Xtrain_.path + '.parquet', index=False, compression='gzip')
    Xval.to_parquet(Xval_.path + '.parquet', index=False, compression='gzip')
    ytrain.to_parquet(ytrain_.path + '.parquet', index=False, compression='gzip')
    yval.to_parquet(yval_.path + '.parquet', index=False, compression='gzip')

In [12]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="feature_engineering_sequence.yaml"
)

def feature_engineering_sequence(
    Xtrain_: Input[Dataset],
    Xval_: Input[Dataset],
    Xtrain_fe: Output[Dataset],
    Xval_fe: Output[Dataset]
    ):
    
    import os
    import sys
    import pytz
    import joblib
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from sklearn.pipeline import Pipeline

    sys.path.append('/usr/app/')
    sys.path.append('/usr/app/src')
    import src.pipeline_modules as pipeline_modules
    
    Xtrain = pd.read_parquet(Xtrain_.path + ".parquet")
    Xval = pd.read_parquet(Xval_.path + ".parquet")
    
    numerical_columns = [
        'val_venda_bruta_cupom',
        'qtd_item_venda',
        'flg_vend_meu_desct',
        'valor_desconto',
        'flag_dev',
        'tipo_promo_0',
        'tipo_promo_1',
        'tipo_promo_2',
        'tipo_promo_3',
        'tipo_promo_4',
        'tipo_promo_5',
        'categoria_0',
        'categoria_1',
        'categoria_2',
        'categoria_3',
        'categoria_4',
        'categoria_5',
        'categoria_6',
        'categoria_7',
        'departamento_0',
        'compras_mes',
        'agg_l3m_val_venda_bruta_cupom',
        'agg_l3m_qtd_item_venda',
        'agg_l3m_flg_vend_meu_desct',
        'agg_l3m_valor_desconto',
        'agg_l3m_flag_dev',
        'agg_l3m_tipo_promo_0',
        'agg_l3m_tipo_promo_1',
        'agg_l3m_tipo_promo_2',
        'agg_l3m_tipo_promo_3',
        'agg_l3m_tipo_promo_4',
        'agg_l3m_tipo_promo_5',
        'agg_l3m_categoria_0',
        'agg_l3m_categoria_1',
        'agg_l3m_categoria_2',
        'agg_l3m_categoria_3',
        'agg_l3m_categoria_4',
        'agg_l3m_categoria_5',
        'agg_l3m_categoria_6',
        'agg_l3m_categoria_7',
        'agg_l3m_departamento_0',
        'agg_l3m_compras_mes',
    ]

    outlier_columns_mean = [
        'pib_percapita',
        'idade',
        'delta_de_cadastro',
        'delta_de_stix'
    ]

    yeojohnson_columns = [
        'val_venda_bruta_cupom',
        'qtd_item_venda',
        'flg_vend_meu_desct',
        'valor_desconto',
        'compras_mes',
        'agg_l3m_val_venda_bruta_cupom',
        'agg_l3m_qtd_item_venda',
        'agg_l3m_flg_vend_meu_desct',
        'agg_l3m_valor_desconto',
        'agg_l3m_compras_mes',
        'pib_percapita',
        'idade',
        'delta_de_cadastro'
    ]
    
    # training set
    #-------------------------------------------------------
    fe_pipeline = Pipeline([
        ('drop_temporary_columns', pipeline_modules.drop_temporary_columns()),
        ('drop_with_low_variance', pipeline_modules.drop_numerical_with_variance(columns=numerical_columns)),
        ('encode_sex_column', pipeline_modules.encode_sex_column()),
        ('group_rare_regions', pipeline_modules.group_rare_categorical(columns=['region'], threshold=0.002)),
        ('encode_regions', pipeline_modules.encode_categorical(columns=['region'])),
        ('handle_outliers_max', pipeline_modules.outlier_handling(
            columns=numerical_columns, 
            method='gauss', 
            band=2.8, 
            action='max')),
        ('handle_outliers_mean', pipeline_modules.outlier_handling(
            columns=outlier_columns_mean, 
            method='gauss', 
            band=2.5, 
            action='mean')),
        ('handle_negative_values', pipeline_modules.handle_negative_values(columns=numerical_columns)),
        ('fill_missing_numerical_zero', pipeline_modules.fill_na_values_with_zero(
            columns=['ind_email','cadastro_stix','delta_de_cadastro','delta_de_stix'])),
        ('fill_missing_numerical_mean', pipeline_modules.fill_na_values_with_zero(
            columns=['pib_percapita','idade'])),
        ('transform_yeojohnson', pipeline_modules.data_transformation(
            columns=yeojohnson_columns, 
            method='yeojohnson'))
    ])

    Xtrain = fe_pipeline.fit_transform(Xtrain)

    # validation set
    #-------------------------------------------------------
    Xval = fe_pipeline.transform(Xval)
    
    # save feature engineering artifacts
    #-------------------------------------------------------
    storage_client = storage.Client()
    bucket_name='gpa-churn'
    model_file='artifacts/training_pipeline/fe_pipeline/fe_pipeline.joblib'
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(model_file)
    joblib.dump(fe_pipeline, 'fe_pipeline.joblib')
    blob.upload_from_filename('fe_pipeline.joblib')
    
    Xtrain.to_parquet(Xtrain_fe.path + '.parquet', index=False, compression='gzip')
    Xval.to_parquet(Xval_fe.path + '.parquet', index=False, compression='gzip')

In [13]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="feature_selection_sequence.yaml"
)

def feature_selection_sequence(
    Xtrain_fe: Input[Dataset],
    Xval_fe: Input[Dataset],
    ytrain_: Input[Dataset],
    yval_: Input[Dataset],
    Xtrain_fs: Output[Dataset],
    Xval_fs: Output[Dataset],
    ):
    
    import os
    import sys
    import pytz
    import joblib
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from sklearn.pipeline import Pipeline

    sys.path.append('/usr/app/')
    sys.path.append('/usr/app/src')
    import src.utils as utils
    import src.pipeline_modules as pipeline_modules
    
    Xtrain = pd.read_parquet(Xtrain_fe.path + ".parquet")
    Xval = pd.read_parquet(Xval_fe.path + ".parquet")
    ytrain = pd.read_parquet(ytrain_.path + ".parquet")
    yval = pd.read_parquet(yval_.path + ".parquet")
    
    # training set
    #-------------------------------------------------------
    fs_pipeline = Pipeline([
            ('select_with_correlation', pipeline_modules.select_with_correlation(
                threshold=0.82, 
                method='recursive',
                objective='classification'))
        ])
        
    Xtrain = fs_pipeline.fit_transform(Xtrain, ytrain)
    
    # validation set
    #-------------------------------------------------------
    Xval = fs_pipeline.transform(Xval)
    
    # save feature selection artifacts
    #-------------------------------------------------------
    storage_client = storage.Client()
    bucket_name='gpa-churn'
    model_file=f'artifacts/training_pipeline/fs_pipeline/fs_pipeline.joblib'
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(model_file)
    joblib.dump(fs_pipeline, 'fs_pipeline.joblib')
    blob.upload_from_filename('fs_pipeline.joblib')
    
    Xtrain.to_parquet(Xtrain_fs.path + '.parquet', index=False, compression='gzip')
    Xval.to_parquet(Xval_fs.path + '.parquet', index=False, compression='gzip')

In [14]:
@component(
    #packages_to_install=requirement_list,
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="train_model.yaml"
)

def train_model(
    Xtrain_fs: Input[Dataset],
    Xval_fs: Input[Dataset],
    ytrain_: Input[Dataset],
    yval_: Input[Dataset],
    model_: Output[Model],
    metrics_: Output[Dataset]
    ):
    
    import os
    import sys
    import pytz
    import pickle
    import joblib
    import pandas as pd
    from datetime import datetime
    from google.cloud import storage
    from sklearn.pipeline import Pipeline

    sys.path.append('/usr/app/')
    sys.path.append('/usr/app/src')
    import src.utils as utils
    import src.pipeline_modules as pipeline_modules
    from src.guara.modeling.supervised_modelz import SupervisedModelz
    
    Xtrain = pd.read_parquet(Xtrain_fs.path + ".parquet")
    Xval = pd.read_parquet(Xval_fs.path + ".parquet")
    ytrain = pd.read_parquet(ytrain_.path + ".parquet")
    yval = pd.read_parquet(yval_.path + ".parquet")
    
    scale_pos_weight=ytrain.value_counts(normalize=True)[0]/ytrain.value_counts(normalize=True)[1]  
    params = {
        'random_state':501, 
        'boosting_type':'gbdt', 
        'device_type':'cpu',
        'scale_pos_weight':scale_pos_weight,
        'sub_sample':0.8,
        'min_child_samples':24,
        'learning_rate':0.38832846505493473,
        'colsample_bytree':0.31177546084715557,
        'n_estimators':499,
        'max_depth':4,
        'num_leaves':10
    }

    md = SupervisedModelz('lgbm', 'binary')
    model = md.fit(Xtrain, Xval, ytrain, yval, params)
    model_.metadata['framework'] = 'lgbm'
    file_name = model_.path + '.pkl'
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    
    # evaluate model performance
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC']
    ytrain_pred = md.predict(Xtrain)
    yval_pred = md.predict(Xval)

    md.eval_binary(ytrain['target'], ytrain_pred, yval['target'], yval_pred)

    if type(ytrain) == pd.Series:
        ytrain = ytrain.values
    if type(yval) == pd.Series:
        yval = yval.values

    print('\n ============== Resumo metricas ============== \n')
    print('TREINO:\n')
    metrics_train = md.metrics_print(
        ytrain.values.clip(0, None), 
        ytrain_pred.clip(0, None), 
        metrics
    )

    print('\nVALIDACAO:\n')
    metrics_val = md.metrics_print(
        yval.values.clip(0, None), 
        yval_pred.clip(0, None), 
        metrics
    )
    
    # save model artifacts
    #-------------------------------------------------------
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    storage_client = storage.Client()
    bucket_name='gpa-churn'
    model_file='artifacts/training_pipeline/model/model.joblib'
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(model_file)
    joblib.dump(model, 'model.joblib')
    blob.upload_from_filename('model.joblib')
    
    metrics_val.to_parquet(metrics_.path + '.parquet', index=False, compression='gzip')

In [15]:
@component(
    #packages_to_install=['pandas', 'numpy'],
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="evaluate_model.yaml"
)

def evaluate_model(
    metrics_: Input[Dataset],
    f1_threshold:int=0.6
    ) -> NamedTuple('output', [('deploy', str)]):
    
    import pandas as pd
    
    def threshold_check(val1, val2):
        cond = "false"
        if val1 >= val2 :
            cond = "true"
        return cond
    
    metrics = pd.read_parquet(metrics_.path + '.parquet')
    deploy = threshold_check(float(metrics['F1']), float(f1_threshold))
    return (deploy,)

  ' parameter "{}". {}'.format(parameter.name, ex))


In [16]:
@component(
    #packages_to_install=['pandas', 'numpy'],
    base_image="gcr.io/gpa-poc-001/churn-base-image-src@sha256:fdc6b2fed2deac0ea267878cfde028f3b30079b104a6f4047d120c820def1b83",
    output_component_file="sim_deploy.yaml"
)

def sim_deploy(model: Input[Model]):
    
    import joblib
    import pandas as pd
    from io import BytesIO
    from google.cloud import storage
    
    # reading files
    #-------------------------------------------------------
    storage_client = storage.Client()
    bucket_name='gpa-churn'
    artifact_list = ['model', 'fe_pipeline', 'fs_pipeline']
    
    for artifact_label in artifact_list:
        
        art_file=f'artifacts/training_pipeline/{artifact_label}/{artifact_label}.joblib'
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(art_file)
        art_file = BytesIO()
        blob.download_to_file(art_file)
        artifact=joblib.load(art_file)
        
        art_file=f'artifacts/training_pipeline/production/{artifact_label}.joblib'
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(art_file)
        joblib.dump(artifact, 'artifact-local.joblib')
        blob.upload_from_filename('artifact-local.joblib')
    
    return

---

In [17]:
# creating the pipeline
from datetime import datetime
TIMESTAMP =datetime.now().strftime("%Y%m%d%H%M%S")
DISPLAY_NAME = 'pipeline-test-{}'.format(TIMESTAMP)

In [18]:
@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name="pipeline-train-churn")

def pipeline(
    prefix: str = 'gs://gpa-churn/data/processed/input/',
    project: str = PROJECT_ID,
    region: str = REGION, 
    display_name: str = DISPLAY_NAME,
    #api_endpoint: str = REGION+"-aiplatform.googleapis.com",
    f1_threshold: float = 0.6,
    serving_container_image_uri: str = "gcr.io/gpa-poc-001/churn-base-image@sha256:c18908472b7cc7663502e660943ea6c2e5c0a14054cdb20f762f37b7b5df55d7:latest"
    ):
    
    data_op = get_preprocessed_data(
        prefix='gs://gpa-churn/data/processed/input/'
        )
    
    fe_pipe_op = feature_engineering_sequence(
        data_op.outputs['Xtrain_'],
        data_op.outputs['Xval_']
        )
    
    fs_pipe_op = feature_selection_sequence(
        fe_pipe_op.outputs['Xtrain_fe'],
        fe_pipe_op.outputs['Xval_fe'],
        data_op.outputs['ytrain_'],
        data_op.outputs['yval_']
        )
    
    train_model_op = train_model(
        fs_pipe_op.outputs['Xtrain_fs'],
        fs_pipe_op.outputs['Xval_fs'],
        data_op.outputs['ytrain_'],
        data_op.outputs['yval_']
        )
    
    model_evaluation_op = evaluate_model(
        train_model_op.outputs['metrics_'],
        0.6
    )
    
    with dsl.Condition(
        model_evaluation_op.outputs['deploy']=='true',
        name="deploy-condition",
    ):
           
        deploy_model_op = sim_deploy(train_model_op.outputs['model_'])

In [19]:
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='ml_pipeline_test.json')



---

In [20]:
start_pipeline = pipeline_jobs.PipelineJob(
    display_name="churn-test-pipeline",
    template_path="ml_pipeline_test.json",
    enable_caching=True,
    location=REGION,
)

In [21]:
start_pipeline.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220602125836
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220602125836')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/southamerica-east1/pipelines/runs/pipeline-train-churn-20220602125836?project=437364709834
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220602125836 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220602125836 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/437364709834/locations/southamerica-east1/pipelineJobs/pipeline-train-churn-20220602125836 current state:
PipelineState.PIPELINE_STATE_RUNNING

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [evaluate-model].; Job (project_id = gpa-poc-001, job_id = 2714333569165033472) is failed due to the above error.; Failed to handle the job: {project_number = 437364709834, job_id = 2714333569165033472}"
