### Import Libraries

In [None]:
# import required libraries
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


### Parameters

In [None]:
# tag cell with parameters
# Blank placeholders for the notebook parameters; only MODEL_ID has a value here.
PROJECT_ID = ""
BUCKET_NAME = ""
DATASET_ID = ""
RESOURCE_BUCKET = ""
FILE_BUCKET = ""
REGION = ""
MODEL_ID = "5090"

In [None]:
# tag cell with parameters
# Concrete values for the development project.
PROJECT_ID = "divg-josh-pr-d1cc3a"
DATASET_ID = "call_to_retention_dataset"
MODEL_ID = "5090"

# All three bucket settings point at the same default bucket.
BUCKET_NAME = "divg-josh-pr-d1cc3a-default"
RESOURCE_BUCKET = BUCKET_NAME
FILE_BUCKET = BUCKET_NAME

### Service Parameters

In [None]:
# Identifiers for the call-to-retention service.
SERVICE_TYPE = "call_to_retention"
# Hyphenated variant of the service identifier.
SERVICE_TYPE_NAME = SERVICE_TYPE.replace("_", "-")
# BigQuery table holding the targets.
TABLE_ID = f"bq_{SERVICE_TYPE}_targets"
REGION = 'northamerica-northeast1'

### Pulumi Parameters

In [None]:
# Stack and pipeline naming; the pipeline names must match pulumi.yaml.
STACK_NAME = "call_to_retention"

TRAIN_PIPELINE_NAME_PATH = "train_pipeline"
PREDICT_PIPELINE_NAME_PATH = "predict_pipeline"

TRAIN_PIPELINE_NAME = "call-to-retention-train-pipeline"  # Same name as pulumi.yaml
PREDICT_PIPELINE_NAME = "call-to-retention-predict-pipeline"  # Same name as pulumi.yaml

# The descriptions reuse the pipeline names verbatim.
TRAIN_PIPELINE_DESCRIPTION = TRAIN_PIPELINE_NAME
PREDICT_PIPELINE_DESCRIPTION = PREDICT_PIPELINE_NAME

REGION = 'northamerica-northeast1'

### Query + Pre-Processing Component Parameters

In [None]:
# Locations of the training pipeline's queries and utils, relative to the stack root.
_TRAIN_ROOT = f"{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}"
TRAIN_QUERIES_PATH = f"{_TRAIN_ROOT}/queries/"
TRAIN_UTILS_FILE_PATH = f"{_TRAIN_ROOT}/utils"
UTILS_FILENAME = "utils.py"

# Table names for the serving data.
PROCESSED_SERVING_DATA_TABLENAME = "processed_serving_data"
INPUT_SERVING_DATA_TABLENAME = "input_serving_data"

# Yesterday's date as YYYY-MM-DD.
QUERY_DATE = (date.today() - timedelta(days=1)).isoformat()
# Fully qualified project.dataset.table reference of the targets table.
TARGET_TABLE_REF = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

QUERIES_PATH = "call_to_retention/queries/"


### Import Pipeline Components

In [None]:
# Download the pipeline component files from the resource bucket into a local
# `components/` directory so they can be imported as a package below.
prefix = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    # Skip names ending in "/" (directory-style entries).
    if blob.name.endswith("/"):
        continue
    # Listed names always start with `prefix`, so slice it off rather than
    # splitting on it: splitting returns the wrong tail when the prefix text
    # happens to occur again deeper inside the object name.
    relative_name = blob.name[len(prefix):]
    file_path = Path(dl_dir) / relative_name
    file_path.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(file_path))

# import main pipeline components (the directory populated above)
import components


### Date Parameters

In [None]:
def _window_dates(anchor):
    """Return the formatted date strings derived from one scoring date.

    Every value is computed from `anchor` (a `datetime.date`): the scoring
    date itself in compact and dashed form, the first day of the month six
    months earlier, the first and last day of the previous month, and the
    first days of the months three and four months ahead.
    """
    dashed = '%Y-%m-%d'
    month_start = anchor.replace(day=1)
    prev_month_end = month_start - timedelta(days=1)
    return {
        'compact': anchor.strftime('%Y%m%d'),
        'dash': anchor.strftime(dashed),
        'minus_6_mos': (anchor - relativedelta(months=6)).replace(day=1).strftime(dashed),
        'last_month_start': prev_month_end.replace(day=1).strftime(dashed),
        'last_month_end': prev_month_end.strftime(dashed),
        'promo_start': (month_start + relativedelta(months=3)).replace(day=1).strftime(dashed),
        'promo_end': (month_start + relativedelta(months=4)).replace(day=1).strftime(dashed),
    }


# Hard-coded scoring dates; the trailing comments keep the dynamic
# expressions that were left next to them.
scoringDate = date(2022, 9, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
valScoringDate = date(2022, 10, 1)  # scoringDate - relativedelta(days=2)

# training dates
_train_dates = _window_dates(scoringDate)
SCORE_DATE = _train_dates['compact']  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = _train_dates['dash']
SCORE_DATE_MINUS_6_MOS_DASH = _train_dates['minus_6_mos']
SCORE_DATE_LAST_MONTH_START_DASH = _train_dates['last_month_start']
SCORE_DATE_LAST_MONTH_END_DASH = _train_dates['last_month_end']
PROMO_EXPIRY_START = _train_dates['promo_start']
PROMO_EXPIRY_END = _train_dates['promo_end']

# validation dates
_val_dates = _window_dates(valScoringDate)
SCORE_DATE_VAL = _val_dates['compact']
SCORE_DATE_VAL_DASH = _val_dates['dash']
SCORE_DATE_VAL_MINUS_6_MOS_DASH = _val_dates['minus_6_mos']
SCORE_DATE_VAL_LAST_MONTH_START_DASH = _val_dates['last_month_start']
SCORE_DATE_VAL_LAST_MONTH_END_DASH = _val_dates['last_month_end']
PROMO_EXPIRY_START_VAL = _val_dates['promo_start']
PROMO_EXPIRY_END_VAL = _val_dates['promo_end']

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


### bq_create_dataset.py

In [None]:
import kfp
from kfp import dsl
# from kfp.v2.dsl import (Model, Input, component)
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,HTML,
                        OutputPath, ClassificationMetrics, Metrics, component)
from typing import NamedTuple
# Create Training Dataset for training pipeline
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="bq_create_dataset.yaml",
)
def bq_create_dataset(score_date: str,
                      score_date_delta: int,
                      project_id: str,
                      dataset_id: str,
                      region: str,
                      promo_expiry_start: str, 
                      promo_expiry_end: str, 
                      v_start_date: str,
                      v_end_date: str) -> NamedTuple("output", [("col_list", list)]):
    """Rebuild the pipeline dataset in BigQuery and return its column names.

    Calls the stored procedure `{dataset_id}.bq_sp_ctr_pipeline_dataset` with
    the supplied dates, logs the partition metadata of
    `bq_ctr_pipeline_dataset`, and returns that table's columns.

    Args:
        score_date: Scoring date (YYYY-MM-DD) passed to the stored procedure.
        score_date_delta: Accepted for interface compatibility; not used here.
        project_id: GCP project the BigQuery client is created for.
        dataset_id: Dataset containing the stored procedure and output table.
        region: Accepted for interface compatibility; not used here.
        promo_expiry_start: Start of the promo-expiry window (YYYY-MM-DD).
        promo_expiry_end: End of the promo-expiry window (YYYY-MM-DD).
        v_start_date: `start_date` argument of the stored procedure.
        v_end_date: `end_date` argument of the stored procedure.

    Returns:
        A one-element tuple `(col_list,)` with the output table's column names.
    """
    # Every module used below is imported inside the function so that the
    # component body is self-contained and does not rely on notebook-level
    # imports (os / google were previously referenced without being imported).
    import logging
    import os
    from datetime import datetime

    import google.oauth2.credentials
    from google.cloud import bigquery
    # For wb
    # import google.oauth2.credentials
    # CREDENTIALS = google.oauth2.credentials.Credentials(token)

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default from the local gcloud access token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # The command output ends with a newline that must not be part of the token.
        token = os.popen('gcloud auth print-access-token').read().strip()
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)
    # client = bigquery.Client(project=project_id, location=region)
    job_config = bigquery.QueryJobConfig()

    # Change dataset / table + sp table name to version in bi-layer
    query =\
        f'''
            DECLARE score_date DATE DEFAULT "{score_date}";
            DECLARE promo_expiry_start DATE DEFAULT "{promo_expiry_start}";
            DECLARE promo_expiry_end DATE DEFAULT "{promo_expiry_end}";
            DECLARE start_date DATE DEFAULT "{v_start_date}";
            DECLARE end_date DATE DEFAULT "{v_end_date}";
        
            -- Change dataset / sp name to the version in the bi_layer
            CALL {dataset_id}.bq_sp_ctr_pipeline_dataset(score_date, promo_expiry_start, promo_expiry_end, start_date, end_date);

            SELECT
                *
            FROM {dataset_id}.INFORMATION_SCHEMA.PARTITIONS
            WHERE table_name='bq_ctr_pipeline_dataset'
            
        '''

    df = client.query(query, job_config=job_config).to_dataframe()
    logging.info(df.to_string())

    if df.empty:
        # Indexing row 0 of an empty frame would raise; report it instead.
        logging.warning(
            "No partition metadata returned for %s.bq_ctr_pipeline_dataset", dataset_id
        )
    else:
        # Only the first returned row is summarised.
        logging.info(
            "Loaded %s rows into %s.%s.%s on %s !",
            df.total_rows.iloc[0],
            df.table_catalog.iloc[0],
            df.table_schema.iloc[0],
            df.table_name.iloc[0],
            df.last_modified_time.iloc[0].strftime('%Y-%m-%d %H:%M:%S'),
        )

    ######################################## Save column list ##########################
    # Only the column names are needed, so fetch zero rows rather than
    # downloading the entire table.
    query =\
        f'''
           SELECT
                *
            FROM {dataset_id}.bq_ctr_pipeline_dataset
            LIMIT 0
        '''

    df = client.query(query, job_config=job_config).to_dataframe()

    col_list = list(df.columns)
    return (col_list,)
    

### Preprocess

In [None]:
def preprocess(
        pipeline_dataset: str, 
        save_data_path: str,
        project_id: str,
        dataset_id: str
):
    """Load the pipeline dataset from BigQuery, engineer demo features, save as gzip CSV.

    Args:
        pipeline_dataset: Table name inside `dataset_id` to read.
        save_data_path: Destination path of the gzip-compressed CSV.
        project_id: GCP project of the table and of the BigQuery client.
        dataset_id: BigQuery dataset containing `pipeline_dataset`.

    The table is indexed by `ban`; `demo_sgname` / `demo_lsname` are replaced by
    urban / rural / family flags plus one-hot columns for `demo_lsname`, and
    spaces and hyphens in column names are replaced by underscores.
    """
    import gc
    import os
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default from the local gcloud access token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # The command output ends with a newline that must not be part of the token.
        token = os.popen('gcloud auth print-access-token').read().strip()
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    # bq_client = bigquery.Client(project=project_id)

    # pipeline_dataset 
    pipeline_dataset_name = f"{project_id}.{dataset_id}.{pipeline_dataset}" 
    build_df_pipeline_dataset = f'SELECT * FROM `{pipeline_dataset_name}`'
    df_final = client.query(build_df_pipeline_dataset).to_dataframe()
    df_final = df_final.set_index('ban')

    # demo columns: 1 when the keyword occurs in the (lower-cased) name;
    # missing names count as 0.
    sgname = df_final.demo_sgname.str.lower()
    lsname = df_final.demo_lsname.str.lower()
    df_final['demo_urban_flag'] = sgname.str.contains('urban').fillna(0).astype(int)
    df_final['demo_rural_flag'] = sgname.str.contains('rural').fillna(0).astype(int)
    df_final['demo_family_flag'] = lsname.str.contains('families').fillna(0).astype(int)

    # One-hot encode demo_lsname with column-name-safe labels.
    df_income_dummies = pd.get_dummies(df_final[['demo_lsname']])
    df_income_dummies.columns = df_income_dummies.columns.str.replace('&', 'and')
    df_income_dummies.columns = df_income_dummies.columns.str.replace(' ', '_')

    # The raw text columns are dropped and replaced by the dummies. The frame
    # is transformed without intermediate full copies, which only raised peak
    # memory.
    df_final = df_final.drop(columns=['demo_sgname', 'demo_lsname'])
    df_final = df_final.join(df_income_dummies)
    del df_income_dummies

    #column name clean-up
    df_final.columns = df_final.columns.str.replace(' ', '_')
    df_final.columns = df_final.columns.str.replace('-', '_')
    gc.collect()
    print('......df_final done')

    # Re-assign every column from a plain Python list.
    # NOTE(review): presumably this normalises BigQuery extension dtypes before
    # writing the CSV - confirm before removing.
    for f in df_final.columns:
        df_final[f] = list(df_final[f])

    df_final.to_csv(save_data_path, index=True, compression='gzip') 
    del df_final
    gc.collect()
    print(f'......csv saved in {save_data_path}')
    # NOTE(review): fixed two-minute pause kept from the original; its purpose
    # is not visible in this file.
    time.sleep(120)


### Train and Save Model 

In [None]:
def train_and_save_model(
            file_bucket: str,
            service_type: str,
            score_date_dash: str,
            score_date_val_dash: str,
            project_id: str,
            dataset_id: str
):
    """Train an XGBoost classifier on the preprocessed data and store it in GCS.

    Reads `gs://{file_bucket}/{service_type}_train.csv.gz` and
    `..._validation.csv.gz`, attaches the `target_ind` label from
    `{project_id}.{dataset_id}.bq_call_to_retention_targets` for the month of
    each scoring date, fits the model on an 80% stratified split of the
    training data, writes a decile lift table for the validation file to GCS,
    and uploads a pickled dict with the model, feature list and creation time.

    Args:
        file_bucket: GCS bucket with the input CSVs; also receives the outputs.
        service_type: Prefix of the CSV file names and of the model folder.
        score_date_dash: Training scoring date, YYYY-MM-DD.
        score_date_val_dash: Validation scoring date, YYYY-MM-DD.
        project_id: GCP project of the targets table.
        dataset_id: BigQuery dataset of the targets table.
    """
    import gc
    import pickle
    import time
    from datetime import datetime

    import numpy as np
    import pandas as pd
    import xgboost as xgb
    from google.cloud import bigquery
    from google.cloud import storage
    from sklearn.model_selection import train_test_split

    def get_lift(prob, y_test, q):
        """Return a per-quantile table of predicted rate, actual rate and lift."""
        result = pd.DataFrame(columns=['Prob', 'Call_To_Retention'])
        result['Prob'] = prob
        result['Call_To_Retention'] = y_test
        # Highest probabilities get label 1, lowest get label q.
        result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
        add = pd.DataFrame(result.groupby('Decile')['Call_To_Retention'].mean()).reset_index()
        add.columns = ['Decile', 'avg_real_call_to_retention_rate']
        result = result.merge(add, on='Decile', how='left')
        result.sort_values('Decile', ascending=True, inplace=True)
        lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
        lg.columns = ['Decile', 'avg_model_pred_call_to_retention_rate']
        lg.sort_values('Decile', ascending=False, inplace=True)
        lg['avg_call_to_retention_rate_total'] = result['Call_To_Retention'].mean()
        lg = lg.merge(add, on='Decile', how='left')
        lg['lift'] = lg['avg_real_call_to_retention_rate'] / lg['avg_call_to_retention_rate_total']

        return lg

    def attach_target(df_features, df_targets, score_date):
        """Left-join the month's last target row per ban; drop unlabelled rows."""
        year_month = '-'.join(score_date.split('-')[:2])  # '2022-08-31' -> '2022-08'
        # .copy() so the assignments below act on an independent frame.
        df_month = df_targets.loc[df_targets['YEAR_MONTH'] == year_month].copy()
        df_month['ban'] = df_month['ban'].astype('int64')
        df_month = df_month.groupby('ban').tail(1)
        df_labelled = df_features.merge(df_month[['ban', 'target_ind']], on='ban', how='left')
        df_labelled = df_labelled.rename(columns={'target_ind': 'target'})
        df_labelled = df_labelled.dropna(subset=['target'])
        df_labelled['target'] = df_labelled['target'].astype(int)
        return df_labelled

    df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                           compression='gzip')
    df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),
                          compression='gzip')

    # The train and validation labels come from the same table, so it is
    # queried once and filtered per scoring month.
    client = bigquery.Client(project=project_id)
    sql_targets = ''' SELECT * FROM `{}.{}.bq_call_to_retention_targets` '''.format(project_id, dataset_id)
    df_targets = client.query(sql_targets).to_dataframe()

    #set up df_train
    df_train = attach_target(df_train, df_targets, score_date_dash)
    print(df_train.shape)

    #set up df_test
    df_test = attach_target(df_test, df_targets, score_date_val_dash)
    print(df_test.shape)

    del df_targets

    # Features are the columns present in both files, kept in df_train's column
    # order so the feature order is the same on every run.
    shared_cols = set(df_test.columns)
    features = [f for f in df_train.columns
                if f in shared_cols and f not in ('ban', 'target')]

    # Stratified 80/20 split; the 20% fold is held out but not used below.
    df_train, df_val = train_test_split(df_train, shuffle=True, test_size=0.2, random_state=42,
                                        stratify=df_train['target']
                                        )

    X_train = df_train[features]
    y_train = np.squeeze(df_train['target'].values)

    X_test = df_test[features]
    y_test = np.squeeze(df_test['target'].values)

    del df_train, df_val, df_test
    gc.collect()

    # build model and fit in training data
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=8,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1
        # seed=27
    )

    xgb_model.fit(X_train, y_train)
    print('xgb training done')

    # Predict with the full model. The previous `ntree_limit=best_iteration`
    # passed a zero-based iteration index where a tree count is expected, and
    # no early stopping is used here, so there is no "best" iteration to cap at.
    pred_prb = xgb_model.predict_proba(X_test)[:, 1]
    lg = get_lift(pred_prb, y_test, 10)

    # save the model in GCS
    models_dict = {}
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    models_dict['create_time'] = create_time
    models_dict['model'] = xgb_model
    models_dict['features'] = features
    # index=False belongs to to_csv; it used to be passed to str.format, where
    # it was silently ignored.
    lg.to_csv('gs://{}/lift_on_scoring_data_{}.csv'.format(file_bucket, create_time), index=False)

    with open('model_dict.pkl', 'wb') as handle:
        pickle.dump(models_dict, handle)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # Create an empty placeholder object for the model "folder" if it is missing.
    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    blob = bucket.blob(MODEL_PATH)
    if not blob.exists(storage_client):
        blob.upload_from_string('')

    model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
    blob = bucket.blob(model_name_onbkt)
    blob.upload_from_filename('model_dict.pkl')

    print(f"....model loaded to GCS done at {str(create_time)}")

    # NOTE(review): fixed two-minute pause kept from the original; its purpose
    # is not visible in this file.
    time.sleep(120)


### pycaret_automl

In [None]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# Lower-case aliases of the notebook parameters used by the cells below.
project_id = PROJECT_ID
dataset_id = DATASET_ID
region = REGION
service_type = SERVICE_TYPE
bucket_name = BUCKET_NAME
file_bucket = FILE_BUCKET
score_date_dash= SCORE_DATE_DASH
score_date_val_dash= SCORE_DATE_VAL_DASH

from pycaret.classification import setup,create_model,tune_model, predict_model,get_config,compare_models,save_model

df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                       compression='gzip')  
df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),  
                      compression='gzip')

def get_gcp_bqclient(project_id, use_local_credential=True):
    """Build a BigQuery client, by default from the local gcloud access token."""
    if not use_local_credential:
        return bigquery.Client(project=project_id)
    # The command output ends with a newline that must not be part of the token.
    token = os.popen('gcloud auth print-access-token').read().strip()
    credentials = google.oauth2.credentials.Credentials(token)
    return bigquery.Client(project=project_id, credentials=credentials)


def _month_targets(df_targets, score_date):
    """Return the last target row per ban for the month of `score_date`."""
    year_month = '-'.join(score_date.split('-')[:2])  # '2022-08-31' -> '2022-08'
    # .copy() so the assignment below acts on an independent frame.
    df_month = df_targets.loc[df_targets['YEAR_MONTH'] == year_month].copy()
    df_month['ban'] = df_month['ban'].astype('int64')
    return df_month.groupby('ban').tail(1)


def _attach_target(df_features, df_month):
    """Left-join `target_ind` as integer `target`; drop rows without a label."""
    df_labelled = df_features.merge(df_month[['ban', 'target_ind']], on='ban', how='left')
    df_labelled = df_labelled.rename(columns={'target_ind': 'target'})
    df_labelled = df_labelled.dropna(subset=['target'])
    df_labelled['target'] = df_labelled['target'].astype(int)
    return df_labelled


client = get_gcp_bqclient(project_id)

# The train and validation labels come from the same table, so it is queried
# once and filtered per scoring month.
sql_train = ''' SELECT * FROM `{}.{}.bq_call_to_retention_targets` '''.format(project_id, dataset_id) 
sql_test = sql_train
_df_targets_all = client.query(sql_train).to_dataframe()

#set up df_train
df_target_train = _month_targets(_df_targets_all, score_date_dash)
df_train = _attach_target(df_train, df_target_train)
print(df_train.shape)

#set up df_test
df_target_test = _month_targets(_df_targets_all, score_date_val_dash)
df_test = _attach_target(df_test, df_target_test)
print(df_test.shape)

del _df_targets_all

#set up features (list)
cols_1 = df_train.columns.values
cols_2 = df_test.columns.values
cols = set(cols_1).intersection(set(cols_2))
# Iterate df_train's columns (not the set) so the feature order is the same on
# every run.
features = [f for f in cols_1 if f in cols and f not in ['ban', 'target']]

# Stratified 70/30 split of the labelled training data.
df_train, df_val = train_test_split(df_train, shuffle=True, test_size=0.3, random_state=42,
                                    stratify=df_train['target']
                                    )

# Frames without the `ban` identifier column.
train_sampled = df_train.drop(columns=['ban'])
valid_sampled = df_val.drop(columns=['ban'])
test_sampled = df_test.drop(columns=['ban'])

print(train_sampled.columns) 
print(test_sampled.columns)

In [None]:
# Report (rows, columns) of the train, validation and test frames, in that order.
for _frame in (train_sampled, valid_sampled, test_sampled):
    print(_frame.shape)

In [None]:

# `datetime.now()` is used below, but only `date` and `timedelta` are imported
# at the top of the notebook, so the class is imported here.
from datetime import datetime

# Identifier, feature matrix and label vector for each split.
ban_train = df_train['ban']
X_train = df_train[features]
y_train = np.squeeze(df_train['target'].values)

ban_val = df_val['ban']
X_val = df_val[features]
y_val = np.squeeze(df_val['target'].values)

ban_test = df_test['ban']
X_test = df_test[features]
y_test = np.squeeze(df_test['target'].values)

del df_train, df_val, df_test
gc.collect()

################################ Pycaret Setup initialize  ############################ 
# NOTE(review): `numeric_features` is not defined anywhere in this notebook;
# it must be supplied before this cell can run - confirm the intended list.
classification_setup = setup(data=train_sampled, 
                         # ignore_features=drop_cols,
                         test_data = valid_sampled,
                         target='target',
                         fix_imbalance=False,
                         remove_outliers = True,
                         normalize=True,
                         normalize_method='zscore',
                         log_experiment=False,
                         remove_multicollinearity=True,
                         multicollinearity_threshold=0.95,
                         feature_selection=True,
                         fold=5,
                         fold_shuffle=True,
                         session_id=123,
                         numeric_features=numeric_features,
                         silent=True)

### Pycaret top 3 models to analyze
best_model = compare_models(include = ['rf','xgboost','lightgbm','et'],errors='raise', n_select=3)

# save the model reports and report figures of the top 3 models to GCS
# NOTE(review): `evaluate_and_save_models` is only imported (from plotly_utils)
# in a later cell; that import has to run before this call - confirm cell order.
todays_date = datetime.now().strftime("%Y-%m-%d")
save_path = f'pycaret/{todays_date}/'
model_reports, model_to_report_map = evaluate_and_save_models(models=best_model.copy(), 
                                     bucket_name=bucket_name,
                                     save_path=save_path, 
                                     test_df=test_sampled,
                                     actual_label_str='target',
                                     columns = get_config('X_train').columns,
                                     save_columns=True,
                                     show_report=False)



In [None]:


from google.cloud import bigquery
from google.cloud import storage
from datetime import datetime
import logging
from pycaret.classification import setup,create_model,tune_model, predict_model,get_config,compare_models,save_model,tune_model
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve, mean_squared_error, f1_score, precision_score, recall_score, confusion_matrix, roc_curve

# NOTE(review): this cell looks like a function body pasted without its `def`
# line. It has been dedented so it is valid top-level code, but it still
# expects these names to be defined beforehand: resources_bucket_name,
# utils_file_path, utils_filename, plot_utils_filename, dataset,
# training_dataset, training_perc - confirm where they come from.

### import data
# CREDENTIALS = google.oauth2.credentials.Credentials(token)
# # import google.oauth2.credentials

client = bigquery.Client(project=project_id, location='northamerica-northeast1')
storage_client = storage.Client(project=project_id)

# Download the two helper modules from the resources bucket into the working
# directory so they can be imported below.
bucket = storage_client.get_bucket(resources_bucket_name)
blob = bucket.get_blob(f"{utils_file_path}/{utils_filename}")
blob.download_to_filename(utils_filename)
blob = bucket.get_blob(f"{utils_file_path}/{plot_utils_filename}")
blob.download_to_filename(plot_utils_filename)

from preprocessing_utils import pre_process_data
from preprocessing_utils import downsampling
from plotly_utils import evaluate_and_save_models,create_folder_if_not_exists,ploty_model_metrics,plotly_feature_importance,plotly_lift_curve, plotly_model_report,plotly_roc, plotly_confusion_matrix,plotly_output_hist,plotly_precision_recall 

# specify the path to the training data
training_table = f"{project_id}.{dataset}.{training_dataset}"

# generate the query
train_query = '''
       SELECT * 
                FROM `{training_table}`
    '''.format(training_table = training_table)

job_config = bigquery.QueryJobConfig()

# create a dataframe with the training data
train_all = client.query(train_query, job_config=job_config).to_dataframe()

##############  Split train/valid/test based of Dev Training Sample Size   #######################
# training_perc = 0.62
# Sort once by partition date and slice: the first `training_perc` share is
# train, the next half of the remainder is validation, the rest is test.
train_all_sorted = train_all.sort_values(["partition_dt"])

lower_bound = int(train_all.shape[0]*training_perc)
upper_bound = lower_bound + int(train_all.shape[0]*((1-training_perc)/2))
train_df = train_all_sorted.iloc[:lower_bound]
valid_df = train_all_sorted.iloc[lower_bound:upper_bound]

# Test takes everything after the train and validation rows.
lower_bound = train_df.shape[0] + valid_df.shape[0]
test_df = train_all_sorted.iloc[lower_bound:]
    

### Pipeline

In [None]:
# @dsl.pipeline(
#     # A name for the pipeline.
#     name="{}-xgb-pipeline".format(SERVICE_TYPE_NAME),
#     description=' pipeline for training {} model'.format(SERVICE_TYPE_NAME)
# )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET, 
        file_bucket: str = FILE_BUCKET
    ):
    """Run dataset creation, preprocessing and model training in sequence.

    Builds and preprocesses the training data, then the validation data (both
    go through the same `bq_ctr_pipeline_dataset` table), then trains and
    saves the model.

    Args:
        project_id: GCP project passed to every step.
        region: Region passed to the dataset-creation step.
        resource_bucket: Accepted for interface compatibility; not used here.
        file_bucket: Bucket the preprocessed CSVs are written to and read from.

    The arguments are now actually used; previously the body read the global
    constants, so passing different values had no effect. The defaults are
    those same constants, so existing calls behave as before.
    NOTE(review): `file_bucket` is interpolated with `str.format`; if the
    commented-out `@dsl.pipeline` decorator is restored, confirm this still
    works with pipeline parameters.
    """
    # ----- create training set --------
    bq_create_training_dataset_op = bq_create_dataset(score_date=SCORE_DATE_DASH,
                          score_date_delta=SCORE_DATE_DELTA,
                          project_id=project_id,
                          dataset_id=DATASET_ID,
                          region=region,
                          promo_expiry_start=PROMO_EXPIRY_START, 
                          promo_expiry_end=PROMO_EXPIRY_END, 
                          v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
                          v_end_date=SCORE_DATE_LAST_MONTH_END_DASH)

    # ----- preprocessing train data --------
    preprocess_train_op = preprocess(
        pipeline_dataset='bq_ctr_pipeline_dataset', 
        save_data_path='gs://{}/{}_train.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID
    )

    # preprocess_train_op.set_memory_limit('128G')
    # preprocess_train_op.set_cpu_limit('32')

    # ----- create validation set --------
    bq_create_validation_dataset_op = bq_create_dataset(score_date=SCORE_DATE_VAL_DASH,
                          score_date_delta=SCORE_DATE_VAL_DELTA,
                          project_id=project_id,
                          dataset_id=DATASET_ID,
                          region=region,
                          promo_expiry_start=PROMO_EXPIRY_START_VAL, 
                          promo_expiry_end=PROMO_EXPIRY_END_VAL, 
                          v_start_date=SCORE_DATE_VAL_MINUS_6_MOS_DASH,
                          v_end_date=SCORE_DATE_VAL_LAST_MONTH_END_DASH)

    # ----- preprocessing validation data --------
    preprocess_validation_op = preprocess(
        pipeline_dataset='bq_ctr_pipeline_dataset', 
        save_data_path='gs://{}/{}_validation.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID
    )

    # preprocess_validation_op.set_memory_limit('256G')
    # preprocess_validation_op.set_cpu_limit('32')

    # ----- train and save the model --------
    train_and_save_model_op = train_and_save_model(file_bucket=file_bucket,
                                                   service_type=SERVICE_TYPE,
                                                   score_date_dash=SCORE_DATE_DASH,
                                                   score_date_val_dash=SCORE_DATE_VAL_DASH,
                                                   project_id=project_id,
                                                   dataset_id=DATASET_ID,
                                                   )
    
#     train_and_save_model_op.set_memory_limit('256G')
#     train_and_save_model_op.set_cpu_limit('32')

#     train_and_save_model_op.after(preprocess_train_op)
#     train_and_save_model_op.after(preprocess_validation_op)


### Run the Pipeline Job

In [None]:
# pipeline(project_id = PROJECT_ID,
#         region = REGION,
#         resource_bucket = RESOURCE_BUCKET,
#         file_bucket = FILE_BUCKET)


# Run the pipeline with the notebook-level parameters.
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
    file_bucket=FILE_BUCKET,
)

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs

# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                display_name=PIPELINE_NAME,
#                                template_path="pipeline.json",
#                                location=REGION,
#                                enable_caching=False,
#                                pipeline_root = f"gs://{RESOURCE_BUCKET}"
# )
# job.run(
#    service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com"
# )