### Import Libraries

In [None]:
# import required libraries
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op

import logging 

In [None]:
from google.cloud import bigquery

import pandas as pd
import numpy as np
from datetime import datetime
pd.options.display.max_columns = 170
from datetime import date, timedelta
import sys

from sklearn import preprocessing
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib
matplotlib.rcParams['figure.figsize'] = (15, 5)

import plotly.express as px

import math

import scipy.stats as stats
import statsmodels.stats.api as sms

from pycaret.utils import check_metric
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

np.random.seed(0)

from pycaret.classification import * 

# Lookup table: pandas/numpy column dtype -> BigQuery column type name.
# Used to derive a load schema from a DataFrame's dtypes. Note that
# datetime64[ns] columns are declared as DATE, and both the numpy int64 dtype
# and the pandas nullable Int64 dtype are declared as INTEGER.
dtype_bq_mapping = {
    column_dtype: bq_type
    for column_dtype, bq_type in (
        (np.dtype('int64'), "INTEGER"),
        (np.dtype('float64'), "FLOAT"),
        (np.dtype('float32'), "FLOAT"),
        (np.dtype('object'), "STRING"),
        (np.dtype('bool'), "BOOLEAN"),
        (np.dtype('datetime64[ns]'), "DATE"),
        (pd.Int64Dtype(), "INTEGER"),
    )
}

def export_dataframe_to_bq(df, table_id='', schema_list=[], generate_schema=True, write='overwrite'):
    """
    Load a DataFrame into a BigQuery table and print a short summary.

    Inputs:
    df: dataframe to export to BQ
    table_id: destination table, e.g. 'project_id.dataset_id.table_name'.
        An empty string prints an error and nothing is loaded.
    schema_list: list of bigquery.SchemaField objects. Only used when
        generate_schema is False. The default list is never mutated here.
    generate_schema: True builds one NULLABLE SchemaField per column from
        dtype_bq_mapping (a dtype missing from that mapping raises KeyError).
        False uses schema_list as given.
    write: 'overwrite' replaces the table contents (WRITE_TRUNCATE); any other
        value appends (WRITE_APPEND).

    Returns None. Validation problems are reported with print(), not raised.
    """
    write_type = 'WRITE_TRUNCATE' if write == 'overwrite' else 'WRITE_APPEND'

    # The length is evaluated unconditionally, so a schema_list without len()
    # fails here even when the schema is going to be generated.
    schema_is_empty = len(schema_list) == 0
    if generate_schema == False and schema_is_empty:
        print('Error: Provide Schema List, otherwise set generate_schema to True')
        return

    if table_id == '':
        print('Error: Provide table_id')
        return

    if generate_schema == True:
        schema_list = [
            bigquery.SchemaField(name, dtype_bq_mapping[df.dtypes[name]], mode='NULLABLE')
            for name in df.columns
        ]
    print(schema_list)

    # Send the frame to BigQuery and wait for the load job to finish.
    bq_client = bigquery.Client()
    load_config = bigquery.LoadJobConfig(schema=schema_list, write_disposition=write_type)
    load_job = bq_client.load_table_from_dataframe(df, table_id, job_config=load_config)
    load_job.result()

    # Re-read the table metadata (one more API request) for the summary line.
    loaded_table = bq_client.get_table(table_id)
    print("Loaded {} rows and {} columns to {}".format(loaded_table.num_rows, len(loaded_table.schema), table_id))


def woe_cat_columns(df, cat_columns, target='rpc_flag'):
    """Replace categorical columns with their Weight-of-Evidence (WoE) values.

    For each column in ``cat_columns`` the share of every category among the
    ``target == 1`` rows and among the ``target == 0`` rows is computed; the
    category is then encoded as ``log(share_1 / share_0)``.

    Args:
        df: DataFrame containing the categorical columns and the target.
            It is modified in place: every listed column is overwritten with
            its WoE encoding.
        cat_columns: Iterable of column names to encode.
        target: Name of the binary target column, whose values must be 0 and 1.
            Defaults to ``'rpc_flag'``.

    Returns:
        Tuple ``(df, dict_list)``: the same (mutated) DataFrame and one
        ``{category: woe}`` dict per encoded column, in ``cat_columns`` order.

    Notes:
        A category that never occurs in one of the two classes has a zero
        share there, so its WoE is +/- infinity. Values missing from the
        mapping (e.g. NaN categories) are encoded as NaN by ``Series.map``.
    """
    dict_list = []
    for col in cat_columns:
        print(col)
        # Column-normalised crosstab: per-class distribution over categories.
        class_shares = pd.crosstab(df[col], df[target], normalize='columns')
        woe = np.log(class_shares[1] / class_shares[0])
        woe_by_category = dict(zip(woe.index, woe))

        df[col] = df[col].map(woe_by_category)
        dict_list.append(woe_by_category)
    return df, dict_list

def pre_process_data(df, plan_df, whsia_plan_df):
    """
    Clean the raw feature frame: drop empty columns, derive plan_type and
    device_age_days, filter out non-cellular rows, fill missing values and
    normalise date / flag columns.

    Inputs:
    df: raw feature DataFrame. NOTE: fully-empty columns are dropped from this
        object in place before the function starts working on filtered copies.
    plan_df: DataFrame whose columns (Plan_5G, Plan_5G_Plus, Unlimited_Plan,
        POM_POMC, POM, POMC, Plan_4G) each list price-plan codes of that type.
    whsia_plan_df: DataFrame with a soc_cd column of WHSIA plan codes.

    Returns the processed DataFrame (a new object with a reset index).
    """
    # Percentage of missing values per column.
    df_missing_values = pd.DataFrame(df.isnull().sum()/df.shape[0]*100, columns=['perc_missing'])
    # Display(df_missing_values.transpose())

    # Drop features where entire column is empty
    drop_cols = list(df_missing_values.loc[df_missing_values['perc_missing'] == 100].index)
    df.drop(columns=drop_cols, inplace=True)

    # NOTE(review): numeric_features is not used anywhere in this function.
    numeric_features = df.select_dtypes(include=['int64','float64']).columns
    # Feature groups selected by substring match on the remaining column names.
    ccai_features = [col for col in df.columns if 'ccai' in col or 'convrstn_durtn' in col]
    marketing_msg_features = [col for col in df.columns if 'mrkt_msg' in col]
    trip_bank_features = [col for col in df.columns if 'total_trips' in col or 'total_vacation' in col]
    billing_features = [col for col in df.columns if 'befor_discnt' in col or 'disc_amt' in col]
    wls_clckstrm_features = [col for col in df.columns if 'wls' in col]
    data_usg_features = [col for col in df.columns if 'data_usg' in col]
    txt_usg_features = [col for col in df.columns if 'sms_actl_unit' in col]
    # NOTE(review): unlike the groups above this list is hard-coded. If any of
    # these columns was 100% empty it has already been dropped, and the
    # df[num_feats_flatten] selection below will raise KeyError. The same holds
    # for the other columns referenced by name further down.
    call_usg_features = ['total_airtime_dur_min_1m','total_calls_1m','total_ld_calls_1m','total_ld_us_og_calls_1m','total_ld_us_og_airtime_dur_min_1m','total_airtime_chrg_amt_1m',
                        'total_airtime_num_mins_to_rate_1m','ld_call_duration_dly_mins_1m','local_call_duration_dly_mins_1m','total_toll_dur_min_1m','total_toll_num_mins_to_rate_1m','total_toll_chrg_amt_1m',]
    memo_features = [col for col in df.columns if 'memo' in col]

    # Create plan_type feature and drop rows where price plan is not cellular

    # Assignments run top to bottom, so when a plan code appears in several
    # plan lists the last matching assignment wins.
    df['plan_type'] = 'other'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.Plan_5G), 'plan_type'] = '5G'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.Plan_5G_Plus), 'plan_type'] = '5G+'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.Unlimited_Plan), 'plan_type'] = 'Unlimited'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.POM_POMC), 'plan_type'] = 'POM_POMC'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.POM), 'plan_type'] = 'POM'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.POMC), 'plan_type'] = 'POMC'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(plan_df.Plan_4G), 'plan_type'] = 'Plan_4G'
    df.loc[df.pp_bus_pp_catlg_itm_cd.isin(whsia_plan_df.soc_cd), 'plan_type'] = 'WHSIA'

    # Remove watch / tablet / wireless-internet plans (case-insensitive match).
    # NOTE(review): rows with a missing pp_catlg_itm_nm presumably get dropped
    # too, because a NaN match result is not == False - confirm this is intended.
    df = df.loc[df.pp_catlg_itm_nm.str.contains('(?i)watch|tablet|wireless internet')==False].reset_index(drop=True)
    df = df.loc[df.plan_type!='WHSIA'].reset_index(drop=True)
    # Written as a negation so rows with a missing allowance are kept; they
    # are mean-filled below.
    df = df.loc[~(df.allowance_qty_normalized > 500)].reset_index(drop=True)

    # Fill in missing values
    df[['is_5g_capable', 'is_4g_capable']] = df[['is_5g_capable', 'is_4g_capable']].fillna('Unknown')
    # Days between proj_launch_dt1 and the moment this function runs, so the
    # value depends on the execution date.
    df['device_age_days'] = (datetime.today() - pd.to_datetime(df['proj_launch_dt1'])).dt.days

    df['total_completed_checkouts'] = df['total_completed_checkouts'].fillna(0)
    df['cnt_sub'] = df['cnt_sub'].fillna(1)

    # Mean imputation, using the mean of the rows that survived the filters.
    df['pp_recur_chrg_amt'] = df['pp_recur_chrg_amt'].fillna(df['pp_recur_chrg_amt'].mean())
    df['allowance_qty_normalized'] = df['allowance_qty_normalized'].fillna(df['allowance_qty_normalized'].mean())
    df['device_age_days'] = df['device_age_days'].fillna(df['device_age_days'].mean())

    # Every column of the feature groups above gets its missing values set to 0.
    num_feats = [ccai_features, marketing_msg_features, trip_bank_features, billing_features, wls_clckstrm_features, data_usg_features, txt_usg_features, call_usg_features, memo_features]
    num_feats_flatten = [col for num_feats_col in num_feats for col in num_feats_col]

    df[num_feats_flatten] = df[num_feats_flatten].fillna(0)
    # NOTE(review): cat_cols is only referenced by the commented-out WoE
    # encoding on the next line.
    cat_cols = ['is_5g_capable' ,'is_4g_capable', 'plan_type']
    #df[cat_cols + ['rpc_flag']], cat_cols_transformed_dict = woe_cat_columns(df=df[cat_cols + ['rpc_flag']],cat_columns=cat_cols)
    df['soc_effective_ts'] = df['soc_effective_ts'].astype('datetime64[ns]')
    
    # Known date columns: unparseable values become NaT.
    dbdate_cols = ['partition_dt', 'starting_date_data_collection' ,'ending_date_data_collection',
               'effective_dt' ,'proj_launch_dt1']

    for col in dbdate_cols:
        df[col] = pd.to_datetime(df[col], errors = 'coerce')
        
    # Any remaining object column whose NAME contains "ts" or "dt" is parsed as
    # a datetime as well.
    # NOTE(review): this is a plain substring test; an object column that only
    # happens to contain those letters would be coerced to NaT - verify.
    convert_ts_to_datetime = [col for col in df.columns if ("ts" in col or "dt" in col) and df.dtypes[col] == object]
    for col in convert_ts_to_datetime:
        df[col] = pd.to_datetime(df[col], errors = 'coerce')
        
    # Encode the e-bill indicator Y/N as 1/0.
    # NOTE(review): the int cast assumes every value is 'Y' or 'N'; a missing
    # or different value would make astype(int) fail - confirm upstream.
    df.loc[df.bacct_ebill_ind == 'Y', 'bacct_ebill_ind'] = 1
    df.loc[df.bacct_ebill_ind == 'N', 'bacct_ebill_ind'] = 0
    df.bacct_ebill_ind = df.bacct_ebill_ind.astype(int)
    return df

def downsampling(df, true_to_false_ratio, random_state=None):
    """Downsample the negative class (``rpc_flag == 0``) of ``df``.

    Negatives are sampled per *event*, an event being a unique
    ``(MSISDN, partition_dt)`` pair: either all negative rows of an event are
    kept or none. The fraction of events kept is
    ``(n_positive / true_to_false_ratio) / n_negative``, capped at 1 so that
    asking for more negatives than exist keeps all of them instead of failing.

    Args:
        df: DataFrame with ``rpc_flag``, ``MSISDN`` (string) and
            ``partition_dt`` columns. It is not modified.
        true_to_false_ratio: Desired positives-to-negatives ratio; must be > 0.
        random_state: Optional seed / RandomState forwarded to pandas
            ``sample``. ``None`` (default) uses numpy's global random state.

    Returns:
        New DataFrame with the sampled negative rows followed by every row
        whose ``rpc_flag`` is not 0; the original index labels are kept.

    Raises:
        ValueError: If ``true_to_false_ratio`` is not positive.
    """
    if true_to_false_ratio <= 0:
        raise ValueError("true_to_false_ratio must be positive")

    negatives = df[df["rpc_flag"] == 0]
    n_positive = int((df["rpc_flag"] == 1).sum())
    n_negative = len(negatives)

    expected_negatives = n_positive / true_to_false_ratio
    # Cap at 1.0: sampling without replacement cannot return more events than
    # exist. With no negatives at all there is nothing to sample.
    keep_fraction = min(expected_negatives / n_negative, 1.0) if n_negative else 0.0

    # Event ids are kept in standalone Series rather than written into a
    # filtered slice of df, which avoids SettingWithCopy problems.
    unique_events = negatives[["MSISDN", "partition_dt"]].drop_duplicates()
    unique_event_ids = unique_events["MSISDN"] + "#" + unique_events["partition_dt"].astype(str)
    sampled_event_ids = unique_event_ids.sample(
        int(len(unique_event_ids) * keep_fraction), random_state=random_state
    )

    negative_event_ids = negatives["MSISDN"] + "#" + negatives["partition_dt"].astype(str)
    sampled_negatives = negatives[negative_event_ids.isin(sampled_event_ids.values)]

    return pd.concat([sampled_negatives, df[df["rpc_flag"] != 0]])

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

telus_purple = '#4B286D'
telus_green = '#66CC00'
telus_grey = '#F4F4F7'
from google.cloud import storage
import matplotlib.pyplot as plt
from sklearn.metrics import *
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
from pycaret.classification import * 
import joblib
import os 
import pickle

def create_folder_if_not_exists(path):
    """
    Create the folder `path` (including missing parents) unless it already exists.

    The creation is attempted directly instead of checking os.path.exists
    first, so a folder created by someone else between check and creation can
    no longer make this function fail. Prints which of the two cases occurred.
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        print(f"Folder already exists: {path}")
    else:
        print(f"Folder created: {path}")

def ploty_model_metrics(actual, predicted,  plot=False):
    """
    Compute accuracy, precision, recall and F1 and draw them as a horizontal bar chart.

    Inputs:
    actual: true labels
    predicted: predicted labels
    plot: when True the figure is also displayed

    Returns (metrics_df, fig): a one-row DataFrame with the columns
    'Accuracy', 'Precision', 'Recall', 'F1_score', and the Plotly figure.
    """
    scores = {
        'Accuracy': accuracy_score(actual, predicted),
        'Precision': precision_score(actual, predicted),
        'Recall': recall_score(actual, predicted),
        'F1_score': f1_score(actual, predicted),
    }
    metrics_df = pd.DataFrame([scores])
    metric_values = metrics_df.iloc[0].values

    trace = go.Bar(x = metric_values,
                   y = list(metrics_df.columns),
                   # np.round, not np.round_: the alias was removed in NumPy 2.0.
                   text = np.round(metric_values, 4),
                   textposition = 'auto',
                   orientation = 'h',
                   opacity = 0.8,
                   marker=dict(
                               color=[telus_purple] * len(scores),
                               line=dict(color='#000000',width=1.5)
                           )
                  )
    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(title='Model Metrics')

    if plot:
        fig.show()
    return  metrics_df, fig

def plotly_confusion_matrix(actual, 
                            predicted, 
                            axis_labels='',
                            plot=False):
    """
    Build the confusion matrix and render it as a Plotly heatmap.

    Inputs:
    actual: true labels
    predicted: predicted labels
    axis_labels: tick labels for both axes. The default '' labels the classes
        '0', '1', ... by their position in the matrix.
    plot: when True the figure is also displayed

    Returns (cm, fig): the confusion matrix array and the Plotly figure.
    """
    cm = confusion_matrix(actual, predicted)

    if isinstance(axis_labels, str) and axis_labels == '':
        # Size the default labels from the matrix itself. Counting the unique
        # values of `actual` needed a pandas Series and produced too few
        # labels when `predicted` contained a class absent from `actual`.
        tick_labels = [str(position) for position in range(cm.shape[0])]
    else:
        tick_labels = axis_labels

    fig = px.imshow(cm, 
                text_auto=True,
                aspect='auto',
                color_continuous_scale = 'Blues',
                labels = dict(x = "Predicted Labels",
                              y= "Actual Labels"),
                x = tick_labels,
                y = tick_labels
                )
    if plot:
        fig.show()

    return cm, fig

def plotly_output_hist(actual,
                       prediction_probs,
                       plot=False
                      ):
    """
    Histogram (100 bins) of the prediction probabilities, coloured by true label.

    Inputs:
    actual: true labels, used as the colour grouping
    prediction_probs: predicted probabilities, plotted on the x axis
    plot: when True the figure is also displayed

    Returns the Plotly figure.
    """
    axis_names = {'color': 'True Labels', 'x': 'Prediction Probability'}
    histogram = px.histogram(x=prediction_probs,
                             color=actual,
                             nbins=100,
                             labels=axis_names)

    if plot:
        histogram.show()

    return histogram


def plotly_precision_recall(actual,
                            predictions_prob,
                            plot=False
                           ):
    """
    Draw the precision-recall curve as a filled Plotly line chart.

    Inputs:
    actual: true labels
    predictions_prob: predicted probabilities
    plot: when True the figure is also displayed

    Returns the Plotly figure (recall on x, precision on y).
    """
    # The thresholds returned alongside the curve are not needed for the plot.
    precision_values, recall_values, _ = precision_recall_curve(actual, predictions_prob)

    pr_curve = go.Scatter(x=recall_values,
                          y=precision_values,
                          mode='lines',
                          line=dict(color=telus_purple),
                          fill='tozeroy',
                          name='Precision-Recall curve')
    pr_layout = go.Layout(title='Precision-Recall Curve',
                          xaxis=dict(title='Recall'),
                          yaxis=dict(title='Precision'))
    fig = go.Figure(data=[pr_curve], layout=pr_layout)

    if plot:
        fig.show()

    return fig

def plotly_roc(actual,
                predictions_prob,
                plot=False
               ):
    """
    Build the ROC curve and a TPR/FPR-versus-threshold chart.

    Inputs:
    actual: true labels
    predictions_prob: predicted probabilities
    plot: when True both figures are also displayed

    Returns (fig_tpr_fpr, fig_roc, df, auc_score):
    fig_tpr_fpr: line chart of TPR and FPR at every threshold <= 1
    fig_roc: ROC curve with the chance diagonal, AUC shown in the title
    df: DataFrame indexed by threshold with the columns
        'False Positive Rate' and 'True Positive Rate' (thresholds <= 1 only)
    auc_score: area under the ROC curve
    """
    auc_score = roc_auc_score(actual, predictions_prob)
    fpr, tpr, thresholds  = roc_curve(actual, predictions_prob)

    df = pd.DataFrame({
                        'False Positive Rate': fpr,
                        'True Positive Rate': tpr
                      }, 
                        index=thresholds)
    df.index.name = "Thresholds"
    df.columns.name = "Rate"
    # Keep only thresholds that are valid probabilities; this removes any
    # threshold above 1 returned by roc_curve.
    df = df.loc[df.index <= 1]

    fig_tpr_fpr = px.line(
                        df, 
                        title='TPR and FPR at every threshold',
                    )

    # ROC Curve with AUC
    trace = go.Scatter(
                x=fpr,
                y=tpr,
                mode='lines',
                line=dict(color=telus_purple),
                fill='tozeroy',
                # Trace name fixed: it was copy-pasted as 'Precision-Recall curve'.
                name='ROC curve'
            )
    layout = go.Layout(
                        title=f'ROC Curve (AUC={auc_score:.4f})',
                        xaxis=dict(title='False Positive Rate'),
                        yaxis=dict(title='True Positive Rate')
                    )
    fig_roc = go.Figure(data=[trace], layout=layout)

    # Dashed diagonal = performance of a random classifier.
    fig_roc.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig_roc.update_xaxes(constrain='domain')

    if plot:
        fig_tpr_fpr.show()
        fig_roc.show()

    return fig_tpr_fpr, fig_roc, df, auc_score

def plotly_lift_curve(actual,
                      predictions_prob,
                      step=0.01,
                      plot=False
                     ):
    """
    Draw the lift curve: for each top-x share of the sample (ranked by
    predicted probability) the positive rate in that share divided by the
    overall positive rate.

    Inputs:
    actual: true labels (0/1)
    predictions_prob: predicted probabilities
    step: spacing of the evaluated sample proportions
    plot: when True the figure is also displayed

    Returns the Plotly figure, including a dashed reference line at lift = 1.
    """
    # Pair labels with scores and rank by score, highest first.
    ranked = pd.DataFrame()
    ranked['real'] = actual
    ranked['predicted'] = predictions_prob
    ranked = ranked.sort_values('predicted', ascending=False)

    sample_size = len(ranked)
    overall_positive_rate = ranked['real'].sum() / sample_size

    def _lift_at(proportion):
        # ceil so that even the smallest proportion covers at least one row
        top_rows = ranked.iloc[:int(np.ceil(proportion * sample_size)), :]
        return (top_rows['real'].sum() / len(top_rows)) / overall_positive_rate

    # X axis: sample proportions; Y axis: lift at each proportion.
    proportions = np.arange(step, 1 + step, step)
    lift_values = [_lift_at(proportion) for proportion in proportions]

    # Lift Curve
    lift_trace = go.Scatter(x=proportions,
                            y=lift_values,
                            mode='lines',
                            line=dict(color=telus_purple),
                            name='Lift Curve')
    lift_layout = go.Layout(title='Lift Curve',
                            xaxis=dict(title='Proportion of Sample'),
                            yaxis=dict(title='Lift'))
    fig_lift = go.Figure(data=[lift_trace], layout=lift_layout)

    fig_lift.add_shape(type='line',
                       line=dict(dash='dash'),
                       x0=0, x1=1, y0=1, y1=1)
    fig_lift.update_xaxes(constrain='domain')

    if plot:
        fig_lift.show()

    return fig_lift

def plotly_feature_importance(model,
                              columns,
                              plot=False,
                              top_n=15):
    """
    Rank the model's feature importances and chart the most important ones.

    Inputs:
    model: fitted estimator exposing `feature_importances_`
    columns: feature names, positionally matching `feature_importances_`
    plot: when True the figure is also displayed
    top_n: number of top features drawn in the bar chart (default 15)

    Returns (coef_sumry, fig_feats):
    coef_sumry: DataFrame with the columns 'coefficients' and 'features',
        sorted by importance in descending order (all features, not only top_n)
    fig_feats: horizontal bar chart of the top_n features
    """
    coefficients  = pd.DataFrame(model.feature_importances_)
    column_data   = pd.DataFrame(columns)
    # Join importances and names on their positional index.
    coef_sumry    = (pd.merge(coefficients,column_data,left_index= True,
                              right_index= True, how = "left"))

    coef_sumry.columns = ["coefficients","features"]
    coef_sumry = coef_sumry.sort_values(by = "coefficients",ascending = False)

    # Reverse the top rows so the most important feature is drawn at the top
    # of the horizontal bar chart.
    top_features = coef_sumry.head(top_n).iloc[::-1]
    trace= go.Bar(y = top_features["features"],
                  x = top_features["coefficients"],
                  name = "coefficients",
                  # Colours come from the same rows as the bars. The full
                  # descending column was used before, which paired each bar
                  # with the colour of a different feature.
                  marker = dict(color = top_features["coefficients"],
                                colorscale = "Viridis",
                                line = dict(width = .6,color = "black")
                               ),
                  orientation='h'
                 )
    layout = go.Layout(
                        title='Feature Importance',
                        yaxis=dict(title='Features')

                    )
    fig_feats = go.Figure(data=[trace], layout=layout)

    if plot:
        fig_feats.update_yaxes(automargin=True)
        fig_feats.show()
    return coef_sumry, fig_feats

def plotly_model_report(model,
                        actual,
                        predicted,
                        predictions_prob,
                        bucket_name,
                        show_report = True,
                        columns=[],
                        save_path = ''
                       ):
    """
    Build a 4x2 Plotly performance report for one model, save it as HTML,
    upload it to GCS and return a one-row summary of the results.

    Inputs:
    model: fitted estimator; its class name is used in titles and file names
    actual: true labels
    predicted: predicted labels
    predictions_prob: predicted probabilities
    bucket_name: GCS bucket the HTML report is uploaded to
    show_report: when True the report figure is also displayed
    columns: feature names for the feature-importance panel (never mutated
        here). The panel is left empty when the importance chart cannot be
        built, e.g. the model has no `feature_importances_`.
    save_path: prefix for the local HTML file; the same string is used as the
        blob name prefix in the bucket

    Returns (results_df_combined, report_fig):
    results_df_combined: one-row DataFrame with date, model name, estimator
        type, confusion matrix, classification report, AUC, feature
        importances (0 when unavailable) plus the metric columns
    report_fig: the combined Plotly figure
    """
    model_name = model.__class__.__name__
    print(model_name)

    metrics_df, fig_metrics = ploty_model_metrics(actual, predicted, plot=False)
    cm, fig_cm = plotly_confusion_matrix(actual, predicted, axis_labels='', plot=False)
    fig_hist = plotly_output_hist(actual, prediction_probs=predictions_prob, plot=False)
    fig_pr = plotly_precision_recall(actual, predictions_prob, plot=False)
    fig_tpr_fpr, fig_roc, _, auc_score = plotly_roc(actual, predictions_prob, plot=False)

    # Feature importance is best effort: not every estimator exposes it.
    # fig_feats stays None in that case and the panel is skipped below.
    fig_feats = None
    try:
        coefs_df, fig_feats = plotly_feature_importance(model=model,
                                                        columns=columns,
                                                        plot=False)
        coefs_df = coefs_df.to_dict()
    except Exception:
        coefs_df = 0

    fig_lift = plotly_lift_curve(actual, predictions_prob, step=0.01, plot=False)
    # Figure out how to put this into report on Monday -> Not Urgent
    cr = classification_report(actual, predicted, output_dict=True)

    # Generate dataframe with summary of results in one row
    todays_date = datetime.now().strftime("%Y-%m-%d")
    results_cols = ['date', 'model_name', 'estimator_type',
                    'confusion_matrix','classification_report',
                    'auc_score', 'feature_importances']
    results_list = [todays_date, model_name, model._estimator_type,
                    cm, cr,
                    auc_score, coefs_df
                   ]

    results_df_combined = pd.concat([pd.DataFrame([results_list], columns=results_cols),
                                     metrics_df],
                                   axis=1)

    # Generate Plotly page report
    report_fig = make_subplots(rows=4,
                            cols=2,
                            print_grid=False,
                            specs=[[{}, {}],
                                 [{}, {}],
                                 [{}, {}],
                                 [{}, {}],
                                 ],
                            subplot_titles=('Confusion Matrix',
                                        'Model Metrics',
                                        'Probability Output Histogram',
                                        'Precision - Recall curve',
                                        'TPR & FPR Vs. Threshold',
                                        f'ROC Curve: AUC Score {np.round(auc_score, 3)}',
                                        'Feature importance',
                                        'Lift Curve'
                                        )
                            )

    # add_trace(row=, col=) is the supported replacement for append_trace.
    report_fig.add_trace(fig_cm.data[0], row=1, col=1)
    report_fig.update_coloraxes(showscale=False)
    report_fig.add_trace(fig_metrics.data[0], row=1, col=2)

    # One histogram trace per true label present; iterating avoids an
    # IndexError when `actual` contains a single class only.
    for hist_trace in fig_hist.data:
        report_fig.add_trace(hist_trace, row=2, col=1)
    report_fig.add_trace(fig_pr.data[0], row=2, col=2)

    # Two traces: false positive rate and true positive rate.
    report_fig.add_trace(fig_tpr_fpr.data[0], row=3, col=1)
    report_fig.add_trace(fig_tpr_fpr.data[1], row=3, col=1)
    report_fig.add_trace(fig_roc.data[0], row=3, col=2)

    if fig_feats is not None:
        report_fig.add_trace(fig_feats.data[0], row=4, col=1)
    report_fig.add_trace(fig_lift.data[0], row=4, col=2)

    title_str = f"{model_name} : Model performance report"
    report_fig['layout'].update(title = f'<b>{title_str}</b><br>',
                    autosize = True, height = 1500,width = 1200,
                    plot_bgcolor = 'rgba(240,240,240, 0.95)',
                    paper_bgcolor = 'rgba(240,240,240, 0.95)',
                    margin = dict(b = 195))

    # Axis titles per subplot (axes are numbered row by row, left to right).
    axis_titles = {
        "xaxis1": "Predicted Labels",
        "yaxis1": "Actual Labels",
        "xaxis3": "Prediction Probabilities",
        "yaxis3": "Count",
        "xaxis4": "Recall",
        "yaxis4": "Precision",
        "xaxis5": "Thresholds",
        "yaxis5": "Rate",
        "xaxis6": "False Positive Rate",
        "yaxis6": "True Positive Rate",
        "yaxis7": "Features",
        "xaxis8": "Proportion of Sample",
        "yaxis8": "Lift",
    }
    for axis_key, axis_title in axis_titles.items():
        report_fig["layout"][axis_key].update(dict(title = axis_title))

    if show_report:
        report_fig.show()

    # Save html report locally, then upload it under the same name to GCS.
    filename = f"{todays_date}_{model_name}.html"
    report_path = f"{save_path}{filename}"
    report_fig.write_html(report_path)
    bucket = storage.Client().bucket(bucket_name)
    blob = bucket.blob(report_path)
    blob.upload_from_filename(report_path)
    print(f"{filename} sucessfully uploaded to GCS bucket!")
    # Return dataframe with metrics
    return results_df_combined,report_fig

    

def evaluate_and_save_models(models,bucket_name, save_path, test_df, actual_label_str, columns, save_columns=False, show_report=False):
    """
    Save each model, score it on the test set, build its Plotly report and
    upload all artefacts to a GCS bucket.

    Inputs:
    models: a single model or a list/tuple of models
    bucket_name: GCS bucket that receives the uploads
    save_path: local folder prefix (created if missing); the same string is
        used as the blob name prefix in the bucket
    test_df: test data passed to pycaret's predict_model
    actual_label_str: name of the true-label column in the predictions
    columns: feature names; pickled when save_columns is True and forwarded to
        the report for the feature-importance panel
    save_columns: when True, pickle `columns` and upload the pickle
    show_report: when True, display each model's report figure

    Returns (model_test_set_reports_concat, model_to_report_map):
    model_test_set_reports_concat: one summary row per model (also written to
        '<date>_model_reports.csv' and uploaded)
    model_to_report_map: dict of model class name -> report figure

    Raises ValueError when no model is given.

    Note: files and map entries are keyed by model class name, so two models
    of the same class overwrite each other.
    """
    # A tuple of models used to be wrapped as ONE model; accept it as a sequence.
    if not isinstance(models, (list, tuple)):
        models = [models]
    if not models:
        raise ValueError("No models to evaluate")

    # define_the_bucket
    bucket = storage.Client().bucket(bucket_name)
    # One date for the whole run, so every artefact carries the same stamp.
    run_date = datetime.now().strftime("%Y-%m-%d")
    model_test_set_reports = []
    model_to_report_map = {}

    create_folder_if_not_exists(save_path)

    if save_columns:
        columns_path = f"{save_path}{run_date}_columns.pkl"
        with open(columns_path, "wb") as f:
            pickle.dump(columns, f)
        print(f"Columns saved as {columns_path} !")
        bucket.blob(columns_path).upload_from_filename(columns_path)
        print(f"{save_path}/{run_date}_columns.pkl sucessfully uploaded to GCS bucket!")

    for model in models:
        print(model)
        model_name = model.__class__.__name__

        # Save the model; the '.pkl' file at model_file_name is then uploaded.
        model_file_name = f"{save_path}{model_name}_{run_date}"
        save_model(model, model_file_name)
        filename = f"{model_name}_{run_date}.pkl"
        bucket.blob(f"{save_path}{filename}").upload_from_filename(f"{model_file_name}.pkl")
        print(f"{filename} sucessfully uploaded to GCS bucket!")

        # Get predictions on test set for model
        predictions = predict_model(model, data=test_df)
        # Normalize prediction probabilities: rows labelled 0 get 1 - Score.
        # presumably Score is the probability of the predicted label, which
        # would make Score_Normalized the probability of class 1 - confirm
        # against the pycaret version in use.
        predictions['Score_Normalized'] = predictions['Score']
        predictions.loc[predictions['Label'] == 0, 'Score_Normalized'] = 1 - predictions['Score']
        predictions_prob = predictions["Score_Normalized"].astype(float)

        actual = predictions[actual_label_str].astype(int)
        predicted = predictions["Label"].astype(int)

        # Pass data to generate plotly_report
        report_df, report_fig = plotly_model_report(model=model,
                                                    actual=actual,
                                                    predicted=predicted,
                                                    predictions_prob=predictions_prob,
                                                    bucket_name=bucket_name,
                                                    show_report=show_report,
                                                    columns=columns,
                                                    save_path=save_path)
        model_to_report_map[model_name] = report_fig
        model_test_set_reports.append(report_df)

    model_test_set_reports_concat = pd.concat(model_test_set_reports)
    filename = f"{run_date}_model_reports.csv"
    reports_path = f"{save_path}{filename}"
    model_test_set_reports_concat.to_csv(reports_path, index=False)

    bucket.blob(reports_path).upload_from_filename(reports_path)
    print(f"{filename} sucessfully uploaded to GCS bucket!")
    return model_test_set_reports_concat,model_to_report_map

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from pycaret.classification import * 

def plot_lift_curve(y_val, y_pred,  step=0.01):
    """
    Plot the lift curve with matplotlib and show it.

    For each top-x share of the sample (ranked by predicted probability) the
    lift is the positive rate inside that share divided by the overall
    positive rate. A horizontal line at lift = 1 is drawn for reference.

    Inputs:
    y_val: true labels (0/1)
    y_pred: predicted probabilities
    step: spacing of the evaluated sample proportions

    Returns None; the figure is displayed with plt.show().
    """
    # Define an auxiliary dataframe pairing true labels and predictions,
    # ordered by predicted probability (highest first).
    aux_lift = pd.DataFrame()
    aux_lift['real'] = y_val
    aux_lift['predicted'] = y_pred
    aux_lift.sort_values('predicted',ascending=False,inplace=True)

    # Values for the X axis: sample proportions
    x_val = np.arange(step,1+step,step)
    # Ratio of ones in the whole data
    ratio_ones = aux_lift['real'].sum() / len(aux_lift)
    y_v = []

    # Calculate for each x value its corresponding y value
    for x in x_val:
        # ceil so that even the smallest proportion covers at least one row
        num_data = int(np.ceil(x*len(aux_lift)))
        data_here = aux_lift.iloc[:num_data,:]
        ratio_ones_here = data_here['real'].sum()/len(data_here)
        y_v.append(ratio_ones_here / ratio_ones)

    # Plot the figure. The former `fig.figsize = (40,40)` assignment was
    # dropped: it only set an unused attribute and never resized the figure.
    fig, axis = plt.subplots()
    axis.plot(x_val, y_v, 'g-', linewidth = 3, markersize = 5)
    axis.plot(x_val, np.ones(len(x_val)), 'k-')
    axis.set_xlabel('Proportion of sample')
    axis.set_ylabel('Lift')
    axis.set_title('Lift Curve')
    plt.show()


def prediction_metrics_plots(df, model, display_output=True, actual_label_str=''):
    """Score ``df`` with ``model`` and collect classification metrics.

    Predictions are obtained with pycaret's ``predict_model``; the ``Label``
    and ``Score`` columns of its output are read. ``Score`` is converted into
    a positive-class score: rows labelled 0 get ``1 - Score``.

    Args:
        df: Data passed to ``predict_model``; the prediction output must
            contain the column named by ``actual_label_str``.
        model: Fitted estimator. ``model._estimator_type`` and the class name
            are recorded in the result.
        display_output: When True, the precision-recall curve, ROC curve, lift
            curve, classification report and confusion matrix are displayed.
        actual_label_str: Name of the column holding the actual labels.

    Returns:
        A one-row ``pd.DataFrame`` with the run date, model name, estimator
        type, actual/predicted labels, normalized scores, AUC, F1, recall,
        precision, accuracy, classification report (dict) and confusion matrix.

    Raises:
        KeyError: If ``actual_label_str`` is not a column of the predictions.
    """
    print(model.__class__.__name__)

    # Get predictions using pycaret predict_model function
    predictions = predict_model(model, data=df)

    if actual_label_str not in predictions.columns:
        raise KeyError(
            f"actual_label_str={actual_label_str!r} is not a column of the prediction output"
        )

    # Normalize prediction probabilities so they always refer to class 1
    predictions['Score_Normalized'] = predictions['Score']
    predictions.loc[predictions['Label'] == 0, 'Score_Normalized'] = 1 - predictions['Score']
    predictions_prob = predictions["Score_Normalized"].astype(float)

    actual = predictions[actual_label_str].astype(float)
    predicted = predictions["Label"].astype(float)

    # Precision - Recall
    prec, recall, threshold = precision_recall_curve(actual, predictions_prob)
    baseline = len(actual[actual == 1]) / len(actual)

    if display_output:
        plt.clf()
        print("Precision Recall Curve")
        plt.plot([0, 1], [baseline, baseline], linestyle='--', label='baseline')
        plt.plot(recall, prec, marker='.', label='Model')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend()
        plt.show()

    # ROC Curve: a constant score gives the baseline curve
    baseline_prob = [0 for _ in range(len(actual))]
    baseline_fpr, baseline_tpr, _ = roc_curve(actual, baseline_prob)

    auc_score = roc_auc_score(actual, predictions_prob)
    fpr, tpr, _ = roc_curve(actual, predictions_prob)

    if display_output:
        print("ROC Curve")
        plt.plot(baseline_fpr, baseline_tpr, linestyle='--', label='Baseline')
        plt.plot(fpr, tpr, marker='.', label='Model ==>  AUC : {} '.format(auc_score))
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend()
        plt.show()

    # Lift Curve
    if display_output:
        print("lift curve")
        plot_lift_curve(actual, predictions_prob)

    # Other metrics
    f1_score_ = f1_score(actual, predicted)
    recall_score_ = recall_score(actual, predicted)
    acc_score_ = accuracy_score(actual, predicted)
    pr_score_ = precision_score(actual, predicted)

    # Classification Report
    cr = classification_report(actual, predicted, output_dict=True)
    if display_output:
        display(pd.DataFrame(cr))

    # Confusion Matrix
    cm = confusion_matrix(actual, predicted)
    if display_output:
        print(cm)
        ax = plt.subplot()
        sns.heatmap(cm, annot=True, fmt='g', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        # Show the heatmap now: left pending, the plt.clf() at the start of
        # the next call would clear it before it is ever rendered.
        plt.show()

    results_cols = ['date', 'model_name', 'estimator_type', 'actual_label', 'predicted_label',
                    'score_normalized', 'auc', 'f1', 'recall', 'precision', 'accuracy',
                    'classification_report', 'confusion_matrix']

    results_list = [datetime.now().strftime("%Y-%m-%d"),
                    model.__class__.__name__,
                    model._estimator_type,
                    actual,
                    predicted,
                    predictions_prob,
                    auc_score,
                    f1_score_,
                    recall_score_,
                    pr_score_,
                    acc_score_,
                    cr,
                    cm]

    results_df = pd.DataFrame([results_list], columns=results_cols)

    return results_df

    


### Parameters

In [None]:
# tag cell with parameters
# Placeholder values; presumably overridden by a notebook runner that injects
# parameters after this tagged cell -- TODO confirm.
PROJECT_ID = ""
BUCKET_NAME = ""
DATASET_ID = ""
RESOURCE_BUCKET = ""
FILE_BUCKET = ""
REGION = ""
MODEL_ID = "5090"

In [None]:
# tag cell with parameters
# Concrete project settings; these assignments replace the empty placeholders
# of the same names when the cells run in order.
PROJECT_ID = "divg-josh-pr-d1cc3a"
DATASET_ID = "call_to_retention_dataset"
MODEL_ID = "5090"

# All three bucket settings point at the project's default bucket
BUCKET_NAME = "divg-josh-pr-d1cc3a-default"
RESOURCE_BUCKET = "divg-josh-pr-d1cc3a-default"
FILE_BUCKET = "divg-josh-pr-d1cc3a-default"

### Service Parameters

In [None]:
# Service identifiers: underscore form and hyphenated form of the same name
SERVICE_TYPE = "call_to_retention"
SERVICE_TYPE_NAME = "call-to-retention"

# BigQuery table holding the targets, and the GCP region used by the service
TABLE_ID = "bq_call_to_retention_targets"
REGION = "northamerica-northeast1"

### Pulumi Parameters

In [None]:
# Stack-level settings
STACK_NAME = "call_to_retention"
REGION = "northamerica-northeast1"

# Training pipeline
TRAIN_PIPELINE_NAME_PATH = "train_pipeline"
TRAIN_PIPELINE_NAME = "call-to-retention-train-pipeline"  # Same name as pulumi.yaml
TRAIN_PIPELINE_DESCRIPTION = "call-to-retention-train-pipeline"

# Prediction pipeline
PREDICT_PIPELINE_NAME_PATH = "predict_pipeline"
PREDICT_PIPELINE_NAME = "call-to-retention-predict-pipeline"  # Same name as pulumi.yaml
PREDICT_PIPELINE_DESCRIPTION = "call-to-retention-predict-pipeline"

### Query + Pre-Processing Component Parameters

In [None]:
# Locations under the training pipeline folder (queries path keeps its
# trailing slash, utils path has none)
_TRAIN_PIPELINE_ROOT = f"{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}"
TRAIN_QUERIES_PATH = f"{_TRAIN_PIPELINE_ROOT}/queries/"
TRAIN_UTILS_FILE_PATH = f"{_TRAIN_PIPELINE_ROOT}/utils"
UTILS_FILENAME = "utils.py"

# Serving data table names
INPUT_SERVING_DATA_TABLENAME = "input_serving_data"
PROCESSED_SERVING_DATA_TABLENAME = "processed_serving_data"

# Yesterday's date, formatted YYYY-MM-DD
_yesterday = date.today() - relativedelta(days=1)
QUERY_DATE = _yesterday.strftime("%Y-%m-%d")

# Fully qualified reference of the targets table: project.dataset.table
TARGET_TABLE_REF = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

QUERIES_PATH = "call_to_retention/queries/"


### Import Pipeline Components

In [None]:
# Download the pipeline component files from the resource bucket into a local
# "components/" folder so that they can be imported below.
prefix = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # Get list of files
for blob in blobs:  # download each file that starts with "prefix" into "dl_dir"
    if blob.name.endswith("/"):
        # Folder placeholder object, nothing to download
        continue
    # Every listed name starts with the prefix, so slice it off. Splitting on
    # the prefix would keep only the last piece when the same text shows up
    # again deeper in the object name.
    relative_name = blob.name[len(prefix):]
    file_path = f"{dl_dir}{relative_name}"
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# import main pipeline components
import components


### Date Parameters

In [None]:
# Reference dates are hard-coded for this run
scoringDate = date(2022, 9, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
valScoringDate = date(2022, 10, 1)  # scoringDate - relativedelta(days=2)

_COMPACT_FMT = '%Y%m%d'
_DASH_FMT = '%Y-%m-%d'

# training dates
_train_month_start = scoringDate.replace(day=1)
_train_prev_month_end = _train_month_start - timedelta(days=1)  # last day of the previous month

SCORE_DATE = scoringDate.strftime(_COMPACT_FMT)  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = scoringDate.strftime(_DASH_FMT)
SCORE_DATE_MINUS_6_MOS_DASH = (scoringDate - relativedelta(months=6)).replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_LAST_MONTH_START_DASH = _train_prev_month_end.replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_LAST_MONTH_END_DASH = _train_prev_month_end.strftime(_DASH_FMT)
PROMO_EXPIRY_START = (_train_month_start + relativedelta(months=3)).replace(day=1).strftime(_DASH_FMT)
PROMO_EXPIRY_END = (_train_month_start + relativedelta(months=4)).replace(day=1).strftime(_DASH_FMT)

# validation dates (same derivations, based on valScoringDate)
_val_month_start = valScoringDate.replace(day=1)
_val_prev_month_end = _val_month_start - timedelta(days=1)

SCORE_DATE_VAL = valScoringDate.strftime(_COMPACT_FMT)
SCORE_DATE_VAL_DASH = valScoringDate.strftime(_DASH_FMT)
SCORE_DATE_VAL_MINUS_6_MOS_DASH = (valScoringDate - relativedelta(months=6)).replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_VAL_LAST_MONTH_START_DASH = _val_prev_month_end.replace(day=1).strftime(_DASH_FMT)
SCORE_DATE_VAL_LAST_MONTH_END_DASH = _val_prev_month_end.strftime(_DASH_FMT)
PROMO_EXPIRY_START_VAL = (_val_month_start + relativedelta(months=3)).replace(day=1).strftime(_DASH_FMT)
PROMO_EXPIRY_END_VAL = (_val_month_start + relativedelta(months=4)).replace(day=1).strftime(_DASH_FMT)

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


### bq_create_dataset.py

In [None]:
import kfp
from kfp import dsl
# from kfp.v2.dsl import (Model, Input, component)
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,HTML,
                        OutputPath, ClassificationMetrics, Metrics, component)
from typing import NamedTuple
# Create Training Dataset for training pipeline
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
    output_component_file="bq_create_dataset.yaml",
)
def bq_create_dataset(score_date: str,
                      score_date_delta: int,
                      project_id: str,
                      dataset_id: str,
                      region: str,
                      promo_expiry_start: str, 
                      promo_expiry_end: str, 
                      v_start_date: str,
                      v_end_date: str) -> NamedTuple("output", [("col_list", list)]):
    """Build the pipeline dataset in BigQuery and return its column names.

    Calls the ``bq_sp_ctr_pipeline_dataset`` stored procedure of
    ``dataset_id``, logs the partition metadata of ``bq_ctr_pipeline_dataset``
    and then reads the column names of that table.

    Args:
        score_date: Value of the ``score_date`` DATE variable of the script.
        score_date_delta: Not used by the current implementation.
        project_id: Project the BigQuery client is created for.
        dataset_id: Dataset holding the stored procedure and the output table.
        region: Not used by the current implementation.
        promo_expiry_start: Value of the ``promo_expiry_start`` DATE variable.
        promo_expiry_end: Value of the ``promo_expiry_end`` DATE variable.
        v_start_date: Value of the ``start_date`` DATE variable.
        v_end_date: Value of the ``end_date`` DATE variable.

    Returns:
        A one-element tuple ``(col_list,)`` with the column names of
        ``bq_ctr_pipeline_dataset``.
    """
    # Everything the body needs is imported here so that the function does
    # not rely on names from the enclosing notebook.
    import logging
    import os
    from datetime import datetime

    import google.oauth2.credentials
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud access token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Only build the token-based client here: constructing a default
        # client first needs default credentials the caller may not have.
        token = os.popen('gcloud auth print-access-token').read().strip()
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)
    job_config = bigquery.QueryJobConfig()

    # NOTE(review): the values are interpolated straight into the script text;
    # they are assumed to be trusted pipeline parameters.
    # Change dataset / table + sp table name to version in bi-layer
    query =\
        f'''
            DECLARE score_date DATE DEFAULT "{score_date}";
            DECLARE promo_expiry_start DATE DEFAULT "{promo_expiry_start}";
            DECLARE promo_expiry_end DATE DEFAULT "{promo_expiry_end}";
            DECLARE start_date DATE DEFAULT "{v_start_date}";
            DECLARE end_date DATE DEFAULT "{v_end_date}";
        
            -- Change dataset / sp name to the version in the bi_layer
            CALL {dataset_id}.bq_sp_ctr_pipeline_dataset(score_date, promo_expiry_start, promo_expiry_end, start_date, end_date);

            SELECT
                *
            FROM {dataset_id}.INFORMATION_SCHEMA.PARTITIONS
            WHERE table_name='bq_ctr_pipeline_dataset'
            
        '''

    df = client.query(query, job_config=job_config).to_dataframe()
    logging.info(df.to_string())

    if df.empty:
        logging.warning("No partition metadata returned for bq_ctr_pipeline_dataset")
    else:
        # Report the first metadata row
        logging.info(
            f"Loaded {df.total_rows.iloc[0]} rows into "
            f"{df.table_catalog.iloc[0]}.{df.table_schema.iloc[0]}.{df.table_name.iloc[0]} on "
            f"{datetime.strftime(df.last_modified_time.iloc[0], '%Y-%m-%d %H:%M:%S')} !"
        )

    ######################################## Save column list ##########################
    # Only the column names are needed, so no rows are fetched (LIMIT 0).
    query =\
        f'''
           SELECT
                *
            FROM {dataset_id}.bq_ctr_pipeline_dataset
            LIMIT 0
        '''

    df = client.query(query, job_config=job_config).to_dataframe()

    col_list = list(df.columns)
    return (col_list,)
    

### Preprocess

In [None]:
def preprocess(
        pipeline_dataset: str, 
        save_data_path: str,
        project_id: str,
        dataset_id: str
):
    """Load the pipeline dataset from BigQuery, derive features and save a gzip CSV.

    The table ``project_id.dataset_id.pipeline_dataset`` is read in full and
    indexed by ``ban``. Urban/rural/family flags are derived from
    ``demo_sgname`` / ``demo_lsname``, ``demo_lsname`` is one-hot encoded, the
    two source columns are dropped and spaces/hyphens in column names are
    replaced by underscores. The result is written to ``save_data_path``.

    Args:
        pipeline_dataset: Name of the BigQuery table to read.
        save_data_path: Destination of the gzip-compressed CSV.
        project_id: Project of the table and of the BigQuery client.
        dataset_id: Dataset of the table.

    Returns:
        None.
    """
    import gc
    import os
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud access token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Only build the token-based client here: constructing a default
        # client first needs default credentials the caller may not have.
        token = os.popen('gcloud auth print-access-token').read().strip()
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    # pipeline_dataset
    pipeline_dataset_name = f"{project_id}.{dataset_id}.{pipeline_dataset}"
    build_df_pipeline_dataset = f'SELECT * FROM `{pipeline_dataset_name}`'
    df_pipeline_dataset = client.query(build_df_pipeline_dataset).to_dataframe()
    df_pipeline_dataset = df_pipeline_dataset.set_index('ban')

    # demo columns: 1 when the keyword occurs in the (lower-cased) segment
    # name, 0 otherwise (missing names count as 0)
    df_pipeline_dataset['demo_urban_flag'] = df_pipeline_dataset.demo_sgname.str.lower().str.contains('urban').fillna(0).astype(int)
    df_pipeline_dataset['demo_rural_flag'] = df_pipeline_dataset.demo_sgname.str.lower().str.contains('rural').fillna(0).astype(int)
    df_pipeline_dataset['demo_family_flag'] = df_pipeline_dataset.demo_lsname.str.lower().str.contains('families').fillna(0).astype(int)

    # One-hot encode demo_lsname with identifier-friendly column names
    df_income_dummies = pd.get_dummies(df_pipeline_dataset[['demo_lsname']])
    df_income_dummies.columns = df_income_dummies.columns.str.replace('&', 'and')
    df_income_dummies.columns = df_income_dummies.columns.str.replace(' ', '_')

    df_pipeline_dataset.drop(columns=['demo_sgname', 'demo_lsname'], inplace=True)

    # The join already returns a new frame, so no extra copies are made
    df_final = df_pipeline_dataset.join(df_income_dummies)
    del df_pipeline_dataset, df_income_dummies
    gc.collect()

    # column name clean-up
    df_final.columns = df_final.columns.str.replace(' ', '_')
    df_final.columns = df_final.columns.str.replace('-', '_')
    print('......df_final done')

    # Rebuild every column from a plain Python list.
    # NOTE(review): presumably done to normalise the dtypes returned by
    # BigQuery before writing the CSV -- confirm before removing.
    for f in df_final.columns:
        df_final[f] = list(df_final[f])

    df_final.to_csv(save_data_path, index=True, compression='gzip')
    del df_final
    gc.collect()
    print(f'......csv saved in {save_data_path}')
    # NOTE(review): fixed two-minute wait kept from the original; its purpose
    # is not visible here -- confirm it is still required.
    time.sleep(120)


### Train and Save Model 

In [None]:
def train_and_save_model(
            file_bucket: str,
            service_type: str,
            score_date_dash: str,
            score_date_val_dash: str,
            project_id: str,
            dataset_id: str
):
    """Train an XGBoost classifier and store the model and its lift table in GCS.

    Reads ``{service_type}_train.csv.gz`` and ``{service_type}_validation.csv.gz``
    from ``file_bucket``, attaches the ``target_ind`` values of
    ``bq_call_to_retention_targets`` for the month of the respective score
    date, fits the classifier on a stratified 80% sample of the training data
    and computes a decile lift table on the validation file. The lift table
    (CSV) and a pickled dict with ``create_time``, ``model`` and ``features``
    are uploaded to ``file_bucket``.

    Args:
        file_bucket: GCS bucket holding the input CSVs and receiving outputs.
        service_type: Prefix of the input files and of the model folder.
        score_date_dash: Training score date as ``YYYY-MM-DD``; its year and
            month select the training targets.
        score_date_val_dash: Validation score date as ``YYYY-MM-DD``; its year
            and month select the validation targets.
        project_id: Project of the BigQuery client and targets table.
        dataset_id: Dataset of the targets table.

    Returns:
        None.
    """
    import gc
    import pickle
    import time
    from datetime import datetime

    import numpy as np
    import pandas as pd
    import xgboost as xgb
    from google.cloud import bigquery
    from google.cloud import storage
    from sklearn.model_selection import train_test_split

    def get_lift(prob, y_test, q):
        """Return a per-quantile table of mean predicted rate, mean actual rate and lift."""
        result = pd.DataFrame(columns=['Prob', 'Call_To_Retention'])
        result['Prob'] = prob
        result['Call_To_Retention'] = y_test
        # Labels run q..1, so the highest scores fall into "Decile" 1
        result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
        add = pd.DataFrame(result.groupby('Decile')['Call_To_Retention'].mean()).reset_index()
        add.columns = ['Decile', 'avg_real_call_to_retention_rate']
        result = result.merge(add, on='Decile', how='left')
        result.sort_values('Decile', ascending=True, inplace=True)
        lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
        lg.columns = ['Decile', 'avg_model_pred_call_to_retention_rate']
        lg.sort_values('Decile', ascending=False, inplace=True)
        lg['avg_call_to_retention_rate_total'] = result['Call_To_Retention'].mean()
        lg = lg.merge(add, on='Decile', how='left')
        lg['lift'] = lg['avg_real_call_to_retention_rate'] / lg['avg_call_to_retention_rate_total']

        return lg

    def attach_target(df_features, df_targets, date_dash):
        """Left-join the month's targets onto the features and keep rows that have one."""
        year_month = '-'.join(date_dash.split('-')[:2])  # 'YYYY-MM-DD' -> 'YYYY-MM'
        # .copy() so the column assignment below does not write into a slice
        df_month = df_targets.loc[df_targets['YEAR_MONTH'] == year_month].copy()
        df_month['ban'] = df_month['ban'].astype('int64')
        df_month = df_month.groupby('ban').tail(1)  # last row per ban
        df_out = df_features.merge(df_month[['ban', 'target_ind']], on='ban', how='left')
        df_out.rename(columns={'target_ind': 'target'}, inplace=True)
        df_out.dropna(subset=['target'], inplace=True)
        df_out['target'] = df_out['target'].astype(int)
        return df_out

    df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                           compression='gzip')
    df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),
                          compression='gzip')

    # The train and validation targets come from the same table, so it is
    # queried once and filtered per month.
    client = bigquery.Client(project=project_id)
    sql_targets = ''' SELECT * FROM `{}.{}.bq_call_to_retention_targets` '''.format(project_id, dataset_id)
    df_targets = client.query(sql_targets).to_dataframe()

    # set up df_train
    df_train = attach_target(df_train, df_targets, score_date_dash)
    print(df_train.shape)

    # set up df_test
    df_test = attach_target(df_test, df_targets, score_date_val_dash)
    print(df_test.shape)

    del df_targets

    # set up features (list): columns present in both frames, in the column
    # order of df_train so that the order is the same on every run
    test_cols = set(df_test.columns)
    features = [f for f in df_train.columns
                if f in test_cols and f not in ('ban', 'target')]

    # train test split; only the training part is used for fitting
    df_train, _ = train_test_split(df_train, shuffle=True, test_size=0.2, random_state=42,
                                   stratify=df_train['target'])

    X_train = df_train[features]
    y_train = np.squeeze(df_train['target'].values)

    X_test = df_test[features]
    y_test = np.squeeze(df_test['target'].values)

    del df_train, df_test
    gc.collect()

    # build model and fit in training data
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=8,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1
        # seed=27
    )

    xgb_model.fit(X_train, y_train)
    print('xgb training done')

    # NOTE(review): `ntree_limit` / `best_iteration` are kept exactly as in
    # the original call; their availability without early stopping depends on
    # the installed xgboost version -- confirm against the pipeline image.
    pred_prb = xgb_model.predict_proba(X_test, ntree_limit=xgb_model.best_iteration)[:, 1]
    lg = get_lift(pred_prb, y_test, 10)

    # save the model in GCS
    models_dict = {}
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    models_dict['create_time'] = create_time
    models_dict['model'] = xgb_model
    models_dict['features'] = features
    # index=False belongs to to_csv; it used to be passed to str.format, where
    # it was silently ignored.
    lg.to_csv('gs://{}/lift_on_scoring_data_{}.csv'.format(file_bucket, create_time), index=False)

    with open('model_dict.pkl', 'wb') as handle:
        pickle.dump(models_dict, handle)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # Create the (empty) folder placeholder object if it is missing
    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    blob = bucket.blob(MODEL_PATH)
    if not blob.exists(storage_client):
        blob.upload_from_string('')

    model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
    blob = bucket.blob(model_name_onbkt)
    blob.upload_from_filename('model_dict.pkl')

    print(f"....model loaded to GCS done at {str(create_time)}")

    # NOTE(review): fixed two-minute wait kept from the original; its purpose
    # is not visible here -- confirm it is still required.
    time.sleep(120)


### pycaret_automl

In [None]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

from pycaret.classification import setup, create_model, tune_model, predict_model, get_config, compare_models, save_model

# Lower-case aliases of the notebook parameters used by the code below
project_id = PROJECT_ID
dataset_id = DATASET_ID
region = REGION
service_type = SERVICE_TYPE
bucket_name = BUCKET_NAME
file_bucket = FILE_BUCKET
score_date_dash = SCORE_DATE_DASH
score_date_val_dash = SCORE_DATE_VAL_DASH

df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                       compression='gzip')
df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),
                      compression='gzip')


def get_gcp_bqclient(project_id, use_local_credential=True):
    """Return a BigQuery client, by default authenticated with the gcloud access token."""
    if not use_local_credential:
        return bigquery.Client(project=project_id)
    # Only build the token-based client here: constructing a default client
    # first needs default credentials the caller may not have.
    token = os.popen('gcloud auth print-access-token').read().strip()
    credentials = google.oauth2.credentials.Credentials(token)
    return bigquery.Client(project=project_id, credentials=credentials)


def _select_month_targets(df_targets, date_dash):
    """Return the last target row per ban for the year-month of ``date_dash``."""
    year_month = '-'.join(date_dash.split('-')[:2])  # 'YYYY-MM-DD' -> 'YYYY-MM'
    # .copy() so the column assignment below does not write into a slice
    df_month = df_targets.loc[df_targets['YEAR_MONTH'] == year_month].copy()
    df_month['ban'] = df_month['ban'].astype('int64')
    return df_month.groupby('ban').tail(1)


def _attach_target(df_features, df_month_targets):
    """Left-join ``target_ind`` as ``target`` and keep only rows that have one."""
    df_out = df_features.merge(df_month_targets[['ban', 'target_ind']], on='ban', how='left')
    df_out.rename(columns={'target_ind': 'target'}, inplace=True)
    df_out.dropna(subset=['target'], inplace=True)
    df_out['target'] = df_out['target'].astype(int)
    return df_out


client = get_gcp_bqclient(project_id)

# The train and validation targets come from the same table, so it is queried
# once and filtered per month.
sql_train = ''' SELECT * FROM `{}.{}.bq_call_to_retention_targets` '''.format(project_id, dataset_id)
sql_test = sql_train
df_targets_all = client.query(sql_train).to_dataframe()

#set up df_train
df_target_train = _select_month_targets(df_targets_all, score_date_dash)
df_train = _attach_target(df_train, df_target_train)
print(df_train.shape)

#set up df_test
df_target_test = _select_month_targets(df_targets_all, score_date_val_dash)
df_test = _attach_target(df_test, df_target_test)
print(df_test.shape)

#set up features (list): columns present in both frames, in the column order
# of df_train so that the order is the same on every run
cols_1 = df_train.columns.values
cols_2 = df_test.columns.values
cols = set(cols_1).intersection(set(cols_2))
features = [f for f in cols_1 if f in cols and f not in ('ban', 'target')]

#train / validation split of the training month
df_train, df_val = train_test_split(df_train, shuffle=True, test_size=0.3, random_state=42,
                                    stratify=df_train['target']
                                    )

#test / final hold-out split of the validation month
df_test, df_final = train_test_split(df_test, shuffle=True, test_size=0.3, random_state=42,
                                     stratify=df_test['target']
                                     )

# Modelling frames without the account identifier
train_sampled = df_train.drop(columns=['ban'])
valid_sampled = df_val.drop(columns=['ban'])
test_sampled = df_test.drop(columns=['ban'])
final_sampled = df_final.drop(columns=['ban'])


In [None]:
# Report (rows, columns) of the four modelling frames:
# train, validation, test and final hold-out, in that order
for _split in (train_sampled, valid_sampled, test_sampled, final_sampled):
    print(_split.shape)

In [None]:
# Identifier, feature matrix and target vector of each split
ban_train = df_train['ban']
X_train = df_train[features]
y_train = np.squeeze(df_train['target'].values)

ban_val = df_val['ban']
X_val = df_val[features]
y_val = np.squeeze(df_val['target'].values)

ban_test = df_test['ban']
X_test = df_test[features]
y_test = np.squeeze(df_test['target'].values)

# del df_train, df_val, df_test
# gc.collect()

################################ Pycaret Setup initialize  ############################ 
classification_setup = setup(data=train_sampled, 
                         # ignore_features=drop_cols,
                         test_data = valid_sampled,
                         target='target',
                         fix_imbalance=False,
                         remove_outliers = True,
                         normalize=True,
                         normalize_method='zscore',
                         log_experiment=False,
                         remove_multicollinearity=True,
                         multicollinearity_threshold=0.95,
                         feature_selection=True,
                         fold=5,
                         fold_shuffle=True,
                         session_id=123,
                         numeric_features=features,
                         silent=True)

### Pycaret top 3 models to analyze
best_model = compare_models(include = ['rf','xgboost','lightgbm'],errors='raise', n_select=3)

# save the model reports and report figs of the selected models to GCS
todays_date = datetime.now().strftime("%Y-%m-%d")
save_path = f'pycaret/{todays_date}/'
model_reports, model_to_report_map = evaluate_and_save_models(models=best_model.copy(), 
                                     bucket_name=bucket_name,
                                     save_path=save_path, 
                                     test_df=test_sampled,
                                     actual_label_str='target',
                                     columns = get_config('X_train').columns,
                                     save_columns=True,
                                     show_report=False)

# Find the top model: rank the reports by Recall, then Precision.
# .iloc[0] takes the first ranked row by position; a plain [0] is a label
# lookup and only works when that row happens to carry index label 0.
ranked_reports = model_reports.sort_values(["Recall", "Precision"], ascending=False)
top_model_name = ranked_reports["model_name"].iloc[0]

top_model = None
for candidate in best_model:
    if candidate.__class__.__name__ == top_model_name:
        top_model = candidate

if top_model is None:
    raise ValueError(f"No compared model matches the top-ranked report '{top_model_name}'")

# From here on `best_model` holds the NAME of the top model, not the model list
best_model = top_model_name
best_model_report = model_to_report_map[top_model.__class__.__name__]


In [None]:

################ Export the top model's report and figs to GCS ###############################
bucket = storage.Client().bucket(bucket_name)
save_path = f'models/best_model/{todays_date}/'
create_folder_if_not_exists(save_path)

######### Save HTML report of the best model
best_model_report.write_html(f"{save_path}{todays_date}_{top_model.__class__.__name__}.html")

filename = f"{todays_date}_{top_model.__class__.__name__}.html"
blob = bucket.blob(f"{save_path}{filename}")
blob.upload_from_filename(f"{save_path}{todays_date}_{top_model.__class__.__name__}.html")
logging.info(f"{filename} sucessfully uploaded to GCS bucket!")

####### Save the model
# One date string for both the local file and the blob name, so they always match
model_date = datetime.now().strftime("%Y-%m-%d")
model_file_name = '{save_path}{model_type}_{date}'.format(save_path = save_path,
                                  model_type = top_model.__class__.__name__,
                                  date=model_date)
save_model(top_model,model_file_name)
filename = '{model_type}_{date}.pkl'.format(model_type=top_model.__class__.__name__,date=model_date)
blob = bucket.blob(f"{save_path}{filename}")
blob.upload_from_filename(f"{model_file_name}.pkl")
logging.info(f"{filename} sucessfully uploaded to GCS bucket!")

###############################  Tuned Model  ##############################
model_base = create_model(top_model)
tuned_model, tuner = tune_model(model_base, optimize='recall', return_tuner = True, n_iter = 20)
save_path = f'models/best_model/tuned/{todays_date}/'
model_reports_tuned, model_to_report_map_tuned = evaluate_and_save_models(models=tuned_model, 
                                     bucket_name=bucket_name,
                                     save_path=save_path, 
                                     test_df=test_sampled,
                                     actual_label_str='target',
                                     columns = get_config('X_train').columns,
                                     save_columns=True,
                                     show_report=False)

###############################   Define  Final Model     ##############################
# Recall of the tuned model and of the best untuned model (reports ranked by
# Recall, then Precision)
tuned_recall = model_reports_tuned.Recall.values[0]
base_recall = model_reports.sort_values(["Recall","Precision"],ascending=False).head(1).Recall.values[0]

# Default to the untuned top model; the tuned model replaces it only when its
# recall is strictly higher and below 100%.
final_model_class_name = top_model.__class__.__name__
final_model_report = best_model_report
final_model_file = top_model
if tuned_recall >= 1 :
    logging.info("CAUTION : TUNED MODEL had 100% recall. TUNED model was not selected as best model. ")
elif tuned_recall > base_recall:
    logging.info(f"TUNED MODEL had {tuned_recall*100} recall and Base model without tuning had {base_recall*100} Recall. TUNED model was selected as best model. ")
    final_model_class_name = tuned_model.__class__.__name__
    final_model_report = model_to_report_map_tuned[final_model_class_name]
    final_model_file = tuned_model
else:
    # This branch keeps the base model, so the message now says so
    logging.info(f"TUNED MODEL had {tuned_recall*100} recall and Base model without tuning had {base_recall*100} Recall. Base model was selected as best model. ")

###############################  Save the Report and model    ###############################
# Save HTML report of the selected model
save_path = f'models/final_selected/{todays_date}/'
create_folder_if_not_exists(save_path)
final_model_report.write_html(f"{save_path}{todays_date}_{final_model_class_name}.html")
filename = f"{todays_date}_{final_model_class_name}.html"
blob = bucket.blob(f"{save_path}{filename}")
blob.upload_from_filename(f"{save_path}{todays_date}_{final_model_class_name}.html")
print(f"{filename} sucessfully uploaded to GCS bucket!")


model_file_name = '{save_path}{model_type}_{date}'.format(save_path = save_path,
                                  model_type = final_model_class_name,
                                  date=datetime.now().strftime("%Y-%m-%d"))
save_model(final_model_file,model_file_name)
# The final model is uploaded under the fixed name 'model.pkl'
filename = 'model.pkl'
blob = bucket.blob(f"{save_path}{filename}")
blob.upload_from_filename(f"{model_file_name}.pkl")
print(f"{filename} sucessfully uploaded to GCS bucket!")

######################## Save the final model for the Upload Model component (renamed) ##############
# model.uri = f'gs://{bucket_name}/models/final_selected/{todays_date}/'

# save_model(final_model_file,'model.pkl')
# final_model_file.save_model(model.path + ".bst")

###################### Output Final Selected Model's HTML Report View ########################
# NOTE(review): `model_metrics_report` is not defined in the visible part of
# this notebook; it looks like an output artifact of a pipeline component --
# confirm it exists before running this line.
model_metrics_report.path = f'gs://{bucket_name}/{save_path}{todays_date}_{final_model_class_name}.html'

In [None]:


# Scratch cell: pull the training table from BigQuery and split it into
# train / validation / test frames ordered by `partition_dt`.
# NOTE(review): relies on names defined outside this cell (project_id,
# resources_bucket_name, utils_file_path, utils_filename, plot_utils_filename,
# dataset, training_dataset, training_perc) - confirm they are set before running.
from google.cloud import bigquery
from google.cloud import storage
from datetime import datetime
import logging
from pycaret.classification import setup, create_model, tune_model, predict_model, get_config, compare_models, save_model
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve, mean_squared_error, f1_score, precision_score, recall_score, confusion_matrix, roc_curve


### import data
# CREDENTIALS = google.oauth2.credentials.Credentials(token)
# # import google.oauth2.credentials

client = bigquery.Client(project=project_id, location='northamerica-northeast1')
storage_client = storage.Client(project=project_id)


# Download the helper modules from the resources bucket into the working
# directory so the imports right below can resolve them.
bucket = storage_client.get_bucket(resources_bucket_name)
blob = bucket.get_blob(f"{utils_file_path}/{utils_filename}")
blob.download_to_filename(utils_filename)
blob = bucket.get_blob(f"{utils_file_path}/{plot_utils_filename}")
blob.download_to_filename(plot_utils_filename)

from preprocessing_utils import pre_process_data
from preprocessing_utils import downsampling
from plotly_utils import evaluate_and_save_models,create_folder_if_not_exists,ploty_model_metrics,plotly_feature_importance,plotly_lift_curve, plotly_model_report,plotly_roc, plotly_confusion_matrix,plotly_output_hist,plotly_precision_recall

# specify the path to the training data
training_table = f"{project_id}.{dataset}.{training_dataset}"

# generate the query
train_query = '''
       SELECT * 
                FROM `{training_table}`
    '''.format(training_table = training_table)


job_config = bigquery.QueryJobConfig()

# create a dataframe with the training data
train_all = client.query(train_query, job_config=job_config).to_dataframe()

##############  Split train/valid/test based on Dev Training Sample Size   #######################
# training_perc = 0.62
# Sort once and slice the same ordered frame three times (the split is
# positional, so all three slices must come from an identical ordering).
train_sorted = train_all.sort_values(["partition_dt"])
n_rows = train_sorted.shape[0]

# The first `training_perc` share of rows goes to train; half of the remainder
# goes to validation; everything after that (including any rounding leftover)
# goes to test.
train_end = int(n_rows * training_perc)
valid_end = train_end + int(n_rows * ((1 - training_perc) / 2))

train_df = train_sorted.iloc[:train_end]
valid_df = train_sorted.iloc[train_end:valid_end]
test_df = train_sorted.iloc[valid_end:]
    

### Pipeline

In [None]:
# @dsl.pipeline(
#     # A name for the pipeline.
#     name="{}-xgb-pipeline".format(SERVICE_TYPE_NAME),
#     description=' pipeline for training {} model'.format(SERVICE_TYPE_NAME)
# )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET, 
        file_bucket: str = FILE_BUCKET
    ):
    """Wire up the dataset-creation, preprocessing and training steps.

    Calls, in source order: ``bq_create_dataset`` and ``preprocess`` for the
    training window, the same pair for the validation window, and finally
    ``train_and_save_model``. No explicit ``.after(...)`` ordering is set
    between the steps here.

    Args:
        project_id: Project passed to every step.
        region: Region passed to the dataset-creation steps.
        resource_bucket: Currently not used inside the body.
        file_bucket: Bucket used to build the ``gs://`` output paths and
            passed to the training step.

    All other settings (score dates, promo expiry windows, ``DATASET_ID``,
    ``SERVICE_TYPE``) are read from module-level constants.
    """
    # ----- create training set --------
    bq_create_training_dataset_op = bq_create_dataset(
        score_date=SCORE_DATE_DASH,
        score_date_delta=SCORE_DATE_DELTA,
        project_id=project_id,
        dataset_id=DATASET_ID,
        region=region,
        promo_expiry_start=PROMO_EXPIRY_START,
        promo_expiry_end=PROMO_EXPIRY_END,
        v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_LAST_MONTH_END_DASH,
    )

    # ----- preprocessing train data --------
    preprocess_train_op = preprocess(
        pipeline_dataset='bq_ctr_pipeline_dataset',
        save_data_path='gs://{}/{}_train.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID,
    )

    # preprocess_train_op.set_memory_limit('128G')
    # preprocess_train_op.set_cpu_limit('32')

    # ----- create validation set --------
    bq_create_validation_dataset_op = bq_create_dataset(
        score_date=SCORE_DATE_VAL_DASH,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        project_id=project_id,
        dataset_id=DATASET_ID,
        region=region,
        promo_expiry_start=PROMO_EXPIRY_START_VAL,
        promo_expiry_end=PROMO_EXPIRY_END_VAL,
        v_start_date=SCORE_DATE_VAL_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_VAL_LAST_MONTH_END_DASH,
    )

    # ----- preprocessing validation data --------
    # NOTE(review): reads the same 'bq_ctr_pipeline_dataset' as the training
    # preprocessing step above; only the output path differs - confirm intended.
    preprocess_validation_op = preprocess(
        pipeline_dataset='bq_ctr_pipeline_dataset',
        save_data_path='gs://{}/{}_validation.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID,
    )

    # preprocess_validation_op.set_memory_limit('256G')
    # preprocess_validation_op.set_cpu_limit('32')

    # ----- train and save the model --------
    train_and_save_model_op = train_and_save_model(
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
        score_date_val_dash=SCORE_DATE_VAL_DASH,
        project_id=project_id,
        dataset_id=DATASET_ID,
    )
    
#     train_and_save_model_op.set_memory_limit('256G')
#     train_and_save_model_op.set_cpu_limit('32')

#     train_and_save_model_op.after(preprocess_train_op)
#     train_and_save_model_op.after(preprocess_validation_op)


### Run the Pipeline Job

In [None]:
# pipeline(project_id = PROJECT_ID,
#         region = REGION,
#         resource_bucket = RESOURCE_BUCKET,
#         file_bucket = FILE_BUCKET)


# Call the pipeline function directly with the notebook-level configuration.
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
    file_bucket=FILE_BUCKET,
)

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs

# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                display_name=PIPELINE_NAME,
#                                template_path="pipeline.json",
#                                location=REGION,
#                                enable_caching=False,
#                                pipeline_root = f"gs://{RESOURCE_BUCKET}"
# )
# job.run(
#    service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com"
# )