In [None]:

import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# ---------------------------------------------------------------------------
# Project / environment settings
# ---------------------------------------------------------------------------
SERVICE_TYPE = 'tos_crosssell'
DATASET_ID = 'tos_crosssell'
PROJECT_ID = 'divg-josh-pr-d1cc3a'  # mapping['PROJECT_ID']
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'  # mapping['resources_bucket']
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'  # mapping['gcs_csv_bucket']
REGION = 'northamerica-northeast1'  # mapping['REGION']
MODEL_ID = '9999'
# The folder name contains no placeholder, so MODEL_ID is not part of the path.
FOLDER_NAME = 'xgb_tos_cross_sell_train_deploy'
QUERIES_PATH = f'vertex_pipelines/{FOLDER_NAME}/queries/'

# Reference dates for the training and validation snapshots (currently fixed).
scoringDate = date(2022, 6, 30)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
valScoringDate = date(2022, 9, 30)  # scoringDate - relativedelta(days=2)

# ---------------------------------------------------------------------------
# BigQuery view names, training snapshot
# ---------------------------------------------------------------------------
CONSL_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_consl_data_training_bi_layer'
FFH_BILLING_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_ffh_billing_data_training_bi_layer'
HS_USAGE_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_hs_usage_data_training_bi_layer'
DEMO_INCOME_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_demo_income_data_training_bi_layer'
PROMO_EXPIRY_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_promo_expiry_data_training_bi_layer'
TROUBLE_TICKETS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_trouble_tickets_data_training_bi_layer'
GPON_COPPER_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_gpon_copper_data_training_bi_layer'
CALL_DATA_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_call_data_training_bi_layer'
HSIA_DROPS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_hsia_drops_training_bi_layer'

# ---------------------------------------------------------------------------
# BigQuery view names, validation snapshot
# ---------------------------------------------------------------------------
CONSL_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_consl_data_validation_bi_layer'
FFH_BILLING_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_ffh_billing_data_validation_bi_layer'
HS_USAGE_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_hs_usage_data_validation_bi_layer'
DEMO_INCOME_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_demo_income_data_validation_bi_layer'
PROMO_EXPIRY_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_promo_expiry_data_validation_bi_layer'
TROUBLE_TICKETS_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_trouble_tickets_data_validation_bi_layer'
GPON_COPPER_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_gpon_copper_data_validation_bi_layer'
CALL_DATA_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_call_data_validation_bi_layer'
HSIA_DROPS_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_hsia_drops_validation_bi_layer'

# ---------------------------------------------------------------------------
# Date strings derived from the training snapshot date
# ---------------------------------------------------------------------------
_DASHED = '%Y-%m-%d'
_COMPACT = '%Y%m%d'

# First day of the month six months before the snapshot, and the last day of
# the month preceding the snapshot; each is computed once and reused below.
_train_window_start = (scoringDate - relativedelta(months=6)).replace(day=1)
_train_prev_month_end = scoringDate.replace(day=1) - timedelta(days=1)

SCORE_DATE = scoringDate.strftime(_COMPACT)  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = scoringDate.strftime(_DASHED)
SCORE_DATE_MINUS_6_MOS_DASH = _train_window_start.strftime(_DASHED)
SCORE_DATE_LAST_MONTH_END_DASH = _train_prev_month_end.strftime(_DASHED)
SCORE_DATE_LAST_MONTH_YEAR = _train_prev_month_end.year
SCORE_DATE_LAST_MONTH_MONTH = _train_prev_month_end.month

# ---------------------------------------------------------------------------
# Date strings derived from the validation snapshot date
# ---------------------------------------------------------------------------
_val_window_start = (valScoringDate - relativedelta(months=6)).replace(day=1)
_val_prev_month_end = valScoringDate.replace(day=1) - timedelta(days=1)

SCORE_DATE_VAL = valScoringDate.strftime(_COMPACT)
SCORE_DATE_VAL_DASH = valScoringDate.strftime(_DASHED)
SCORE_DATE_VAL_MINUS_6_MOS_DASH = _val_window_start.strftime(_DASHED)
SCORE_DATE_VAL_LAST_MONTH_END_DASH = _val_prev_month_end.strftime(_DASHED)
SCORE_DATE_VAL_LAST_MONTH_YEAR = _val_prev_month_end.year
SCORE_DATE_VAL_LAST_MONTH_MONTH = _val_prev_month_end.month

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried

# ---------------------------------------------------------------------------
# Locations of the SQL templates that build the per-account input tables
# ---------------------------------------------------------------------------
ACCOUNT_CONSL_QUERY_PATH = f'{QUERIES_PATH}create_input_account_consl_query.txt'
ACCOUNT_HSIA_DROPS_QUERY_PATH = f'{QUERIES_PATH}create_input_account_hsia_drops_query.txt'
ACCOUNT_CALL_DATA_QUERY_PATH = f'{QUERIES_PATH}create_input_account_call_data_query.txt'
ACCOUNT_GPON_COPPER_QUERY_PATH = f'{QUERIES_PATH}create_input_account_gpon_copper_query.txt'
ACCOUNT_TROUBLE_TICKETS_QUERY_PATH = f'{QUERIES_PATH}create_input_account_trouble_tickets_query.txt'
ACCOUNT_PROMO_EXPIRY_QUERY_PATH = f'{QUERIES_PATH}create_input_account_promo_expiry_query.txt'
# ACCOUNT_TV_USAGE_QUERY_PATH = QUERIES_PATH + 'create_input_account_tv_usage_query.txt'
ACCOUNT_DEMO_INCOME_QUERY_PATH = f'{QUERIES_PATH}create_input_account_demo_income_query.txt'
ACCOUNT_HS_USAGE_QUERY_PATH = f'{QUERIES_PATH}create_input_account_hs_usage_query.txt'
ACCOUNT_FFH_BILLING_QUERY_PATH = f'{QUERIES_PATH}create_input_account_ffh_billing_query.txt'

def train_and_save_model(
        file_bucket: str,
        service_type: str,
        score_date_dash: str,
        score_date_val_dash: str,
        project_id: str,
        dataset_id: str,
        # metrics: Output[Metrics],
        # metricsc: Output[ClassificationMetrics],
):
    """Train an XGBoost cross-sell classifier and upload it to GCS.

    Steps performed:
      1. Read the gzipped training / validation feature CSVs from
         ``gs://<file_bucket>/<service_type>_{train,validation}.csv.gz``.
      2. Pull the target tables ``bq_tos_cross_sell_targets_q3`` / ``_q4`` from
         BigQuery and join ``product_crosssell_ind`` onto the features by
         ``ban`` as the ``target`` column (rows without a target are dropped).
      3. Split the training frame 70/30 (stratified) into train / hold-out,
         fit an ``XGBClassifier`` and print the hold-out AUC.
      4. Score the validation frame, write a decile lift table to GCS, pickle
         ``{'create_time', 'model', 'features'}`` to ``model_dict.pkl`` and
         upload it under ``<service_type>_xgb_models/`` in ``file_bucket``.

    Args:
        file_bucket: GCS bucket holding the input CSVs and receiving outputs.
        service_type: Prefix used in the CSV names and the model folder name.
        score_date_dash: Currently unused; the target period is hard-coded.
        score_date_val_dash: Currently unused; the target period is hard-coded.
        project_id: GCP project that owns the BigQuery dataset.
        dataset_id: BigQuery dataset containing the target tables.

    Returns:
        None. All results are written to GCS / local disk.

    Raises:
        RuntimeError: if no gcloud access token could be obtained.
    """
    # Imports stay inside the function so it remains self-contained.
    import gc
    import os
    import pickle
    import time
    from datetime import datetime

    import numpy as np  # noqa: F401  (kept available for interactive use)
    import pandas as pd
    import google.oauth2.credentials
    import xgboost as xgb
    from google.cloud import bigquery
    from google.cloud import storage
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    # Target periods are fixed to match the hard-coded q3 / q4 target tables.
    train_period = "2022-Q3"
    test_period = "2022-Q4"

    def get_lift(prob, y_test, q):
        """Return a lift table with one row per probability quantile.

        Rows are ordered from the highest-probability quantile (label 1) to
        the lowest (label ``q``). ``lift`` is the observed positive rate of
        the quantile divided by the overall positive rate.
        """
        result = pd.DataFrame({'Prob': prob, 'Churn': y_test})
        # Label the lowest-probability bin `q` and the highest-probability bin 1.
        result['Decile'] = pd.qcut(result['Prob'], q, labels=list(range(q, 0, -1)))

        by_decile = result.groupby('Decile', observed=False)
        actual = by_decile['Churn'].mean().reset_index()
        actual.columns = ['Decile', 'avg_real_churn_rate']

        lg = by_decile['Prob'].mean().reset_index()
        lg.columns = ['Decile', 'avg_model_pred_churn_rate']
        lg = lg.sort_values('Decile', ascending=False)
        lg['avg_churn_rate_total'] = result['Churn'].mean()
        lg = lg.merge(actual, on='Decile', how='left')
        lg['lift'] = lg['avg_real_churn_rate'] / lg['avg_churn_rate_total']
        return lg

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client.

        With ``use_local_credential`` the client authenticates with the access
        token of the locally logged-in gcloud user; otherwise the library's
        default credentials are used.
        """
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        token = os.popen('gcloud auth print-access-token').read().strip()
        if not token:
            # Fail early with a clear message instead of a late 401 from BigQuery.
            raise RuntimeError(
                '`gcloud auth print-access-token` returned no token; '
                'run `gcloud auth login` first.'
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    def attach_target(df_features, df_targets, period):
        """Join the cross-sell indicator for `period` onto the features as `target`."""
        df_targets = df_targets.loc[df_targets['YEAR_MONTH'] == period].copy()
        df_targets['ban'] = df_targets['ban'].astype('int64')
        # Keep only the last target row per account.
        df_targets = df_targets.groupby('ban').tail(1)
        return (
            df_features
            .merge(df_targets[['ban', 'product_crosssell_ind']], on='ban', how='left')
            .rename(columns={'product_crosssell_ind': 'target'})
            .dropna(subset=['target'])
            .astype({'target': int})
        )

    # ------------------------------------------------------------------
    # Load features and targets
    # ------------------------------------------------------------------
    df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(file_bucket, service_type),
                           compression='gzip')
    df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(file_bucket, service_type),
                          compression='gzip')

    sql_train = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q3` '''.format(project_id, dataset_id)
    sql_test = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q4` '''.format(project_id, dataset_id)

    client = get_gcp_bqclient(project_id)
    df_target_train = client.query(sql_train).to_dataframe()
    df_target_test = client.query(sql_test).to_dataframe()

    df_test = attach_target(df_test, df_target_test, test_period)
    df_train = attach_target(df_train, df_target_train, train_period)

    # Use the columns common to both frames, in the training frame's column
    # order. Iterating a set here would make the feature order (and therefore
    # the trained model) vary from run to run.
    test_columns = set(df_test.columns)
    features = [
        col for col in df_train.columns
        if col in test_columns and col not in ('ban', 'target')
    ]

    df_train, df_val = train_test_split(
        df_train,
        shuffle=True,
        test_size=0.3,
        random_state=42,
        stratify=df_train['target'],
    )

    X_train, y_train = df_train[features], df_train['target'].to_numpy()
    X_val, y_val = df_val[features], df_val['target'].to_numpy()
    X_test, y_test = df_test[features], df_test['target'].to_numpy()

    del df_train, df_val, df_test
    gc.collect()

    # ------------------------------------------------------------------
    # Build model
    # ------------------------------------------------------------------
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.02,
        n_estimators=1000,
        max_depth=10,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27
    )

    xgb_model.fit(X_train, y_train)
    print(' xgb training done ')

    # No early stopping is configured, so predict with the full ensemble.
    # AUC is a ranking metric and must be computed on probabilities, not on
    # labels thresholded at 0.5.
    y_pred = xgb_model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print(' validation AUC: {:.4f}'.format(auc))
    # metrics.log_metric("AUC", auc)

    pred_prb = xgb_model.predict_proba(X_test)[:, 1]

    # TODO: join ban, X_test and pred_prb and print to csv

    lg = get_lift(pred_prb, y_test, 10)

    # ------------------------------------------------------------------
    # Save the lift table and the model in GCS
    # ------------------------------------------------------------------
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    models_dict = {
        'create_time': create_time,
        'model': xgb_model,
        'features': features,
    }
    # `index=False` belongs to to_csv, not to str.format.
    lg.to_csv('gs://{}/lift_on_scoring_data_{}.csv'.format(file_bucket, create_time), index=False)
    print("....lift_to_csv done")

    # The `with` block closes the file; no explicit close() is needed.
    with open('model_dict.pkl', 'wb') as handle:
        pickle.dump(models_dict, handle)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # Create an empty placeholder object for the model "folder" if missing.
    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    blob = bucket.blob(MODEL_PATH)
    if not blob.exists(storage_client):
        blob.upload_from_string('')

    model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, create_time)
    blob = bucket.blob(model_name_onbkt)
    blob.upload_from_filename('model_dict.pkl')
    # NOTE(review): 5-minute pause retained from the original; its purpose is
    # not evident from this file -- confirm whether it is still required.
    time.sleep(300)

In [None]:
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET,
        file_bucket: str = FILE_BUCKET,
):
    """Run the training step for the configured service type.

    Args:
        project_id: GCP project passed on to ``train_and_save_model``.
        region: Accepted for interface compatibility; not used here.
        resource_bucket: Accepted for interface compatibility; not used here.
        file_bucket: GCS bucket passed on to ``train_and_save_model``.

    Returns:
        The ``pipeline`` function object itself (kept for backward
        compatibility with the existing behaviour).
    """
    # Forward the caller's arguments instead of silently falling back to the
    # module-level constants, so overriding project_id / file_bucket works.
    train_and_save_model(
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
        score_date_val_dash=SCORE_DATE_VAL_DASH,
        project_id=project_id,
        dataset_id=DATASET_ID,
    )

    return pipeline

In [None]:
# Kick off training with the notebook-level configuration.
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
    file_bucket=FILE_BUCKET,
)

In [None]:
# NOTE(review): this cell re-defines `pipeline` exactly as an earlier cell
# does; the later definition shadows the earlier one. Consider deleting one.
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET,
        file_bucket: str = FILE_BUCKET,
):
    """Run the training step for the configured service type.

    Args:
        project_id: GCP project passed on to ``train_and_save_model``.
        region: Accepted for interface compatibility; not used here.
        resource_bucket: Accepted for interface compatibility; not used here.
        file_bucket: GCS bucket passed on to ``train_and_save_model``.

    Returns:
        The ``pipeline`` function object itself (kept for backward
        compatibility with the existing behaviour).
    """
    # Forward the caller's arguments instead of silently falling back to the
    # module-level constants, so overriding project_id / file_bucket works.
    train_and_save_model(
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
        score_date_val_dash=SCORE_DATE_VAL_DASH,
        project_id=project_id,
        dataset_id=DATASET_ID,
    )

    return pipeline