### Import Libraries, declare variables

In [None]:
import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# --- Pipeline identity ---------------------------------------------------
SERVICE_TYPE = 'telus_postpaid_churn'
DATASET_ID = 'telus_postpaid_churn'
MODEL_ID = '9999'

# --- GCP environment -----------------------------------------------------
# Hard-coded here; the trailing comments name the `mapping` keys these
# values stand in for.
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4'  # mapping['PROJECT_ID']
REGION = 'northamerica-northeast1'  # mapping['REGION']
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'  # mapping['resources_bucket']
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'  # mapping['gcs_csv_bucket']

# --- Paths ---------------------------------------------------------------
# The template has no `{}` placeholder, so MODEL_ID does not end up in the
# folder name; FOLDER_NAME is simply 'telus_postpaid_churn'.
FOLDER_NAME = 'telus_postpaid_churn'.format(MODEL_ID)
QUERIES_PATH = ''.join(('vertex_pipelines/', FOLDER_NAME, '/queries/'))

# --- BigQuery tables -----------------------------------------------------
TRAIN_TABLE_ID = 'bq_tpc_training_dataset'
VAL_TABLE_ID = 'bq_tpc_validation_dataset'
SCORE_TABLE_ID = 'bq_telus_postpaid_churn_scores'

# --- Scoring dates (fixed; the relative-date formulas are kept for reference)
scoringDate = date(2023, 8, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
valScoringDate = date(2023, 10, 1)  # scoringDate - relativedelta(days=2)


### import bq to dataframe function

In [None]:
import pandas as pd 
import numpy as np 
from google.cloud import bigquery
from google.oauth2 import credentials

def import_bq_to_dataframe(project_id, dataset_id, table_id, client):
    """
    Read every row and column of one BigQuery table through `client`.

    Args:
        project_id: Project that owns the table.
        dataset_id: Dataset that contains the table.
        table_id: Name of the table to read.
        client: Object whose `query(sql)` returns a job exposing
            `to_dataframe()`, e.g. `bigquery.Client(project=project_id)`.

    Returns:
        The result of `client.query(...).to_dataframe()` (a DataFrame when
        `client` is a BigQuery client).

    Example:
        client = bigquery.Client(project='my-project')
        df = import_bq_to_dataframe('my-project', 'my_dataset', 'my_table', client)
    """
    fully_qualified_table = f"{project_id}.{dataset_id}.{table_id}"
    query_job = client.query(f"SELECT * FROM `{fully_qualified_table}`")
    return query_job.to_dataframe()
    

### define get_lift and get_gcp_bqclient functions, import df_train and df_val from BigQuery

In [None]:
import gc
import time
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# Lower-case working aliases for the notebook-level constants; the cells
# below refer to these names.
project_id, dataset_id = PROJECT_ID, DATASET_ID
region = REGION
resource_bucket, file_bucket = RESOURCE_BUCKET, FILE_BUCKET
service_type = SERVICE_TYPE
train_table_id, val_table_id = TRAIN_TABLE_ID, VAL_TABLE_ID

def get_lift(prob, y_test, q):
    """Build a per-quantile lift table from predicted probabilities.

    Rows are bucketed into `q` equal-frequency bins of `prob` with
    `pd.qcut`; the bin holding the highest probabilities is labelled 1 and
    the bin holding the lowest is labelled `q`.

    Args:
        prob: 1-D array-like of predicted probabilities.
        y_test: 1-D array-like of observed labels, paired with `prob` by
            position. A pandas Series is accepted; its index is ignored.
        q: Number of quantile bins (10 gives deciles).

    Returns:
        DataFrame with one row per bin, ordered from bin 1 to bin `q`, and
        the columns `Decile`, `avg_model_pred_churn_rate` (mean of `prob`
        in the bin), `avg_churn_rate_total` (mean of `y_test` overall),
        `avg_real_churn_rate` (mean of `y_test` in the bin) and `lift`
        (`avg_real_churn_rate / avg_churn_rate_total`).

    Raises:
        ValueError: If `prob` and `y_test` differ in length, or if
            `pd.qcut` cannot form `q` distinct bin edges.
    """
    # Convert both inputs to plain arrays so they are paired by position.
    # Assigning a Series column-wise would align on its index instead and
    # silently mis-pair (or NaN-fill) labels whose index is not 0..n-1.
    result = pd.DataFrame({
        'Prob': np.asarray(prob).ravel(),
        'Churn': np.asarray(y_test).ravel(),
    })
    result['Decile'] = pd.qcut(result['Prob'], q, labels=list(range(q, 0, -1)))

    # observed=False keeps every bin label in the output.
    by_decile = result.groupby('Decile', observed=False)

    add = by_decile['Churn'].mean().reset_index()
    add.columns = ['Decile', 'avg_real_churn_rate']

    lg = by_decile['Prob'].mean().reset_index()
    lg.columns = ['Decile', 'avg_model_pred_churn_rate']
    # The categorical is ordered q < ... < 1, so a descending sort puts
    # bin 1 (highest probabilities) first.
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_churn_rate_total'] = result['Churn'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg['lift'] = lg['avg_real_churn_rate'] / lg['avg_churn_rate_total']

    return lg

def get_gcp_bqclient(project_id, use_local_credential=True):
    """Create a BigQuery client for `project_id`.

    Args:
        project_id: Project the client is bound to.
        use_local_credential: When True (the default), authenticate with the
            access token printed by `gcloud auth print-access-token`. When
            False, let `bigquery.Client` resolve credentials on its own.

    Returns:
        A `bigquery.Client` instance.
    """
    if not use_local_credential:
        return bigquery.Client(project=project_id)

    # Only shell out for a token when it is actually needed, and close the
    # pipe once the output has been read.
    with os.popen('gcloud auth print-access-token') as pipe:
        token = pipe.read()
    token = re.sub(r'\n$', '', token)  # drop the trailing newline from the CLI output
    credentials = google.oauth2.credentials.Credentials(token)

    # Build exactly one client. The previous version first constructed a
    # default-credential client and then threw it away.
    return bigquery.Client(project=project_id, credentials=credentials)

# Authenticate once, then pull the training and validation tables into memory.
client = get_gcp_bqclient(project_id)

df_train, df_val = (
    import_bq_to_dataframe(project_id, dataset_id, source_table, client)
    for source_table in (train_table_id, val_table_id)
)

print(f'df_train: {df_train.shape}')
print(f'df_val: {df_val.shape}')


### add targets to df_train and df_target 

- df_target_train is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q3` 
- df_target_test is from `divg-josh-pr-d1cc3a.tos_crosssell.bq_tos_cross_sell_targets_q4` 
- Some parts of the code and the SQL queries must be made dynamic (parameterized) before they can be included in the deployed model.

In [None]:
# Stratified 70/30 split of the training table into train and test sets.
df_train, df_test = train_test_split(df_train, test_size=0.3, random_state=42, stratify=df_train['target_ind'])

# Persist the three splits to the file bucket.
df_train.to_csv('gs://{}/{}/{}_train_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)
df_test.to_csv('gs://{}/{}/{}_test_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)
df_val.to_csv('gs://{}/{}/{}_val_final.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE), index=False)

#set up features (list): keep only columns present in all three splits
cols = set(df_train.columns) & set(df_test.columns) & set(df_val.columns)

features_to_exclude = ['ban', 'subscriber_no', 'province', 'postal_code', 'pref_lang', 'target_ind', 'start_dvc_bal_amt']

# Sort the names: iterating a set of strings yields a different order from
# one interpreter run to the next, which would make the column order of
# X_* (and the column sampling done by the model) non-reproducible.
features = sorted(f for f in cols if f not in features_to_exclude)

# .copy() gives each X_* its own data; to_categorical below drops columns
# in place, which must not act on a slice of df_*.
ban_train = df_train[['ban', 'subscriber_no']]
X_train = df_train[features].copy()
y_train = np.squeeze(df_train['target_ind'].values)
target_train = df_train['target_ind']

ban_test = df_test[['ban', 'subscriber_no']]
X_test = df_test[features].copy()
y_test = np.squeeze(df_test['target_ind'].values)
target_test = df_test['target_ind']

ban_val = df_val[['ban', 'subscriber_no']]
X_val = df_val[features].copy()
y_val = np.squeeze(df_val['target_ind'].values)
target_val = df_val['target_ind']


In [None]:
# Sanity check: display (rows, columns) of the test feature matrix.
X_test.shape

### preprocess

In [None]:
def to_categorical(df, cat_feature_names):
    """Replace the given columns with their `pd.get_dummies` encoding.

    Args:
        df: Input DataFrame. It is not modified.
        cat_feature_names: List of column names in `df` to encode.

    Returns:
        A new DataFrame holding the remaining columns of `df` followed by
        the dummy columns. In dummy column names '&' becomes 'and'; in all
        column names ' ' and '-' become '_'.

    Raises:
        KeyError: If a name in `cat_feature_names` is not a column of `df`.
    """
    dummies = pd.get_dummies(df[cat_feature_names])
    dummies.columns = (
        dummies.columns
        .str.replace('&', 'and', regex=False)
        .str.replace(' ', '_', regex=False)
    )

    # Non-inplace drop: the caller's frame keeps its original columns. The
    # previous in-place drop silently stripped them from the argument.
    encoded = df.drop(columns=cat_feature_names).join(dummies)

    #column name clean-up
    encoded.columns = (
        encoded.columns
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
    )

    return encoded

cat_feature_names = ['revenue_band', 'payment_mthd', 'ebill_ind', 'dvc_non_telus_ind', 'credit_class', 'contract_type', 'bacct_delinq_ind', 'urbn_rur_ind',
                     'dnc_sms_ind', 'dnc_em_ind', 'data_usg_trend', 'wls_data_plan_ind', 'wls_data_shr_plan_ind', 'demo_sgname', 'demo_lsname']

# cat_feature_names = ['revenue_band', 'payment_mthd', 'ebill_ind', 'dvc_non_telus_ind', 'credit_class', 'contract_type', 'bacct_delinq_ind', 'urbn_rur_ind',
#                      'dnc_sms_ind', 'dnc_em_ind', 'data_usg_trend', 'wls_data_plan_ind', 'wls_data_shr_plan_ind']

# Each split is encoded on its own, so a category value missing from one
# split produces no dummy column there.
X_train = to_categorical(X_train, cat_feature_names)
X_test = to_categorical(X_test, cat_feature_names)
X_val = to_categorical(X_val, cat_feature_names)

#set up features (list): keep only columns that exist in all three encoded splits
cols = set(X_train.columns) & set(X_test.columns) & set(X_val.columns)

# Sort the names: iterating a set of strings yields a different order from
# one interpreter run to the next, which would make the model's column
# order (and the saved `features` list) non-reproducible.
features = sorted(f for f in cols if f not in features_to_exclude)

# features = ['contract_type_MTM'
# , 'contract_type_BYOD'
# , 'contract_type_On_Contract'
# , 'cntrct_end_recency'
# , 'cntrct_start_recency'
# , 'ban_tenure'
# , 'contract_mth'
# , 'urbn_rur_ind_Rural'
# , 'sub_tenure'
# , 'urbn_rur_ind_Urban'
# , 'dvc_non_telus_ind_Y'
# , 'demo_lsname_Large_Diverse_Families'
# , 'dvc_non_telus_ind_N'
# , 'demo_sgname_Diverse_Urban_Fringe'
# , 'demo_sgname_Lower_Middle_Rural'
# , 'data_usg_trend_unknown'
# , 'demo_sgname_Midscale_Urban_Fringe'
# , 'clk_app_offer'
# , 'bacct_delinq_ind_Y'
# , 'revenue_band_D'
# , 'payment_mthd_R'
# , 'clk_app_usage'
# , 'demo_sgname_Upper_Middle_Suburbia'
# , 'demo_sgname_Urban_Diversity'
# , 'easy_pymt_avg'
# , 'rate_plan_amt'
# , 'demo_sgname_Upscale_Suburban_Diversity'
# , 'sub_cnt'
# , 'clk_app_ovrview'
# , 'credit_class_D'
# , 'demo_sgname_Town_Mix'
# , 'demo_sgname_Young_Urban_Core'
# , 'payment_mthd_C'
# , 'bacct_delinq_ind_N'
# , 'artm_min_qty_avg'
# , 'credit_class_V'
# , 'hm_call_cnt_avg'
# , 'wls_data_shr_plan_ind_Y'
# , 'demo_lsname_Older_Families_and_Empty_Nests'
# , 'dnc_sms_ind_N'
# , 'demo_lsname_School_Age_Families'
# , 'credit_class_X'
# , 'clk_web_plan'
# , 'hm_data_usg_avg'
# , 'revenue_band_F'
# , 'demo_lsname_Young_Families'
# , 'demo_lsname_Middle_Age_Families'
# , 'clk_app_subslct'
# , 'cr_disc_amt_avg'
# , 'tot_data_usg_avg'
# , 'age'
# , 'demo_avg_income'
# , 'data_usg_trend_increasing'
# , 'revenue_band_N'
# , 'revenue_band_C'
# , 'dvc_non_telus_ind_U'
# , 'wls_data_plan_ind_N'
# , 'ebill_ind_N'
# , 'net_inv_amt_avg'
# , 'clk_app_bill'
# , 'clk_app_changefg'
# , 'revenue_band_NA'
# , 'demo_sgname_Older_Urban_Francophone'
# , 'demo_sgname_Upper_Middle_Rural'
# , 'sms_dnc_recency'
# , 'demo_sgname_Upper_Middle_Suburban_Francophone'
# , 'demo_sgname_Unassigned'
# , 'clk_web_bill'
# , 'clk_app_drawer'
# , 'demo_sgname_Rural_Francophone'
# , 'clk_web_phnumber'
# , 'ld_min_qty_avg'
# , 'em_dnc_recency'
# , 'effort_durtn_sec_qty_s'
# ]

# Restrict every split to the shared feature list, in the same column order.
X_train, X_test, X_val = (
    split_frame[features] for split_frame in (X_train, X_test, X_val)
)

X_train.head()

In [None]:

# Drop the raw frames and ask the garbage collector to reclaim their memory.
del df_train
del df_test
del df_val
gc.collect()


### fit training data in xgboost classifier

In [None]:

# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Hyper-parameters of the classifier, gathered in one mapping so they can
# be inspected or logged as a unit.
xgb_params = dict(
    learning_rate=0.05,
    n_estimators=250,
    max_depth=12,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split only; no evaluation set or early stopping is used.
xgb_model.fit(X_train, y_train)
print('xgb training done')


In [None]:
# Print (rows, columns) of the train and test feature matrices.
print(X_train.shape)
print(X_test.shape)

### make predictions on X_test set, assign deciles to the predicted values, and save in df_test_exp

In [None]:
#predictions on X_test
# Use the full fitted model. The previous `ntree_limit=xgb_model.best_iteration`
# passed a 0-based iteration index as a tree count (one tree short), and
# `ntree_limit` is deprecated/removed in recent XGBoost releases.
y_pred = xgb_model.predict_proba(X_test)[:, 1]

#join ban_test, X_test, y_test and y_pred and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test
df_test_exp = df_ban_test.join(X_test)  # index-based join of ids and features
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = y_pred
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
# Label 1 is the bin with the highest probabilities, label q the lowest.
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(y_pred, y_test, q)

lg

### export df_test_exp and lift scores to gcs bucket

In [None]:
# `index=` belongs to to_csv(); it used to sit inside str.format(), where
# it was silently ignored, so the lift table was written with its index
# column despite index=False.
df_test_exp.to_csv('gs://{}/{}/{}_df_test_exp.csv'.format(file_bucket, SERVICE_TYPE, SERVICE_TYPE), index=True)
print("....df_test_exp exported")

lg.to_csv('gs://{}/{}/{}_lift_on_test_data.csv'.format(file_bucket, SERVICE_TYPE, SERVICE_TYPE), index=False)
print("....lift_to_csv done")

### make predictions on X_val set, assign deciles to the predicted values, and save in df_val_exp

In [None]:
#predictions on X_val
# Use the full fitted model. The previous `ntree_limit=xgb_model.best_iteration`
# passed a 0-based iteration index as a tree count (one tree short), and
# `ntree_limit` is deprecated/removed in recent XGBoost releases.
y_pred = xgb_model.predict_proba(X_val)[:, 1]

#join ban_val, X_val, y_val and y_pred and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_val = ban_val
df_val_exp = df_ban_val.join(X_val)  # index-based join of ids and features
df_val_exp['y_val'] = y_val
df_val_exp['y_pred_proba'] = y_pred
df_val_exp['y_pred'] = (df_val_exp['y_pred_proba'] > 0.5).astype(int)
# Label 1 is the bin with the highest probabilities, label q the lowest.
df_val_exp['decile'] = pd.qcut(df_val_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(y_pred, y_val, q)

lg

### export df_val_exp and lift scores to gcs bucket

In [None]:
# `index=` belongs to to_csv(); it used to sit inside str.format(), where
# it was silently ignored, so the lift table was written with its index
# column despite index=False.
df_val_exp.to_csv('gs://{}/{}/{}_df_val_exp.csv'.format(file_bucket, SERVICE_TYPE, SERVICE_TYPE), index=True)
print("....df_val_exp exported")

lg.to_csv('gs://{}/{}/{}_lift_on_val_data.csv'.format(file_bucket, SERVICE_TYPE, SERVICE_TYPE), index=False)
print("....lift_to_csv done")


### get feature importances from xgboost model

In [None]:
# Get feature importances from xgboost model
# Per-feature importance scores reported by the fitted model, paired with
# the training column names.
importances = xgb_model.feature_importances_
feature_names = X_train.columns

# Positions of the scores, ordered from most to least important.
ascending_positions = np.argsort(importances)
sorted_index = ascending_positions[::-1]
x = range(len(importances))

# Feature names in that same most-to-least-important order (tick labels).
labels = np.array(feature_names)[sorted_index]

In [None]:
# Print "feature, importance" pairs, most important feature first.
for idx in sorted_index:
    name, score = feature_names[idx], importances[idx]
    print(f'{name}, {score}')

In [None]:
# Display the feature names ordered from most to least important.
labels

In [None]:
# Display the raw importance scores, in training-column order.
importances

In [None]:
# Display the column positions ordered from most to least important.
np.argsort(importances)[::-1]

### Load results to BQ - WIP

In [None]:
# Destination table for the validation scores (overrides the value set in
# the first cell).
SCORE_TABLE_ID = 'bq_telus_12_months_churn_scores'

project_id = PROJECT_ID
dataset_id = DATASET_ID
score_table_id = SCORE_TABLE_ID

# get full score to save into bucket
# Use the full fitted model. `ntree_limit=xgb_model.best_iteration` passed a
# 0-based iteration index as a tree count and is deprecated/removed in
# recent XGBoost releases.
pred_prob = xgb_model.predict_proba(X_val)[:, 1]
result = pd.DataFrame(columns=['ban', 'subscriber_no', 'score_date', 'y_true', 'y_pred'])

result['ban'] = list(ban_val['ban'])
result['ban'] = result['ban'].astype('str')

# Fixed: this column was previously filled from ban_val['ban'].
result['subscriber_no'] = list(ban_val['subscriber_no'])
result['subscriber_no'] = result['subscriber_no'].astype('str')

# NOTE(review): the score date is hard-coded; confirm it should not be
# derived from valScoringDate.
result['score_date'] = "2023-11-04"

result['y_true'] = list(y_val)
result['y_true'] = result['y_true'].fillna(0.0).astype('float64')

result['y_pred'] = list(pred_prob)
result['y_pred'] = result['y_pred'].fillna(0.0).astype('float64')

############# updated up to here ############

result.to_csv('gs://{}/{}/ucar/{}_prediction_v2.csv'.format(file_bucket, service_type, service_type), index=False)

# define dtype_bq_mapping: pandas dtype -> BigQuery column type
dtype_bq_mapping = {
    np.dtype('int64'): 'INTEGER',
    np.dtype('float64'): 'FLOAT',
    np.dtype('float32'): 'FLOAT',
    np.dtype('object'): 'STRING',
    np.dtype('bool'): 'BOOLEAN',
    np.dtype('datetime64[ns]'): 'DATE',
    pd.Int64Dtype(): 'INTEGER',
}

# export df_final to bigquery: one nullable schema field per result column
schema_list = [
    bigquery.SchemaField(column, dtype_bq_mapping[result.dtypes[column]], mode='NULLABLE')
    for column in result.columns
]
print(schema_list)

dest_table = f'{project_id}.{dataset_id}.{score_table_id}'

# Sending to bigquery (WRITE_TRUNCATE replaces the table's existing rows)
client = get_gcp_bqclient(project_id)
job_config = bigquery.LoadJobConfig(schema=schema_list, write_disposition='WRITE_TRUNCATE')
job = client.load_table_from_dataframe(result, dest_table, job_config=job_config)
job.result()  # block until the load job finishes

table = client.get_table(dest_table) # Make an API request
# Fixed: the message used the undefined name `table_id` (NameError).
print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), dest_table))

time.sleep(60)

# table_ref = f'{project_id}.{dataset_id}.{score_table}'
# client = bigquery.Client(project=project_id)
# table = client.get_table(table_ref)
# schema = table.schema

# ll = []
# for item in schema:
#     col = item.name
#     d_type = item.field_type
#     if 'float' in str(d_type).lower():
#         d_type = 'FLOAT64'
#     ll.append((col, d_type))

#     if 'integer' in str(d_type).lower():
#         result[col] = result[col].fillna(0).astype(int)
#     if 'float' in str(d_type).lower():
#         result[col] = result[col].fillna(0.0).astype(float)
#     if 'string' in str(d_type).lower():
#         result[col] = result[col].fillna('').astype(str)

# table_ref = '{}.{}.{}'.format(project_id, dataset_id, temp_table)
# client = bigquery.Client(project=project_id)
# if if_tbl_exists(client, table_ref):
#     client.delete_table(table_ref)

# client.create_table(table_ref)
# config = bigquery.LoadJobConfig(schema=schema)
# config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
# bq_table_instance = client.load_table_from_dataframe(result, table_ref, job_config=config)
# time.sleep(5)

# drop_sql = f"""delete from `{project_id}.{dataset_id}.{score_table}` where score_date = '{score_date_dash}'"""  # .format(project_id, dataset_id, score_date_dash)
# client.query(drop_sql)
# #
# load_sql = f"""insert into `{project_id}.{dataset_id}.{score_table}`
#               select * from `{project_id}.{dataset_id}.{temp_table}`"""
# client.query(load_sql)


In [None]:
# NOTE(review): `error` is not defined in the visible notebook, so running
# this cell raises NameError; presumably a deliberate stop so that
# "Run All" halts before the model is saved. Confirm.
error

### save the model in gcs

In [None]:
# save the model in GCS
from datetime import datetime

# Bundle the fitted model with its feature list and a creation timestamp.
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
models_dict = {
    'create_time': create_time,
    'model': xgb_model,
    'features': features,
}

# Serialise the bundle to a local file; the `with` block closes the handle.
with open('model_dict.pkl', 'wb') as handle:
    pickle.dump(models_dict, handle)

storage_client = storage.Client()
bucket = storage_client.get_bucket(file_bucket)

# Create an empty placeholder object for the model "folder" if it is missing.
MODEL_PATH = '{}_xgb_models/'.format(service_type)
blob = bucket.blob(MODEL_PATH)
folder_exists = blob.exists(storage_client)
if not folder_exists:
    blob.upload_from_string('')

# Upload the pickle under a name that ends with the creation timestamp.
model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
blob = bucket.blob(model_name_onbkt)
blob.upload_from_filename('model_dict.pkl')

print(f"....model loaded to GCS done at {str(create_time)}")

time.sleep(300)

### load the latest saved xgb_model to the environment

In [None]:
# MODEL_PATH = '{}_xgb_models/'.format(service_type)
# df_score = pd.read_csv('gs://{}/{}_score.csv.gz'.format(file_bucket, service_type), compression='gzip')
# df_score.dropna(subset=['ban'], inplace=True)
# df_score.reset_index(drop=True, inplace=True)
# print('......scoring data loaded:{}'.format(df_score.shape))
# time.sleep(10)
from google.cloud import bigquery
from google.cloud import storage

# Bucket "folder" that holds the pickled model bundles for this service type.
MODEL_PATH = '{}_xgb_models/'.format(service_type)

def load_model(file_bucket: str, service_type: str):
    """Load the most recently named model bundle for `service_type` from GCS.

    Bundles are looked up under
    `<service_type>_xgb_models/<service_type>_models_xgb_` and the one whose
    object name sorts last is used. Saved names end with a
    `%Y-%m-%d %H:%M:%S` timestamp, so the last name is the newest bundle.

    Args:
        file_bucket: Name of the GCS bucket that holds the bundles.
        service_type: Service identifier used to build the object prefix.

    Returns:
        Tuple `(xgb_model, features)` taken from the `'model'` and
        `'features'` entries of the unpickled bundle.

    Raises:
        FileNotFoundError: If no object matches the prefix.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)

    # Build the prefix from the `service_type` argument. The previous code
    # mixed in the module-level MODEL_PATH, so the argument was only
    # partly honoured.
    prefix = '{0}_xgb_models/{0}_models_xgb_'.format(service_type)
    model_lists = sorted(
        blob.name for blob in storage_client.list_blobs(file_bucket, prefix=prefix)
    )
    if not model_lists:
        raise FileNotFoundError(
            'no model found in bucket {!r} with prefix {!r}'.format(file_bucket, prefix)
        )

    blob = bucket.blob(model_lists[-1])
    blob_in = blob.download_as_string()
    # NOTE: unpickling executes code embedded in the payload; only load
    # bundles from a bucket whose writers are trusted.
    model_dict = pickle.loads(blob_in)
    xgb_model = model_dict['model']
    features = model_dict['features']
    print('...... model loaded')
    time.sleep(10)

    return xgb_model, features

# Replace the in-memory model and feature list with the latest saved bundle.
xgb_model, features = load_model(
    file_bucket=FILE_BUCKET,
    service_type=SERVICE_TYPE,
)

### backup code

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    """Create a BigQuery client for `project_id`.

    Args:
        project_id: Project the client is bound to.
        use_local_credential: When True (the default), authenticate with the
            access token printed by `gcloud auth print-access-token`. When
            False, let `bigquery.Client` resolve credentials on its own.

    Returns:
        A `bigquery.Client` instance.
    """
    if not use_local_credential:
        return bigquery.Client(project=project_id)

    # Only shell out for a token when it is actually needed, and close the
    # pipe once the output has been read.
    with os.popen('gcloud auth print-access-token') as pipe:
        token = pipe.read()
    token = re.sub(r'\n$', '', token)  # drop the trailing newline from the CLI output
    credentials = google.oauth2.credentials.Credentials(token)

    # Build exactly one client. The previous version first constructed a
    # default-credential client and then threw it away.
    return bigquery.Client(project=project_id, credentials=credentials)

client = get_gcp_bqclient(project_id)

#instantiate df_target_train and df_target_test
sql_train = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q3` '''.format(project_id, dataset_id)
df_target_train = client.query(sql_train).to_dataframe()

# Keep the 2022-Q3 rows, cast `ban` to int64, and keep the last row per ban.
df_target_train = (
    df_target_train.loc[df_target_train['YEAR_MONTH'] == "2022-Q3"] #'-'.join(score_date_dash.split('-')[:2])]  # score_date_dash = '2022-08-31'
    .assign(ban=lambda frame: frame['ban'].astype('int64'))
    .groupby('ban')
    .tail(1)
)

#set up df_train and df_test (add 'target')
# Left-join the target on `ban`, then discard rows that received no target.
df_train = (
    df_train.merge(df_target_train[['ban', 'product_crosssell_ind']], on='ban', how='left')
    .rename(columns={'product_crosssell_ind': 'target'})
    .dropna(subset=['target'])
)
df_train['target'] = df_train['target'].astype(int)

df_train

In [None]:
# Sum of the integer 'target' column in df_train (presumably the number of
# positive labels; confirm the column is 0/1).
sum(df_train['target'])

In [None]:
def get_gcp_bqclient(project_id, use_local_credential=True):
    """Create a BigQuery client for `project_id`.

    Args:
        project_id: Project the client is bound to.
        use_local_credential: When True (the default), authenticate with the
            access token printed by `gcloud auth print-access-token`. When
            False, let `bigquery.Client` resolve credentials on its own.

    Returns:
        A `bigquery.Client` instance.
    """
    if not use_local_credential:
        return bigquery.Client(project=project_id)

    # Only shell out for a token when it is actually needed, and close the
    # pipe once the output has been read.
    with os.popen('gcloud auth print-access-token') as pipe:
        token = pipe.read()
    token = re.sub(r'\n$', '', token)  # drop the trailing newline from the CLI output
    credentials = google.oauth2.credentials.Credentials(token)

    # Build exactly one client. The previous version first constructed a
    # default-credential client and then threw it away.
    return bigquery.Client(project=project_id, credentials=credentials)

client = get_gcp_bqclient(project_id)
sql_test = ''' SELECT * FROM `{}.{}.bq_tos_cross_sell_targets_q4` '''.format(project_id, dataset_id)
df_target_test = client.query(sql_test).to_dataframe()

# Keep the 2022-Q4 rows, cast `ban` to int64, and keep the last row per ban.
df_target_test = (
    df_target_test.loc[df_target_test['YEAR_MONTH'] == "2022-Q4"] #'-'.join(score_date_val_dash.split('-')[:2])]  # score_date_dash = '2022-09-30'
    .assign(ban=lambda frame: frame['ban'].astype('int64'))
    .groupby('ban')
    .tail(1)
)

#set up df_train and df_test (add 'target')
# Left-join the target on `ban`, then discard rows that received no target.
df_test = (
    df_test.merge(df_target_test[['ban', 'product_crosssell_ind']], on='ban', how='left')
    .rename(columns={'product_crosssell_ind': 'target'})
    .dropna(subset=['target'])
)
df_test['target'] = df_test['target'].astype(int)

df_test

In [None]:
# Sum of the integer 'target' column in df_test (presumably the number of
# positive labels; confirm the column is 0/1).
sum(df_test['target'])