## Predict callers to retention

### read csv

In [1]:
#import libraries
import os
import re
import time
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

file_bucket = 'divg-josh-pr-d1cc3a-default' 
folder_name = 'promo_expiry_analysis'

df = pd.read_csv('gs://{}/{}/data_final.csv'.format(file_bucket, folder_name))

### preprocess

- Tenure Group: cat
- PROV: cat
- Pcount: cat
- Price Plan Grouping: cat
- Technology Group: cat
- demographics: cat
- CampaignFlag: cat
- price_sensitivity: cat
- TOTALCalls: remove

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112399 entries, 0 to 112398
Data columns (total 46 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   BAN                               112399 non-null  int64  
 1   Tenure Group                      112399 non-null  object 
 2   PROV                              112399 non-null  object 
 3   Pcount                            112399 non-null  object 
 4   Product Count                     112399 non-null  int64  
 5   productMix_product_mix_all        112399 non-null  int64  
 6   productMix_hsic_count             112399 non-null  int64  
 7   productMix_sing_count             112399 non-null  int64  
 8   productMix_ttv_count              112399 non-null  int64  
 9   productMix_shs_count              112399 non-null  int64  
 10  Price Plan Grouping               112399 non-null  object 
 11  Technology Group                  112399 non-null  o

In [3]:
df.columns

Index(['BAN', 'Tenure Group', 'PROV', 'Pcount', 'Product Count',
       'productMix_product_mix_all', 'productMix_hsic_count',
       'productMix_sing_count', 'productMix_ttv_count', 'productMix_shs_count',
       'Price Plan Grouping', 'Technology Group', 'TOTAL_CHARGE',
       'HSIA_CHARGE', 'HP_CHARGE', 'TV_CHARGE', 'tot_disc_amt',
       'hsic_disc_amt', 'sing_disc_amt', 'ttv_disc_amt',
       'TOTAL_CHARGE_NO_DISC', 'HSIC_CHARGE_NO_DISC', 'SING_CHARGE_NO_DISC',
       'TTV_CHARGE_NO_DISC', 'total_disc_pct', 'hsic_disc_pct',
       'sing_disc_pct', 'ttv_disc_pct', 'demographics_demo_avg_income',
       'callCentre_avg_talk_time_30_days', 'hsiaUsage_hs_tot_gb_average',
       'infra_num_srvc_typ_copper_sum', 'infra_num_srvc_typ_gpon_sum',
       'demographics', 'CampaignFlag', 'clckstrmData_wln_tot_cnt_r90d',
       'clckstrmData_deal_tot_cnt_r90d', 'clckstrmData_upgr_tot_cnt_r90d',
       'clckstrmData_chg_tot_cnt_r90d', 'clckstrmData_move_tot_cnt_r90d',
       'clckstrmData_cancel

In [4]:
cols_to_dummy = ['Tenure Group', 'PROV', 'Pcount', 'Price Plan Grouping', 'Technology Group', 'demographics', 'CampaignFlag', 'price_sensitivity']

for col in cols_to_dummy: 

    # Create dummy variables for the Country column
    df = pd.get_dummies(df, columns=[col], drop_first=True, prefix=None, dtype="int64")

# reorder the df columns so that 'target' comes last
df_processed = df[[c for c in df if c not in ['target']] 
       + ['target']]

df_processed.columns = df_processed.columns.str.replace('<', 'less_than_')
df_processed.columns = df_processed.columns.str.replace(' ', '_')

df_processed.head()

Unnamed: 0,BAN,Product_Count,productMix_product_mix_all,productMix_hsic_count,productMix_sing_count,productMix_ttv_count,productMix_shs_count,TOTAL_CHARGE,HSIA_CHARGE,HP_CHARGE,...,demographics_rural_family,demographics_unassigned,demographics_urban,demographics_urban_family,demographics_urban_young,CampaignFlag_Reached,CampaignFlag_Targeted,price_sensitivity_Not_Sensitive,price_sensitivity_Very_Sensitive,target
0,126956000,2,2,1,1,0,0,50.28,43.21,7.07,...,1,0,0,0,0,0,0,1,0,1
1,126991928,3,3,1,1,1,0,153.3,63.21,12.14,...,1,0,0,0,0,1,0,1,0,1
2,200201191,3,3,1,1,1,0,114.09,51.67,18.7,...,0,0,0,1,0,0,1,1,0,1
3,200225413,3,3,1,1,1,0,170.1,70.48,14.14,...,0,0,0,0,0,0,1,1,0,1
4,200232415,3,3,1,1,1,0,110.8,55.0,15.8,...,0,0,0,1,0,0,1,1,0,1


In [5]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112399 entries, 0 to 112398
Data columns (total 58 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   BAN                                             112399 non-null  int64  
 1   Product_Count                                   112399 non-null  int64  
 2   productMix_product_mix_all                      112399 non-null  int64  
 3   productMix_hsic_count                           112399 non-null  int64  
 4   productMix_sing_count                           112399 non-null  int64  
 5   productMix_ttv_count                            112399 non-null  int64  
 6   productMix_shs_count                            112399 non-null  int64  
 7   TOTAL_CHARGE                                    112399 non-null  float64
 8   HSIA_CHARGE                                     112399 non-null  float64
 9   HP_CHARGE                 

### register lift function 

In [6]:
def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'CallToRetention'])
    result['Prob'] = prob
    result['CallToRetention'] = y_test
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['CallToRetention'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_call_rate']
    add2 = pd.DataFrame(result.groupby('Decile')['CallToRetention'].count()).reset_index()
    add2.columns = ['Decile', 'ban_count']
    result = result.merge(add, on='Decile', how='left')
    result = result.merge(add2, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_call_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_call_rate_total'] = result['CallToRetention'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg = lg.merge(add2, on='Decile', how='left')
    lg['lift'] = lg['avg_real_call_rate'] / lg['avg_call_rate_total']

    return lg

### set X_train, X_test, y_train, y_test

In [7]:
features = [col for col in df_processed.columns if col not in ["BAN", "target"]]

# X, y = df_processed[[col for col in df_processed.columns if col != "target"]], df_processed["target"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True, stratify=df_train['target'])

#train test split
df_train, df_val = train_test_split(df_processed, shuffle=True, test_size=0.2, stratify=df_processed['target'])

ban_train = df_train['BAN']
X_train = df_train[features]
y_train = np.squeeze(df_train['target'].values)

ban_test = df_val['BAN']
X_test = df_val[features]
y_test = np.squeeze(df_val['target'].values)


In [8]:
y_train.shape

(89919,)

### set up xgb and train the model

In [9]:
# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.02,
    n_estimators=80,
    max_depth=8,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

xgb_model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_test, y_test)], 
            early_stopping_rounds=20) 
# xgb_model.fit(X_train, y_train)
print('xgb training done')




[0]	validation_0-logloss:0.69098	validation_1-logloss:0.69127
[1]	validation_0-logloss:0.68921	validation_1-logloss:0.68973
[2]	validation_0-logloss:0.68717	validation_1-logloss:0.68798
[3]	validation_0-logloss:0.68521	validation_1-logloss:0.68626
[4]	validation_0-logloss:0.68354	validation_1-logloss:0.68482
[5]	validation_0-logloss:0.68170	validation_1-logloss:0.68324
[6]	validation_0-logloss:0.67994	validation_1-logloss:0.68176
[7]	validation_0-logloss:0.67819	validation_1-logloss:0.68026
[8]	validation_0-logloss:0.67653	validation_1-logloss:0.67884
[9]	validation_0-logloss:0.67490	validation_1-logloss:0.67747
[10]	validation_0-logloss:0.67351	validation_1-logloss:0.67634
[11]	validation_0-logloss:0.67205	validation_1-logloss:0.67509
[12]	validation_0-logloss:0.67063	validation_1-logloss:0.67393
[13]	validation_0-logloss:0.66916	validation_1-logloss:0.67269
[14]	validation_0-logloss:0.66774	validation_1-logloss:0.67151
[15]	validation_0-logloss:0.66636	validation_1-logloss:0.67039
[1

### xgb hyperparameter tuning 

In [10]:
# import xgboost as xgb 
# import pandas as pd 

# # Create your housing DMatrix: housing_dmatrix
# class_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# # Create the parameter dictionary for each tree (boosting round)
# params = {"objective":"binary:logistic", "max_depth":3}
# # Create list of eta values and empty list to store final round rmse per xgboost model
# eta_vals = [0.001, 0.005, 0.01, 0.02, 0.05]
# max_depths = [5, 8, 10, 12, 15]
# colsample_bytree_vals = [0.7, 0.8, 0.9]
# n_estimators = [25, 50, 80, 100, 150, 200, 300]

# params_col = []
# best_accuracy = []


# # Systematically vary the eta 
# for curr_val in eta_vals:
    
#     for curr_depth in max_depths: 
        
#         for curr_colval in colsample_bytree_vals:
            
#             for curr_estimators in n_estimators: 

#                 params["eta"] = curr_val
#                 params["max_depth"] = curr_depth
#                 params["colsample_bytree"] = curr_colval
#                 params["n_estimators"] = curr_estimators

#                 # Perform cross-validation: cv_results
#                 cv_results = xgb.cv(dtrain=class_dmatrix, params=params, nfold=3, metrics="error", as_pandas=True, early_stopping_rounds=5)

#                 # Append the final round rmse to best_rmse
#                 params_col.append([curr_val, curr_depth, curr_colval, curr_estimators])
#                 best_accuracy.append(1-(cv_results["test-error-mean"]).iloc[-1])

# # Print the resultant DataFrame
# df_result = pd.DataFrame(list(zip(params_col, best_accuracy)), columns=["params", "best_accuracy"])

# objs = [df_result, pd.DataFrame(df_result['params'].tolist())]
# df_result = pd.concat(objs, axis=1).drop('params', axis=1)
# df_result.rename(columns = {0 : 'eta', 1 : 'max_depth', 2: 'colsample_bytree', 3: 'n_estimators'}, inplace = True)
# df_result = df_result[['eta', 'max_depth', 'colsample_bytree', 'best_accuracy', 'n_estimators']]

# print(df_result)


### make predictions on X_train and get lift

In [11]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_train, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.004776,0.471814,0.843083,8992,1.786898
1,2,0.004166,0.471814,0.710854,8992,1.506642
2,3,0.003776,0.471814,0.609542,8992,1.291913
3,4,0.003474,0.471814,0.542149,8992,1.149074
4,5,0.003251,0.471814,0.471694,8991,0.999746
5,6,0.003043,0.471814,0.424647,8991,0.900031
6,7,0.002837,0.471814,0.382075,8993,0.809801
7,8,0.002621,0.471814,0.31096,8969,0.659074
8,9,0.002399,0.471814,0.267554,9015,0.567076
9,10,0.002016,0.471814,0.155694,8992,0.32999


### make predictions on X_test and get lift

In [21]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_test, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg




Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.009543,0.471797,0.77847,2248,1.650009
1,2,0.008323,0.471797,0.660587,2248,1.400151
2,3,0.007553,0.471797,0.572509,2248,1.213464
3,4,0.006949,0.471797,0.541815,2248,1.148407
4,5,0.006502,0.471797,0.489324,2248,1.037149
5,6,0.006089,0.471797,0.43105,2248,0.913634
6,7,0.005676,0.471797,0.389235,2248,0.825005
7,8,0.005235,0.471797,0.342972,2248,0.726947
8,9,0.004799,0.471797,0.298932,2248,0.633604
9,10,0.004063,0.471797,0.213078,2248,0.451631


### export df_test_exp and lift scores to gcs bucket

In [22]:
df_test_exp.to_csv('gs://{}/downloads/df_test_exp.csv'.format(file_bucket, index=True))
print("....df_test_exp done")

# lg.to_csv('gs://{}/lift_on_scoring_data.csv'.format(file_bucket, index=False))
# print("....lift_to_csv done")

....df_test_exp done


### get feature importances from xgboost model

In [14]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]
importances = np.array(importances)[sorted_index]

for idx, item in enumerate(labels): 
    print(labels[idx], importances[idx])
    if idx == 25: 
        break
        

frequency 0.1981541
HSIA_CHARGE 0.06286183
ttv_disc_pct 0.03807802
recency 0.027450202
total_disc_pct 0.024580061
price_sensitivity_Not_Sensitive 0.024358682
Product_Count 0.024094315
Price_Plan_Grouping_Internet/Optik_Under_250MB 0.023431791
productMix_sing_count 0.022105964
price_sensitivity_Very_Sensitive 0.021500459
Tenure_Group_less_than_2Y 0.01894444
productMix_hsic_count 0.018314183
hsic_disc_pct 0.016915374
HP_CHARGE 0.01613059
sing_disc_pct 0.015927522
TV_CHARGE 0.015723959
Price_Plan_Grouping_Internet/Optik_250-1GB 0.015394906
clckstrmData_wln_tot_cnt_r90d 0.014795081
productMix_ttv_count 0.014465891
HSIC_CHARGE_NO_DISC 0.01439913
CampaignFlag_Targeted 0.013902354
demographics_rural_family 0.013863077
tot_disc_amt 0.013860954
SING_CHARGE_NO_DISC 0.013806397
TOTAL_CHARGE 0.0134863835
hsic_disc_amt 0.012776533


### set up rfc and train the model

In [15]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Create the random forest model and fit to the training data
rfc_model = RandomForestClassifier(n_estimators=75, max_features=25, max_depth=8)

rfc_model.fit(X_train, y_train)


### make predictions on X_train and get lift

In [16]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_train)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg

Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.005058,0.471814,0.807273,8992,1.711
1,2,0.004319,0.471814,0.678381,8992,1.437815
2,3,0.003837,0.471814,0.596753,8992,1.264806
3,4,0.003468,0.471814,0.522576,8992,1.107589
4,5,0.003197,0.471814,0.482149,8991,1.021905
5,6,0.002945,0.471814,0.439279,8992,0.931044
6,7,0.002703,0.471814,0.387456,8992,0.821205
7,8,0.002453,0.471814,0.329738,8992,0.698873
8,9,0.002184,0.471814,0.28859,8992,0.611661
9,10,0.001727,0.471814,0.185943,8992,0.394103


### make predictions on X_test and get lift

In [17]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_test)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg

Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.010119,0.471797,0.766904,2248,1.625495
1,2,0.008634,0.471797,0.649021,2248,1.375636
2,3,0.007687,0.471797,0.582295,2248,1.234207
3,4,0.006937,0.471797,0.531139,2248,1.125778
4,5,0.006391,0.471797,0.490214,2248,1.039035
5,6,0.005886,0.471797,0.43016,2248,0.911748
6,7,0.005397,0.471797,0.375,2248,0.794833
7,8,0.004888,0.471797,0.351868,2248,0.745804
8,9,0.004361,0.471797,0.311833,2248,0.660947
9,10,0.003485,0.471797,0.229537,2248,0.486517
