## Predict callers to retention

### read csv

In [1]:
#import libraries
import os
import re
import time
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

file_bucket = 'divg-josh-pr-d1cc3a-default' 
folder_name = 'promo_expiry_analysis'

df = pd.read_csv('gs://{}/{}/data_final.csv'.format(file_bucket, folder_name))

### preprocess

- Tenure Group: cat
- PROV: cat
- Pcount: cat
- Price Plan Grouping: cat
- Technology Group: cat
- demographics: cat
- CampaignFlag: cat
- price_sensitivity: cat
- TOTALCalls: remove

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112364 entries, 0 to 112363
Data columns (total 46 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   BAN                               112364 non-null  int64  
 1   Tenure Group                      112364 non-null  object 
 2   PROV                              112364 non-null  object 
 3   Pcount                            112364 non-null  object 
 4   Product Count                     112364 non-null  int64  
 5   productMix_product_mix_all        112364 non-null  int64  
 6   productMix_hsic_count             112364 non-null  int64  
 7   productMix_sing_count             112364 non-null  int64  
 8   productMix_ttv_count              112364 non-null  int64  
 9   productMix_shs_count              112364 non-null  int64  
 10  Price Plan Grouping               112364 non-null  object 
 11  Technology Group                  112364 non-null  o

In [3]:
df.columns

Index(['BAN', 'Tenure Group', 'PROV', 'Pcount', 'Product Count',
       'productMix_product_mix_all', 'productMix_hsic_count',
       'productMix_sing_count', 'productMix_ttv_count', 'productMix_shs_count',
       'Price Plan Grouping', 'Technology Group', 'TOTAL_CHARGE',
       'HSIA_CHARGE', 'HP_CHARGE', 'TV_CHARGE', 'tot_disc_amt',
       'hsic_disc_amt', 'sing_disc_amt', 'ttv_disc_amt',
       'TOTAL_CHARGE_NO_DISC', 'HSIC_CHARGE_NO_DISC', 'SING_CHARGE_NO_DISC',
       'TTV_CHARGE_NO_DISC', 'total_disc_pct', 'hsic_disc_pct',
       'sing_disc_pct', 'ttv_disc_pct', 'demographics_demo_avg_income',
       'callCentre_avg_talk_time_30_days', 'infra_num_srvc_typ_copper_sum',
       'infra_num_srvc_typ_gpon_sum', 'demographics', 'CampaignFlag',
       'hsiaUsage_hs_tot_gb_avg', 'clckstrmData_wln_tot_cnt_r30d',
       'clckstrmData_deal_tot_cnt_r30d', 'clckstrmData_chg_tot_cnt_r30d',
       'clckstrmData_supp_tot_cnt_r30d', 'clckstrmData_deal_tot_cnt_r90d',
       'clckstrmData_upgr_tot_c

In [4]:
cols_to_dummy = ['Tenure Group', 'PROV', 'Pcount', 'Price Plan Grouping', 'Technology Group', 'demographics', 'CampaignFlag', 'price_sensitivity']

for col in cols_to_dummy: 

    # Create dummy variables for the Country column
    df = pd.get_dummies(df, columns=[col], drop_first=True, prefix=None, dtype="int64")

# reorder the df columns so that 'target' comes last
df_processed = df[[c for c in df if c not in ['target']] 
       + ['target']]

df_processed.columns = df_processed.columns.str.replace('<', 'less_than_')
df_processed.columns = df_processed.columns.str.replace(' ', '_')

df_processed.head()

Unnamed: 0,BAN,Product_Count,productMix_product_mix_all,productMix_hsic_count,productMix_sing_count,productMix_ttv_count,productMix_shs_count,TOTAL_CHARGE,HSIA_CHARGE,HP_CHARGE,...,demographics_rural_family,demographics_unassigned,demographics_urban,demographics_urban_family,demographics_urban_young,CampaignFlag_Reached,CampaignFlag_Targeted,price_sensitivity_Not_Sensitive,price_sensitivity_Very_Sensitive,target
0,126956000,2,2,1,1,0,0,50.28,43.21,7.07,...,1,0,0,0,0,0,0,1,0,1
1,126991928,3,3,1,1,1,0,153.3,63.21,12.14,...,1,0,0,0,0,1,0,1,0,1
2,200201191,3,3,1,1,1,0,114.09,51.67,18.7,...,0,0,0,1,0,0,1,1,0,1
3,200225413,3,3,1,1,1,0,170.1,70.48,14.14,...,0,0,0,0,0,0,1,1,0,1
4,200232415,3,3,1,1,1,0,110.8,55.0,15.8,...,0,0,0,1,0,0,1,1,0,1


In [5]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112364 entries, 0 to 112363
Data columns (total 58 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   BAN                                             112364 non-null  int64  
 1   Product_Count                                   112364 non-null  int64  
 2   productMix_product_mix_all                      112364 non-null  int64  
 3   productMix_hsic_count                           112364 non-null  int64  
 4   productMix_sing_count                           112364 non-null  int64  
 5   productMix_ttv_count                            112364 non-null  int64  
 6   productMix_shs_count                            112364 non-null  int64  
 7   TOTAL_CHARGE                                    112364 non-null  float64
 8   HSIA_CHARGE                                     112364 non-null  float64
 9   HP_CHARGE                 

### register lift function 

In [6]:
def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'CallToRetention'])
    result['Prob'] = prob
    result['CallToRetention'] = y_test
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['CallToRetention'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_call_rate']
    add2 = pd.DataFrame(result.groupby('Decile')['CallToRetention'].count()).reset_index()
    add2.columns = ['Decile', 'ban_count']
    result = result.merge(add, on='Decile', how='left')
    result = result.merge(add2, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_call_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_call_rate_total'] = result['CallToRetention'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg = lg.merge(add2, on='Decile', how='left')
    lg['lift'] = lg['avg_real_call_rate'] / lg['avg_call_rate_total']

    return lg

### set X_train, X_test, y_train, y_test

In [8]:
features = [col for col in df_processed.columns if col not in ["BAN", "target"]]

X, y = df_processed[[col for col in df_processed.columns if col != "target"]], df_processed["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=None, shuffle=True)

ban_train = X_train['BAN']
X_train = X_train[features]
y_train = np.squeeze(y_train.values)

ban_test = X_test['BAN']
X_test = X_test[features]
y_test = np.squeeze(y_test.values)


### set up xgb and train the model

In [9]:
# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=80,
    max_depth=8,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1
    # seed=27
)

xgb_model.fit(X_train, y_train)
print('xgb training done')


xgb training done


### xgb hyperparameter tuning 

In [10]:
# import xgboost as xgb 
# import pandas as pd 

# # Create your housing DMatrix: housing_dmatrix
# class_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# # Create the parameter dictionary for each tree (boosting round)
# params = {"objective":"binary:logistic", "max_depth":3}
# # Create list of eta values and empty list to store final round rmse per xgboost model
# eta_vals = [0.001, 0.005, 0.01, 0.02, 0.05]
# max_depths = [5, 8, 10, 12, 15]
# colsample_bytree_vals = [0.7, 0.8, 0.9]
# n_estimators = [25, 50, 80, 100, 150, 200, 300]

# params_col = []
# best_accuracy = []


# # Systematically vary the eta 
# for curr_val in eta_vals:
    
#     for curr_depth in max_depths: 
        
#         for curr_colval in colsample_bytree_vals:
            
#             for curr_estimators in n_estimators: 

#                 params["eta"] = curr_val
#                 params["max_depth"] = curr_depth
#                 params["colsample_bytree"] = curr_colval
#                 params["n_estimators"] = curr_estimators

#                 # Perform cross-validation: cv_results
#                 cv_results = xgb.cv(dtrain=class_dmatrix, params=params, nfold=3, metrics="error", as_pandas=True, early_stopping_rounds=5)

#                 # Append the final round rmse to best_rmse
#                 params_col.append([curr_val, curr_depth, curr_colval, curr_estimators])
#                 best_accuracy.append(1-(cv_results["test-error-mean"]).iloc[-1])

# # Print the resultant DataFrame
# df_result = pd.DataFrame(list(zip(params_col, best_accuracy)), columns=["params", "best_accuracy"])

# objs = [df_result, pd.DataFrame(df_result['params'].tolist())]
# df_result = pd.concat(objs, axis=1).drop('params', axis=1)
# df_result.rename(columns = {0 : 'eta', 1 : 'max_depth', 2: 'colsample_bytree', 3: 'n_estimators'}, inplace = True)
# df_result = df_result[['eta', 'max_depth', 'colsample_bytree', 'best_accuracy', 'n_estimators']]

# print(df_result)


### make predictions on X_train and get lift

In [11]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_train, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.003958,0.471556,0.798082,10113,1.692445
1,2,0.003657,0.471556,0.674478,10113,1.430326
2,3,0.003447,0.471556,0.600237,10111,1.272888
3,4,0.003268,0.471556,0.539846,10114,1.144819
4,5,0.003131,0.471556,0.466475,10112,0.989227
5,6,0.003003,0.471556,0.430535,10113,0.91301
6,7,0.00287,0.471556,0.391279,10113,0.829761
7,8,0.002735,0.471556,0.339498,10112,0.719953
8,9,0.002613,0.471556,0.284484,10106,0.603289
9,10,0.002425,0.471556,0.190711,10120,0.404431


### make predictions on X_test and get lift

In [12]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_test, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.011824,0.47397,0.741103,1124,1.563608
1,2,0.010952,0.47397,0.642349,1124,1.355252
2,3,0.010336,0.47397,0.599288,1123,1.2644
3,4,0.009813,0.47397,0.516904,1124,1.090584
4,5,0.009415,0.47397,0.47195,1123,0.995739
5,6,0.009035,0.47397,0.435053,1124,0.917892
6,7,0.008614,0.47397,0.418149,1124,0.882228
7,8,0.00821,0.47397,0.372217,1123,0.785318
8,9,0.007853,0.47397,0.30427,1124,0.641962
9,10,0.007287,0.47397,0.238434,1124,0.503058


### export df_test_exp and lift scores to gcs bucket

In [None]:
df_test_exp.to_csv('gs://{}/downloads/df_test_exp.csv'.format(file_bucket, index=True))
print("....df_test_exp done")

# lg.to_csv('gs://{}/lift_on_scoring_data.csv'.format(file_bucket, index=False))
# print("....lift_to_csv done")

### get feature importances from xgboost model

In [None]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]
importances = np.array(importances)[sorted_index]

for idx, item in enumerate(labels): 
    print(labels[idx], importances[idx])
    if idx == 25: 
        break

### set up rfc and train the model

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Create the random forest model and fit to the training data
rfc_model = RandomForestClassifier(n_estimators=75, max_features=25, max_depth=8)

rfc_model.fit(X_train, y_train)


### make predictions on X_train and get lift

In [None]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_train)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg

### make predictions on X_test and get lift

In [None]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_test)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg