## Predict callers to retention

### read csv

In [1]:
#import libraries
import os
import re
import time
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

file_bucket = 'divg-josh-pr-d1cc3a-default' 
folder_name = 'promo_expiry_analysis'

df = pd.read_csv('gs://{}/{}/data_final.csv'.format(file_bucket, folder_name))
df_score = pd.read_csv('gs://{}/{}/data_score.csv'.format(file_bucket, folder_name))

cols_1 = df.columns.values
cols_2 = df_score.columns.values
cols = set(cols_1).intersection(set(cols_2))

cols_to_preserve = [f for f in cols if f not in ['target']]
cols_to_preserve_df = [f for f in cols] 
cols_to_preserve_df.append('target')

df = df[cols_to_preserve_df]
df_score = df_score[cols_to_preserve]

df_score.head()

Unnamed: 0,productMix_ttv_count,hsiaUsage_hs_tot_gb_average,clckstrmData_cancel_tot_cnt_r90d,ttv_disc_pct,productMix_shs_count,total_disc_pct,infra_num_srvc_typ_gpon_sum,productMix_product_mix_all,hsic_disc_amt,TOTAL_CHARGE,...,Pcount,Price Plan Grouping,productMix_hsic_count,recency,hsic_disc_pct,Technology Group,Tenure Group,ttv_disc_amt,clckstrmData_wln_tot_cnt_r90d,frequency
0,1,72,0,0.0,0,0.0,1,4,0.0,137.21,...,3P,Internet/Optik Under 250MB,1,699,0.0,Fibre,7+Y,0.0,0,1
1,1,49,0,0.433207,0,0.339967,0,3,-35.0,197.543333,...,3P,Internet/Optik Under 250MB,1,0,0.333333,Copper,7+Y,-66.75,0,0
2,1,148,0,0.0,0,0.0,1,4,0.0,115.47,...,3P,Internet/Optik Under 250MB,1,69,0.0,Fibre,7+Y,0.0,29,2
3,1,76,0,0.0,0,0.0,1,4,0.0,137.145,...,3P,Internet/Optik Under 250MB,1,609,0.0,Fibre,7+Y,0.0,0,2
4,1,105,0,0.443255,0,0.429778,0,2,-50.0,135.0,...,2P,Internet/Optik Under 250MB,1,0,0.416667,Copper,7+Y,-51.75,8,0


### preprocess

- Tenure Group: cat
- PROV: cat
- Pcount: cat
- Price Plan Grouping: cat
- Technology Group: cat
- demographics: cat
- CampaignFlag: cat
- price_sensitivity: cat
- TOTALCalls: remove

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112399 entries, 0 to 112398
Data columns (total 41 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   productMix_ttv_count              112399 non-null  int64  
 1   hsiaUsage_hs_tot_gb_average       112399 non-null  int64  
 2   clckstrmData_cancel_tot_cnt_r90d  112399 non-null  int64  
 3   ttv_disc_pct                      112399 non-null  float64
 4   productMix_shs_count              112399 non-null  int64  
 5   total_disc_pct                    112399 non-null  float64
 6   infra_num_srvc_typ_gpon_sum       112399 non-null  int64  
 7   productMix_product_mix_all        112399 non-null  int64  
 8   hsic_disc_amt                     112399 non-null  float64
 9   TOTAL_CHARGE                      112399 non-null  float64
 10  TV_CHARGE                         112399 non-null  float64
 11  infra_num_srvc_typ_copper_sum     112399 non-null  i

In [3]:
df.columns

Index(['productMix_ttv_count', 'hsiaUsage_hs_tot_gb_average',
       'clckstrmData_cancel_tot_cnt_r90d', 'ttv_disc_pct',
       'productMix_shs_count', 'total_disc_pct', 'infra_num_srvc_typ_gpon_sum',
       'productMix_product_mix_all', 'hsic_disc_amt', 'TOTAL_CHARGE',
       'TV_CHARGE', 'infra_num_srvc_typ_copper_sum', 'productMix_sing_count',
       'price_sensitivity', 'PROV', 'BAN', 'HSIA_CHARGE', 'demographics',
       'callCentre_avg_talk_time_30_days', 'HSIC_CHARGE_NO_DISC',
       'clckstrmData_chg_tot_cnt_r90d', 'clckstrmData_supp_tot_cnt_r90d',
       'TTV_CHARGE_NO_DISC', 'clckstrmData_move_tot_cnt_r90d',
       'demographics_demo_avg_income', 'clckstrmData_upgr_tot_cnt_r90d',
       'have_called', 'clckstrmData_deal_tot_cnt_r90d', 'TOTAL_CHARGE_NO_DISC',
       'tot_disc_amt', 'Pcount', 'Price Plan Grouping',
       'productMix_hsic_count', 'recency', 'hsic_disc_pct', 'Technology Group',
       'Tenure Group', 'ttv_disc_amt', 'clckstrmData_wln_tot_cnt_r90d',
       'frequ

In [5]:
# cols_to_dummy = ['Tenure Group', 'PROV', 'Pcount', 'Price Plan Grouping', 'Technology Group', 'demographics', 'CampaignFlag', 'price_sensitivity']
cols_to_dummy = ['Tenure Group', 'PROV', 'Pcount', 'Price Plan Grouping', 'Technology Group', 'demographics', 'price_sensitivity']

for col in cols_to_dummy: 

    # Create dummy variables for the Country column
    df = pd.get_dummies(df, columns=[col], drop_first=True, prefix=None, dtype="int64")
    df_score = pd.get_dummies(df_score, columns=[col], drop_first=True, prefix=None, dtype="int64")

# training & validation set
# reorder the df columns so that 'target' comes last
df_processed = df[[c for c in df if c not in ['target']] 
       + ['target']]

df_processed.columns = df_processed.columns.str.replace('<', 'less_than_')
df_processed.columns = df_processed.columns.str.replace(' ', '_')

# scoring set
# reorder the df columns so that 'target' comes last
df_score_processed = df_score[[c for c in df_score]]

df_score_processed.columns = df_score_processed.columns.str.replace('<', 'less_than_')
df_score_processed.columns = df_score_processed.columns.str.replace(' ', '_')

df_processed.head()

Unnamed: 0,productMix_ttv_count,hsiaUsage_hs_tot_gb_average,clckstrmData_cancel_tot_cnt_r90d,ttv_disc_pct,productMix_shs_count,total_disc_pct,infra_num_srvc_typ_gpon_sum,productMix_product_mix_all,hsic_disc_amt,TOTAL_CHARGE,...,Price_Plan_Grouping_Other,Technology_Group_Fibre,demographics_rural_family,demographics_unassigned,demographics_urban,demographics_urban_family,demographics_urban_young,price_sensitivity_Not_Sensitive,price_sensitivity_Very_Sensitive,target
0,0,29,0,0.0,0,0.347606,0,2,-26.79,50.28,...,0,0,1,0,0,0,0,1,0,1
1,1,22,0,0.431851,0,0.430831,1,3,-46.79,153.3,...,0,1,1,0,0,0,0,1,0,1
2,1,49,0,0.503182,0,0.404851,0,3,-33.33,114.09,...,0,0,0,0,0,1,0,1,0,1
3,1,25,0,0.366815,0,0.372417,1,3,-49.52,170.1,...,0,1,0,0,0,0,0,1,0,1
4,1,29,0,0.375,0,0.347468,0,3,-35.0,110.8,...,0,0,0,0,0,1,0,1,0,1


In [6]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112399 entries, 0 to 112398
Data columns (total 52 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   productMix_ttv_count                            112399 non-null  int64  
 1   hsiaUsage_hs_tot_gb_average                     112399 non-null  int64  
 2   clckstrmData_cancel_tot_cnt_r90d                112399 non-null  int64  
 3   ttv_disc_pct                                    112399 non-null  float64
 4   productMix_shs_count                            112399 non-null  int64  
 5   total_disc_pct                                  112399 non-null  float64
 6   infra_num_srvc_typ_gpon_sum                     112399 non-null  int64  
 7   productMix_product_mix_all                      112399 non-null  int64  
 8   hsic_disc_amt                                   112399 non-null  float64
 9   TOTAL_CHARGE              

### register lift function 

In [7]:
def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'CallToRetention'])
    result['Prob'] = prob
    result['CallToRetention'] = y_test
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['CallToRetention'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_call_rate']
    add2 = pd.DataFrame(result.groupby('Decile')['CallToRetention'].count()).reset_index()
    add2.columns = ['Decile', 'ban_count']
    result = result.merge(add, on='Decile', how='left')
    result = result.merge(add2, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_call_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_call_rate_total'] = result['CallToRetention'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg = lg.merge(add2, on='Decile', how='left')
    lg['lift'] = lg['avg_real_call_rate'] / lg['avg_call_rate_total']

    return lg

### set X_train, X_test, y_train, y_test

In [10]:
features = [col for col in df_processed.columns if col not in ['BAN', 'target']]
# features = ['Product_Count','productMix_product_mix_all','productMix_hsic_count','productMix_sing_count','productMix_ttv_count','TOTAL_CHARGE',
# 'HSIA_CHARGE','HP_CHARGE','TV_CHARGE','tot_disc_amt','hsic_disc_amt','sing_disc_amt','ttv_disc_amt','TOTAL_CHARGE_NO_DISC','HSIC_CHARGE_NO_DISC',
# 'SING_CHARGE_NO_DISC','TTV_CHARGE_NO_DISC','total_disc_pct','hsic_disc_pct','sing_disc_pct','ttv_disc_pct','demographics_demo_avg_income','hsiaUsage_hs_tot_gb_average',
# 'clckstrmData_wln_tot_cnt_r90d','clckstrmData_deal_tot_cnt_r90d','clckstrmData_supp_tot_cnt_r90d','frequency','recency','have_called','Tenure_Group_5-6Y','Tenure_Group_7+Y',
# 'Tenure_Group_less_than_2Y','PROV_BC','Pcount_2P','Pcount_3P','Pcount_4P','Price_Plan_Grouping_Internet/Optik_250-1GB','Price_Plan_Grouping_Internet/Optik_Under_250MB',
# 'Price_Plan_Grouping_Other','demographics_rural_family','demographics_unassigned','demographics_urban','demographics_urban_family','demographics_urban_young','CampaignFlag_Reached',
# 'CampaignFlag_Targeted','price_sensitivity_Not_Sensitive','price_sensitivity_Very_Sensitive']

# X, y = df_processed[[col for col in df_processed.columns if col != \target\]], df_processed[\target\]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True, stratify=df_train['target'])

#train test split
df_train, df_val = train_test_split(df_processed, shuffle=True, test_size=0.2, stratify=df_processed['target'])

ban_train = df_train['BAN']
X_train = df_train[features]
y_train = np.squeeze(df_train['target'].values)

ban_test = df_val['BAN']
X_test = df_val[features]
y_test = np.squeeze(df_val['target'].values)

ban_comb = df_processed['BAN']
X_comb = df_processed[features]
y_comb = np.squeeze(df_processed['target'].values)

ban_score = df_score['BAN']
X_score = df_score_processed[features]

In [11]:
y_train.shape

(89919,)

### set up xgb and train the model

In [12]:
# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.02,
    n_estimators=80,
    max_depth=8,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27
)

xgb_model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_test, y_test)], 
            early_stopping_rounds=20) 
# xgb_model.fit(X_train, y_train)
print('xgb training done')




[0]	validation_0-logloss:0.69095	validation_1-logloss:0.69132
[1]	validation_0-logloss:0.68887	validation_1-logloss:0.68955
[2]	validation_0-logloss:0.68688	validation_1-logloss:0.68790
[3]	validation_0-logloss:0.68494	validation_1-logloss:0.68628
[4]	validation_0-logloss:0.68303	validation_1-logloss:0.68470
[5]	validation_0-logloss:0.68118	validation_1-logloss:0.68319
[6]	validation_0-logloss:0.67940	validation_1-logloss:0.68176
[7]	validation_0-logloss:0.67770	validation_1-logloss:0.68037
[8]	validation_0-logloss:0.67600	validation_1-logloss:0.67903
[9]	validation_0-logloss:0.67435	validation_1-logloss:0.67775
[10]	validation_0-logloss:0.67280	validation_1-logloss:0.67652
[11]	validation_0-logloss:0.67130	validation_1-logloss:0.67533
[12]	validation_0-logloss:0.66984	validation_1-logloss:0.67418
[13]	validation_0-logloss:0.66843	validation_1-logloss:0.67308
[14]	validation_0-logloss:0.66703	validation_1-logloss:0.67200
[15]	validation_0-logloss:0.66567	validation_1-logloss:0.67096
[1

### xgb hyperparameter tuning 

In [13]:
# import xgboost as xgb 
# import pandas as pd 

# # Create your housing DMatrix: housing_dmatrix
# class_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# # Create the parameter dictionary for each tree (boosting round)
# params = {"objective":"binary:logistic", "max_depth":3}
# # Create list of eta values and empty list to store final round rmse per xgboost model
# eta_vals = [0.001, 0.005, 0.01, 0.02, 0.05]
# max_depths = [5, 8, 10, 12, 15]
# colsample_bytree_vals = [0.7, 0.8, 0.9]
# n_estimators = [25, 50, 80, 100, 150, 200, 300]

# params_col = []
# best_accuracy = []


# # Systematically vary the eta 
# for curr_val in eta_vals:
    
#     for curr_depth in max_depths: 
        
#         for curr_colval in colsample_bytree_vals:
            
#             for curr_estimators in n_estimators: 

#                 params["eta"] = curr_val
#                 params["max_depth"] = curr_depth
#                 params["colsample_bytree"] = curr_colval
#                 params["n_estimators"] = curr_estimators

#                 # Perform cross-validation: cv_results
#                 cv_results = xgb.cv(dtrain=class_dmatrix, params=params, nfold=3, metrics="error", as_pandas=True, early_stopping_rounds=5)

#                 # Append the final round rmse to best_rmse
#                 params_col.append([curr_val, curr_depth, curr_colval, curr_estimators])
#                 best_accuracy.append(1-(cv_results["test-error-mean"]).iloc[-1])

# # Print the resultant DataFrame
# df_result = pd.DataFrame(list(zip(params_col, best_accuracy)), columns=["params", "best_accuracy"])

# objs = [df_result, pd.DataFrame(df_result['params'].tolist())]
# df_result = pd.concat(objs, axis=1).drop('params', axis=1)
# df_result.rename(columns = {0 : 'eta', 1 : 'max_depth', 2: 'colsample_bytree', 3: 'n_estimators'}, inplace = True)
# df_result = df_result[['eta', 'max_depth', 'colsample_bytree', 'best_accuracy', 'n_estimators']]

# print(df_result)


### make predictions on X_train and get lift

In [14]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_train, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.004785,0.471814,0.841748,8992,1.78407
1,2,0.004167,0.471814,0.701735,8992,1.487314
2,3,0.003781,0.471814,0.623443,8992,1.321376
3,4,0.00349,0.471814,0.533474,8992,1.130689
4,5,0.003254,0.471814,0.47136,8991,0.999039
5,6,0.003036,0.471814,0.43766,8991,0.927612
6,7,0.002829,0.471814,0.371969,8990,0.788381
7,8,0.002616,0.471814,0.311951,8995,0.661175
8,9,0.002388,0.471814,0.262011,8992,0.555327
9,10,0.001988,0.471814,0.162811,8992,0.345076


### make predictions on X_test and get lift

In [15]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_test, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg




Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.009552,0.471797,0.762011,2248,1.615124
1,2,0.008318,0.471797,0.662367,2248,1.403922
2,3,0.007563,0.471797,0.577402,2248,1.223836
3,4,0.006991,0.471797,0.527135,2248,1.117292
4,5,0.006517,0.471797,0.468416,2248,0.992834
5,6,0.006081,0.471797,0.437278,2248,0.926834
6,7,0.005662,0.471797,0.388345,2248,0.823119
7,8,0.005236,0.471797,0.335409,2248,0.710918
8,9,0.004777,0.471797,0.310053,2248,0.657175
9,10,0.003993,0.471797,0.249555,2248,0.528946


### make predictions on X_comb and get lift

In [16]:
from sklearn.preprocessing import normalize

#predictions on X_comb
pred_prb = xgb_model.predict_proba(X_comb, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_comb, X_comb, y_comb and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_comb = ban_comb.to_frame()
df_comb_exp = df_ban_comb.join(X_comb) 
df_comb_exp['y_comb'] = y_comb
df_comb_exp['y_pred_proba'] = pred_prb
df_comb_exp['y_pred'] = (df_comb_exp['y_pred_proba'] > 0.5).astype(int)
df_comb_exp['decile'] = pd.qcut(df_comb_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_comb, q)

lg




Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.004278,0.47181,0.825801,11240,1.750281
1,2,0.003725,0.47181,0.693594,11240,1.47007
2,3,0.003382,0.47181,0.614324,11240,1.302057
3,4,0.003123,0.47181,0.532384,11240,1.128387
4,5,0.002911,0.47181,0.470416,11239,0.997044
5,6,0.002717,0.47181,0.4379,11240,0.928128
6,7,0.002531,0.47181,0.375178,11240,0.795188
7,8,0.00234,0.47181,0.316726,11240,0.671299
8,9,0.002136,0.47181,0.271708,11240,0.575884
9,10,0.001779,0.47181,0.180071,11240,0.38166


### export df_test_exp and lift scores to gcs bucket

In [17]:
df_comb_exp.to_csv('gs://{}/downloads/df_comb_exp.csv'.format(file_bucket, index=True))
print("....df_comb_exp done")

# lg.to_csv('gs://{}/lift_on_scoring_data.csv'.format(file_bucket, index=False))
# print("....lift_to_csv done")

....df_comb_exp done


### make predictions on X_score and get lift

In [19]:

from sklearn.preprocessing import normalize

#predictions on X_comb
pred_prb = xgb_model.predict_proba(X_score, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_comb, X_comb, y_comb and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_score = ban_score.to_frame()
df_score_exp = df_ban_score.join(X_score) 
df_score_exp['y_pred_proba'] = pred_prb
df_score_exp['decile'] = pd.qcut(df_score_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

df_score_exp.to_csv('gs://{}/downloads/df_score_exp.csv'.format(file_bucket, index=True))
print('....df_score_exp done')




....df_score_exp done


### get feature importances from xgboost model

In [None]:
# Get feature importances from xgboost model
importances = xgb_model.feature_importances_

# Get the index of importances from greatest importance to least
sorted_index = np.argsort(importances)[::-1]
x = range(len(importances))

feature_names = X_train.columns

# Create tick labels 
labels = np.array(feature_names)[sorted_index]
importances = np.array(importances)[sorted_index]

for idx, item in enumerate(labels): 
    print(labels[idx], importances[idx])
    if idx == 25: 
        break
        

### set up rfc and train the model

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Create the random forest model and fit to the training data
rfc_model = RandomForestClassifier(n_estimators=75, max_features=25, max_depth=8)

rfc_model.fit(X_train, y_train)


### make predictions on X_train and get lift

In [None]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_train)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg

### make predictions on X_test and get lift

In [None]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = rfc_model.predict_proba(X_test)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg