In [1]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, NearMiss
from tqdm import tqdm

import os,gc,copy
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_full = pd.read_csv("WA_-Telco-Customer-Churn.csv")
df_full = df_full.loc[df_full["TotalCharges"] != " "]
df_full["TotalCharges"] = df_full["TotalCharges"].astype(float)
df_full.reset_index(drop = True,inplace = True)
df_full = df_full.replace({"Churn": {"Yes":1,"No":0}})
y_full = df_full["Churn"]
X_full = df_full.drop(["Churn"],axis = 1)

In [3]:
X_train, X_oof_test, y_train, y_oof_test  = train_test_split(X_full,y_full,test_size=0.1,stratify=y_full,shuffle=True,random_state=9966)

In [4]:
df_full["Churn"].value_counts()/df_full.shape[0]

0    0.734215
1    0.265785
Name: Churn, dtype: float64

In [5]:
y_train.value_counts()/len(y_train)

0    0.734197
1    0.265803
Name: Churn, dtype: float64

In [6]:
y_oof_test.value_counts()/len(y_oof_test)

0    0.734375
1    0.265625
Name: Churn, dtype: float64

In [7]:
X_train.to_csv('x_tr.csv',index = False)
X_oof_test.to_csv('x_oof_test.csv',index = False)
y_train.to_csv('y_tr.csv',index = False)
y_oof_test.to_csv('y_oof_test.csv',index = False)

In [8]:
y_oof_test

320     1
5277    1
6702    0
4788    0
3917    1
       ..
3138    0
5634    1
1658    1
6175    0
6774    1
Name: Churn, Length: 704, dtype: int64

In [9]:
categorical_features = X_train.columns[ X_train.dtypes==object].tolist()
categorical_features.remove('customerID')
numeric_features = X_train.columns[X_train.dtypes!=object].tolist()

In [10]:
categorical_features

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [11]:
le = LabelEncoder()
le_dict = dict()

for col in categorical_features:
    le_dict[col] = (le.fit(X_train[col]))
    X_train[col] = le_dict[col].transform(X_train[col])
    X_oof_test[col] = le_dict[col].transform(X_oof_test[col])
# X_train[final_categorical_feature] = X_train[final_categorical_feature].apply(lambda col: le.fit_transform(col)) 
X_train.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
6231,6521-YYTYI,1,0,0,1,1,1,2,1,0,0,0,0,2,2,0,1,2,93.3,93.3
3437,5996-NRVXR,1,1,1,0,40,1,0,1,2,2,2,2,2,0,1,1,1,98.15,4116.8
160,8992-VONJD,0,0,0,1,13,1,0,0,2,2,0,0,0,0,0,1,2,56.0,764.55
4297,9732-OUYRN,0,0,1,0,49,1,0,2,1,1,1,1,1,1,1,0,1,19.0,918.7
4307,4597-NUCQV,1,1,0,0,24,1,2,1,0,2,0,0,2,2,0,1,2,101.25,2440.15


In [12]:
X_train.to_csv('X_tr_encoded.csv',index = False)

In [13]:
n_folds = 5
results = pd.DataFrame()
skf = StratifiedKFold(n_splits=n_folds,random_state=9966, shuffle=True)

sampling_techniques = dict()
sampling_techniques["Base - No sampling"] = None
sampling_techniques["SMOTE"] = SMOTE
sampling_techniques["ADASYN"] = ADASYN
sampling_techniques["BorderlineSMOTE"] = BorderlineSMOTE
sampling_techniques["SVMSMOTE"] = SVMSMOTE
sampling_techniques["RandomUnderSampler"] = RandomUnderSampler
sampling_techniques["ClusterCentroids"] = ClusterCentroids
# sampling_techniques["NearMiss"] = NearMiss
# sampling_techniques["Random Oversample"] = RandomOverSampler
# sampling_techniques["Random Oversample"] = RandomOverSampler

cols_log_from_sfs = ['Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'StreamingTV', 'Contract', 'PaymentMethod', 'MonthlyCharges']
params_log = {
    'C': 1.0, 
    'penalty': 'l2', 
    'solver': 'newton-cg',
    'random_state' : 33
}

cols_dt_from_sfs = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaperlessBilling']
params_dt = {
    'criterion': 'gini', 
    'max_depth': 5, 
    'max_features': 'auto', 
    'min_samples_leaf': 200,
    'random_state' : 11
}

cols_rf_from_sfs = ['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'Contract', 'MonthlyCharges', 'TotalCharges']
params_rf = {
    'criterion': 'gini', 
    'max_depth': 10, 
    'max_features': 'auto', 
    'min_samples_leaf': 10, 
    'n_estimators': 175,
    'random_state' : 1010
}

cols_gb_from_sfs = ['tenure', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'TechSupport', 'StreamingTV', 'Contract', 'PaymentMethod']
params_gbm = {'learning_rate': 0.04, 
              'max_depth': 1, 
              'n_estimators': 12, 
              'subsample': 0.6,
              'random_state' : 1100
             }

cols_xgb_from_sfs = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'Contract', 'MonthlyCharges', 'TotalCharges']
params_xgb = {'colsample_bytree': 0.96, 
              'gamma': 0.46, 
              'learning_rate': 0.25, 
              'max_depth': 6, 
              'min_child_weight': 10.0, 
              'n_estimators': 10, 
              'subsample': 0.11,
              'random_state' : 9966}


for technique,fxn in tqdm(sampling_techniques.items()):
    print(f'Using sampling technique: {technique}')
    X_train_resampled = copy.deepcopy(X_train.drop(["customerID"],axis = 1))
    y_train_resampled = copy.deepcopy(y_train)
    
    if sampling_techniques[technique] == None:
        X_train_resampled, y_train_resampled = X_train_resampled, y_train_resampled.ravel()
    else:
        sampler = sampling_techniques[technique](random_state=100)
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_resampled, y_train_resampled.ravel())              
        
    for i, (train_index, valid_index) in tqdm(enumerate(skf.split(X_train_resampled, y_train_resampled))):

        model = "logisticRegression"

#         columns_logit = [col for col in X_train.columns if col != "customerID"]
        X_tr, X_val = X_train_resampled[cols_log_from_sfs].iloc[train_index], X_train_resampled[cols_log_from_sfs].iloc[valid_index]
        y_tr, y_val = y_train_resampled[train_index], y_train_resampled[valid_index]
        lr_model = LogisticRegression(**params_log).fit(X_tr, y_tr)
        
        y_pred_prob = lr_model.predict_proba(X_val)[:, 1]
        test_prob = lr_model.predict_proba(X_oof_test[cols_log_from_sfs])[:, 1]


        results = pd.concat([results,(pd.DataFrame({"sampling technique" : technique,
                                                    "total records before 0" :len(y_train[y_train == 0]),
                                                    "total records after 0" :len(y_train_resampled[y_train_resampled == 0]),
                                                    "total records before 1" :len(y_train[y_train == 1]),
                                                    "total records after 1" :len(y_train_resampled[y_train_resampled == 1]),                                                  
                                                    "model" : [model],
                                                    "fold_no" : [i],
                                                    "roc_auc_score" : roc_auc_score(y_val,y_pred_prob),
                                                    "f1_score" : f1_score(y_val,y_pred_prob>0.5),
                                                    "accuracy_score" : accuracy_score(y_val,y_pred_prob>0.5),
                                                    "test_oof_roc_auc_score" : roc_auc_score(y_oof_test,test_prob),
                                                    "test_oof_f1_score" : f1_score(y_oof_test,test_prob>0.5),
                                                    "test_oof_accuracy_score" : accuracy_score(y_oof_test,test_prob>0.5)}))])


        model = "DecisionTree"
#         columns_dt = [col for col in X_train.columns if col != "customerID"]
        X_tr, X_val = X_train_resampled[cols_dt_from_sfs].iloc[train_index], X_train_resampled[cols_dt_from_sfs].iloc[valid_index]
        y_tr, y_val = y_train_resampled[train_index], y_train_resampled[valid_index]        
        dt_model = DecisionTreeClassifier(**params_dt).fit(X_tr, y_tr)
        y_pred_prob = dt_model.predict_proba(X_val)[:, 1]
        test_prob = dt_model.predict_proba(X_oof_test[cols_dt_from_sfs])[:, 1]
        results = pd.concat([results,(pd.DataFrame({"sampling technique" : technique,
                                                    "total records before 0" :len(y_train[y_train == 0]),
                                                    "total records after 0" :len(y_train_resampled[y_train_resampled == 0]),
                                                    "total records before 1" :len(y_train[y_train == 1]),
                                                    "total records after 1" :len(y_train_resampled[y_train_resampled == 1]),                                                  
                                                    "model" : [model],
                                                    "fold_no" : [i],
                                                    "roc_auc_score" : roc_auc_score(y_val,y_pred_prob),
                                                    "f1_score" : f1_score(y_val,y_pred_prob>0.5),
                                                    "accuracy_score" : accuracy_score(y_val,y_pred_prob>0.5),
                                                    "test_oof_roc_auc_score" : roc_auc_score(y_oof_test,test_prob),
                                                    "test_oof_f1_score" : f1_score(y_oof_test,test_prob>0.5),
                                                    "test_oof_accuracy_score" : accuracy_score(y_oof_test,test_prob>0.5)}))])




        model = "RandomForest"
#         columns_rf = [col for col in X_train.columns if col != "customerID"]
        X_tr, X_val = X_train_resampled[cols_rf_from_sfs].iloc[train_index], X_train_resampled[cols_rf_from_sfs].iloc[valid_index]
        y_tr, y_val = y_train_resampled[train_index], y_train_resampled[valid_index]           
        rf_model = RandomForestClassifier(**params_rf).fit(X_tr, y_tr)
        y_pred_prob = rf_model.predict_proba(X_val)[:, 1]
        test_prob = rf_model.predict_proba(X_oof_test[cols_rf_from_sfs])[:, 1]
        results = pd.concat([results,(pd.DataFrame({"sampling technique" : technique,
                                                    "total records before 0" :len(y_train[y_train == 0]),
                                                    "total records after 0" :len(y_train_resampled[y_train_resampled == 0]),
                                                    "total records before 1" :len(y_train[y_train == 1]),
                                                    "total records after 1" :len(y_train_resampled[y_train_resampled == 1]),                                                  
                                                    "model" : [model],
                                                    "fold_no" : [i],
                                                    "roc_auc_score" : roc_auc_score(y_val,y_pred_prob),
                                                    "f1_score" : f1_score(y_val,y_pred_prob>0.5),
                                                    "accuracy_score" : accuracy_score(y_val,y_pred_prob>0.5),
                                                    "test_oof_roc_auc_score" : roc_auc_score(y_oof_test,test_prob),
                                                    "test_oof_f1_score" : f1_score(y_oof_test,test_prob>0.5),
                                                    "test_oof_accuracy_score" : accuracy_score(y_oof_test,test_prob>0.5)}))])



        model = "GBM"
#         columns_gbm = [col for col in X_train.columns if col != "customerID"]
        X_tr, X_val = X_train_resampled[cols_gb_from_sfs].iloc[train_index], X_train_resampled[cols_gb_from_sfs].iloc[valid_index]
        y_tr, y_val = y_train_resampled[train_index], y_train_resampled[valid_index]               
        gbm_model = GradientBoostingClassifier(**params_gbm).fit(X_tr, y_tr)
        y_pred_prob = gbm_model.predict_proba(X_val)[:, 1]
        test_prob = rf_model.predict_proba(X_oof_test[cols_gb_from_sfs])[:, 1]
        results = pd.concat([results,(pd.DataFrame({"sampling technique" : technique,
                                                    "total records before 0" :len(y_train[y_train == 0]),
                                                    "total records after 0" :len(y_train_resampled[y_train_resampled == 0]),
                                                    "total records before 1" :len(y_train[y_train == 1]),
                                                    "total records after 1" :len(y_train_resampled[y_train_resampled == 1]),                                                  
                                                    "model" : [model],
                                                    "fold_no" : [i],
                                                    "roc_auc_score" : roc_auc_score(y_val,y_pred_prob),
                                                    "f1_score" : f1_score(y_val,y_pred_prob>0.5),
                                                    "accuracy_score" : accuracy_score(y_val,y_pred_prob>0.5),
                                                    "test_oof_roc_auc_score" : roc_auc_score(y_oof_test,test_prob),
                                                    "test_oof_f1_score" : f1_score(y_oof_test,test_prob>0.5),
                                                    "test_oof_accuracy_score" : accuracy_score(y_oof_test,test_prob>0.5)}))])


        model = "XGB"
#         columns_xgb = [col for col in X_train.columns if col != "customerID"]
        X_tr, X_val = X_train_resampled[cols_xgb_from_sfs].iloc[train_index], X_train_resampled[cols_xgb_from_sfs].iloc[valid_index]
        y_tr, y_val = y_train_resampled[train_index], y_train_resampled[valid_index]    
        xgb_model = XGBClassifier(**params_xgb, use_label_encoder=False).fit(X_tr, y_tr)
        y_pred_prob = xgb_model.predict_proba(X_val)[:, 1]
        test_prob = xgb_model.predict_proba(X_oof_test[cols_xgb_from_sfs])[:, 1]
        results = pd.concat([results,(pd.DataFrame({"sampling technique" : technique,
                                                    "total records before 0" :len(y_train[y_train == 0]),
                                                    "total records after 0" :len(y_train_resampled[y_train_resampled == 0]),
                                                    "total records before 1" :len(y_train[y_train == 1]),
                                                    "total records after 1" :len(y_train_resampled[y_train_resampled == 1]),                                                  
                                                    "model" : [model],
                                                    "fold_no" : [i],
                                                    "roc_auc_score" : roc_auc_score(y_val,y_pred_prob),
                                                    "f1_score" : f1_score(y_val,y_pred_prob>0.5),
                                                    "accuracy_score" : accuracy_score(y_val,y_pred_prob>0.5),
                                                    "test_oof_roc_auc_score" : roc_auc_score(y_oof_test,test_prob),
                                                    "test_oof_f1_score" : f1_score(y_oof_test,test_prob>0.5),
                                                    "test_oof_accuracy_score" : accuracy_score(y_oof_test,test_prob>0.5)}))])
   
    
    


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Using sampling technique: Base - No sampling



0it [00:00, ?it/s][A
1it [00:01,  1.16s/it][A
2it [00:02,  1.17s/it][A
3it [00:03,  1.15s/it][A
4it [00:04,  1.13s/it][A
5it [00:05,  1.14s/it][A
 14%|████████████                                                                        | 1/7 [00:05<00:34,  5.70s/it]

Using sampling technique: SMOTE



0it [00:00, ?it/s][A
1it [00:01,  1.40s/it][A
2it [00:02,  1.43s/it][A
3it [00:04,  1.43s/it][A
4it [00:05,  1.43s/it][A
5it [00:07,  1.42s/it][A
 29%|████████████████████████                                                            | 2/7 [00:12<00:32,  6.59s/it]

Using sampling technique: ADASYN



0it [00:00, ?it/s][A
1it [00:01,  1.43s/it][A
2it [00:02,  1.41s/it][A
3it [00:04,  1.41s/it][A
4it [00:05,  1.40s/it][A
5it [00:07,  1.41s/it][A
 43%|████████████████████████████████████                                                | 3/7 [00:20<00:27,  6.96s/it]

Using sampling technique: BorderlineSMOTE



0it [00:00, ?it/s][A
1it [00:01,  1.40s/it][A
2it [00:02,  1.42s/it][A
3it [00:04,  1.42s/it][A
4it [00:05,  1.42s/it][A
5it [00:07,  1.42s/it][A
 57%|████████████████████████████████████████████████                                    | 4/7 [00:27<00:21,  7.14s/it]

Using sampling technique: SVMSMOTE



0it [00:00, ?it/s][A
1it [00:01,  1.43s/it][A
2it [00:02,  1.44s/it][A
3it [00:04,  1.41s/it][A
4it [00:05,  1.40s/it][A
5it [00:07,  1.41s/it][A
 71%|████████████████████████████████████████████████████████████                        | 5/7 [00:37<00:15,  7.91s/it]

Using sampling technique: RandomUnderSampler



0it [00:00, ?it/s][A
1it [00:00,  1.26it/s][A
2it [00:01,  1.30it/s][A
3it [00:02,  1.28it/s][A
4it [00:03,  1.28it/s][A
5it [00:03,  1.28it/s][A
 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [00:40<00:06,  6.57s/it]

Using sampling technique: ClusterCentroids



0it [00:00, ?it/s][A
1it [00:00,  1.25it/s][A
2it [00:01,  1.26it/s][A
3it [00:02,  1.24it/s][A
4it [00:03,  1.23it/s][A
5it [00:04,  1.24it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:02<00:00,  8.92s/it]


In [14]:
#  X_tr, X_val = X_train_resampled[cols_log_from_sfs].iloc[train_index], X_train_resampled[cols_log_from_sfs].iloc[valid_index]

In [15]:
# len(train_index)

In [16]:
#  X_train_resampled[cols_log_from_sfs].iloc[train_index]

In [17]:
results.to_csv("results_v0.csv",index = False)

In [32]:
x = results.groupby(["sampling technique","model"]).agg("mean")

In [33]:
summary = x.reset_index()

In [34]:
summary

Unnamed: 0,sampling technique,model,total records before 0,total records after 0,total records before 1,total records after 1,fold_no,roc_auc_score,f1_score,accuracy_score,test_oof_roc_auc_score,test_oof_f1_score,test_oof_accuracy_score
0,ADASYN,DecisionTree,4646.0,4646.0,1682.0,4722.0,2.0,0.839683,0.778678,0.774337,0.775768,0.560334,0.732386
1,ADASYN,GBM,4646.0,4646.0,1682.0,4722.0,2.0,0.740613,0.780651,0.741992,0.619964,0.401332,0.690625
2,ADASYN,RandomForest,4646.0,4646.0,1682.0,4722.0,2.0,0.901802,0.827121,0.820558,0.817971,0.587674,0.75
3,ADASYN,XGB,4646.0,4646.0,1682.0,4722.0,2.0,0.869994,0.800061,0.789709,0.812638,0.582817,0.73125
4,ADASYN,logisticRegression,4646.0,4646.0,1682.0,4722.0,2.0,0.8421,0.787311,0.77327,0.782203,0.570943,0.712216
5,Base - No sampling,DecisionTree,4646.0,4646.0,1682.0,1682.0,2.0,0.808648,0.508859,0.769438,0.793581,0.497006,0.761364
6,Base - No sampling,GBM,4646.0,4646.0,1682.0,1682.0,2.0,0.797632,0.0,0.734197,0.594409,0.315843,0.69858
7,Base - No sampling,RandomForest,4646.0,4646.0,1682.0,1682.0,2.0,0.842483,0.580589,0.800884,0.82938,0.542587,0.790341
8,Base - No sampling,XGB,4646.0,4646.0,1682.0,1682.0,2.0,0.841123,0.56424,0.799779,0.828603,0.518947,0.784375
9,Base - No sampling,logisticRegression,4646.0,4646.0,1682.0,1682.0,2.0,0.813809,0.535818,0.774655,0.798403,0.496527,0.755114


In [35]:
summary.to_csv("summary.csv",index = False)