In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)  
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, auc, roc_curve

# Data cleaning

In [2]:
dataset = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [12]:
dataset.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
dataset['gender'] = np.where(dataset['gender'] == 'Male', 1, 0)
dataset['Partner'] = np.where(dataset['Partner'] == 'Yes', 1, 0)
dataset['Dependents'] = np.where(dataset['Dependents'] == 'Yes', 1, 0)
dataset['PhoneService'] = np.where(dataset['PhoneService'] == 'Yes', 1, 0)
dataset['PaperlessBilling'] = np.where(dataset['PaperlessBilling'] == 'Yes',1,0)
dataset['Churn'] = np.where(dataset['Churn']== 'Yes',1,0)

In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   int32  
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int32  
 4   Dependents        7043 non-null   int32  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int32  
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   int32  


In [20]:
totalcharge = list(dataset['TotalCharges'])

In [41]:
totalcharge_result = []
for i in range (len(totalcharge)):
    if len(totalcharge[i]) == 1:
        totalcharge_result.append(0)
    else:
        totalcharge_result.append(float(totalcharge[i]))

In [45]:
dataset['TotalCharges'] = totalcharge_result

In [54]:
data_dum = pd.get_dummies(dataset.drop(columns=['customerID']))

In [57]:
data_x = data_dum.drop(columns=['Churn'])
data_y = data_dum.Churn

In [61]:
X_train,X_test,y_train,y_test=train_test_split(data_x,data_y,test_size=0.3, random_state=109)

# Logistic Regression

In [77]:
logreg = LogisticRegression(max_iter = 500)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

y_prob = logreg.predict_proba(X_test)
y_prob = pd.DataFrame(y_prob[:, 1])
accuracy = metrics.accuracy_score(y_pred, y_test)
cm1 = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm1.columns = ['True Y=0','True Y=1']
cm1.index = ['Predicted Y=0','Predicted Y=1']
display(cm1)

print("Accuracy:", round(metrics.accuracy_score(y_pred, y_test), 4))  # Model Accuracy: how often is the classifier correct?
print("Sensitivity:", round(cm1.iloc[1, 1]/(cm1.iloc[1, 1] + cm1.iloc[0, 1]), 4))  # sensitivity
print("Specificity:", round(cm1.iloc[0, 0]/(cm1.iloc[0, 0] + cm1.iloc[1, 0]), 4))  # specificity
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
print('AUC:', round(metrics.auc(fpr, tpr), 4))  # auc
print("F1 score:", round(metrics.f1_score(y_test, y_pred), 4))  # F1 score

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,1382,252
Predicted Y=1,154,325


Accuracy: 0.8079
Sensitivity: 0.5633
Specificity: 0.8997
AUC: 0.8478
F1 score: 0.6155


# Decision Trees, Gini

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz 

In [153]:
clf = DecisionTreeClassifier(criterion='gini', max_depth = 7, random_state=109,min_samples_split=6,min_samples_leaf=10)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=6,
                       random_state=109)

In [154]:
y_pred = clf.predict(X_test)
cm1 = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm1.columns = ['True Y=0','True Y=1']
cm1.index = ['Predicted Y=0','Predicted Y=1']
display(cm1)
total=sum(sum(cm))
    
print("Accuracy:", round(metrics.accuracy_score(y_pred, y_test), 4))  # Model Accuracy: how often is the classifier correct?
print("Sensitivity:", round(cm1.iloc[1, 1]/(cm1.iloc[1, 1] + cm1.iloc[0, 1]), 4))  # sensitivity
print("Specificity:", round(cm1.iloc[0, 0]/(cm1.iloc[0, 0] + cm1.iloc[1, 0]), 4))  # specificity
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
print('AUC:', round(metrics.auc(fpr, tpr), 4))  # auc
print("F1 score:", round(metrics.f1_score(y_test, y_pred), 4))  # F1 score

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,1348,257
Predicted Y=1,188,320


Accuracy: 0.7894
Sensitivity: 0.5546
Specificity: 0.8776
AUC: 0.8478
F1 score: 0.5899


# Decision Trees, Entropy

In [149]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth = 7, random_state=109,min_samples_split=6,min_samples_leaf=10)
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=10,
                       min_samples_split=6, random_state=109)

In [150]:
y_pred = clf.predict(X_test)
cm1 = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm1.columns = ['True Y=0','True Y=1']
cm1.index = ['Predicted Y=0','Predicted Y=1']
display(cm1)
total=sum(sum(cm))
    
print("Accuracy:", round(metrics.accuracy_score(y_pred, y_test), 4))  # Model Accuracy: how often is the classifier correct?
print("Sensitivity:", round(cm1.iloc[1, 1]/(cm1.iloc[1, 1] + cm1.iloc[0, 1]), 4))  # sensitivity
print("Specificity:", round(cm1.iloc[0, 0]/(cm1.iloc[0, 0] + cm1.iloc[1, 0]), 4))  # specificity
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
print('AUC:', round(metrics.auc(fpr, tpr), 4))  # auc
print("F1 score:", round(metrics.f1_score(y_test, y_pred), 4))  # F1 score

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,1324,251
Predicted Y=1,212,326


Accuracy: 0.7809
Sensitivity: 0.565
Specificity: 0.862
AUC: 0.8478
F1 score: 0.5848


# XGBoost

In [159]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import datetime 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import graphviz
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, auc, roc_curve
from sklearn import metrics
pd.set_option("display.max_rows", None, "display.max_columns", None)
from IPython.display import display_html
from sklearn.feature_selection import SelectFromModel

In [157]:
def find_best(randomState, iterations, crossValid, verb, njobs, returnScore, X_train, y_train):
    xgb_model = xgb.XGBClassifier(objective = "binary:logistic", random_state = 42)  # define model first
    
    # Get different combinations of parameter values
    search = RandomizedSearchCV(xgb_model, param_distributions = params, 
                                random_state = randomState, n_iter = iterations, 
                                cv = crossValid, verbose = verb, n_jobs = njobs, 
                                return_train_score = returnScore)
    
    # Fit these sets of parameters to the model
    search.fit(X_train.drop(['IMO Number','Y'],axis=1), y_train)
    print("Best Score Found:", round(search.best_score_, 4))  # Best Accuracy; mean cross-validated score of the best_estimator
    print("Best Parameters Found:", search.best_params_)  # Most optimal parameter values
    return search

# get metric scores
def get_metrics(best_model, X_test, y_test, X_train, y_train):  #, imo_xtrain, imo_xtest):
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)
    y_prob = pd.DataFrame(y_prob[:, 1])

    # Confusion matrix
    cm1 = pd.DataFrame(confusion_matrix(y_pred, y_test))
    cm1.columns = ['True Y=0','True Y=1']
    cm1.index = ['Predicted Y=0','Predicted Y=1']
    display(cm1)
    
    print("Accuracy:", round(metrics.accuracy_score(y_pred, y_test), 4))  # Model Accuracy: how often is the classifier correct?
    print("Sensitivity:", round(cm1.iloc[1, 1]/(cm1.iloc[1, 1] + cm1.iloc[0, 1]), 4))  # sensitivity
    print("Specificity:", round(cm1.iloc[0, 0]/(cm1.iloc[0, 0] + cm1.iloc[1, 0]), 4))  # specificity
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    print('AUC:', round(metrics.auc(fpr, tpr), 4))  # auc
    print("F1 score:", round(metrics.f1_score(y_test, y_pred), 4))  # F1 score
    
# Plot visualisation of important variables and probabilities of Y = 1 for training and validation sets
def visual_and_prob(best_model, X_test, y_test, X_train, y_train, imo_xtrain, imo_xtest):
    # visualisation of top 10 important variables
    xgb.plot_importance(best_model.get_booster(), max_num_features = 10, xlabel='no. of times feature appeared',
                        title='Feature importance of top 10 variables')  # ,importance_type = 'gain')
    plt.rcParams['figure.figsize'] = [5, 5]
    plt.show()
    
    # predict probabilities for training set
    X_train['IMO Number'] = imo_xtrain.astype(str)
    X_train['Y'] = y_train
    y_prob_train = best_model.predict_proba(X_train[X_train['Y'] == 1].drop(columns = ['Y', 'IMO Number']))
    y_prob_train = pd.DataFrame(y_prob_train[:,1])
    imo = pd.DataFrame(X_train[X_train['Y'] == 1]['IMO Number']).reset_index()
    imo['Y=1'] = y_prob_train
    imo = imo.sort_values(by = 'Y=1', ascending = False).drop('index', axis = 1)
    
    # predict probabilities for test set
    X_test['IMO Number'] = imo_xtest.astype(str)
    X_test['Y'] = y_test
    y_prob_test = best_model.predict_proba(X_test[X_test['Y'] == 1].drop(columns = ['Y', 'IMO Number']))
    y_prob_test = pd.DataFrame(y_prob_test[:,1])
    imo1 = pd.DataFrame(X_test[X_test['Y'] == 1]['IMO Number']).reset_index()
    imo1['Y=1'] = y_prob_test
    imo1 = imo1.sort_values(by = 'Y=1', ascending = False).drop('index',axis=1)
    
    side_by_side(imo.iloc[:6, :], imo1.iloc[:6, :], "Probabilities for training set", "Probabilities for test set")

In [162]:
# Initialise parameter values
params = {
    "colsample_bytree": uniform(0.3, 0.7),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 500), # default 100
    "subsample": uniform(0.3, 0.6)
}

randomState, iterations, crossValid, verb, njobs, returnScore = 42, 300, 5, 1, 1, True

In [167]:
xgb_model = xgb.XGBClassifier(objective = "binary:logistic", random_state = 42)  # define model first
    
    # Get different combinations of parameter values
search = RandomizedSearchCV(xgb_model, param_distributions = params, 
                            random_state = randomState, n_iter = iterations, 
                            cv = crossValid, verbose = verb, n_jobs = njobs, 
                            return_train_score = returnScore)
search.fit(X_train, y_train)


Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1500 out of 1500 | elapsed: 10.4min finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026DF8A67040>,
                                        'max_dep

In [168]:
xgb_best = search.best_estimator_

In [169]:
# get metric scores
def get_metrics(best_model, X_test, y_test, X_train, y_train):  #, imo_xtrain, imo_xtest):
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)
    y_prob = pd.DataFrame(y_prob[:, 1])

    # Confusion matrix
    cm1 = pd.DataFrame(confusion_matrix(y_pred, y_test))
    cm1.columns = ['True Y=0','True Y=1']
    cm1.index = ['Predicted Y=0','Predicted Y=1']
    display(cm1)
    
    print("Accuracy:", round(metrics.accuracy_score(y_pred, y_test), 4))  # Model Accuracy: how often is the classifier correct?
    print("Sensitivity:", round(cm1.iloc[1, 1]/(cm1.iloc[1, 1] + cm1.iloc[0, 1]), 4))  # sensitivity
    print("Specificity:", round(cm1.iloc[0, 0]/(cm1.iloc[0, 0] + cm1.iloc[1, 0]), 4))  # specificity
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    print('AUC:', round(metrics.auc(fpr, tpr), 4))  # auc
    print("F1 score:", round(metrics.f1_score(y_test, y_pred), 4))  # F1 score

In [170]:
get_metrics(xgb_best,X_test,y_test,X_train,y_train)

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,1383,265
Predicted Y=1,153,312


Accuracy: 0.8022
Sensitivity: 0.5407
Specificity: 0.9004
AUC: 0.8488
F1 score: 0.5988
