## Customer Churn Prediction
This notebook illustrates a basic ML approaches on a 2-class problem of predicting who will churn from a company.

In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


### Training Data Preprocessing

In [2]:

data = pd.read_csv(r'C:\Users\student\Documents\churn_data.csv')
data.head()
#residence length is in months
#lot of them in binary code: 1 = yes, 2 = no

Unnamed: 0,CustID,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,123251,Male,34,Lower,4,16,14,5,Yes
1,188922,Male,20,Lower,5,14,49,1,No
2,145322,Female,30,Lower,4,20,19,4,Yes
3,153729,Female,46,Lower,4,14,15,4,Yes
4,103976,Female,23,Lower,4,16,18,0,No


In [3]:
#One hot encoding of categorical features:
def cat_features(dataframe):
    td = pd.DataFrame({'a':[1,2,3], 'b':[1.0,2.0,3.0]})
    return [x for x in list(dataframe) if not (dataframe[x].dtype in [td['a'].dtype, td['b'].dtype])]
   

#Get one hot encoding of columns Gender
one_hot = pd.get_dummies(data['Gender'])
# Drop column B as it is now encoded
data = data.drop('Gender',axis = 1)
# Join the encoded df
data= data.join(one_hot)
data.head()

Unnamed: 0,CustID,Age,Income,FamilySize,Education,Calls,Visits,Churn,Female,Male
0,123251,34,Lower,4,16,14,5,Yes,0,1
1,188922,20,Lower,5,14,49,1,No,0,1
2,145322,30,Lower,4,20,19,4,Yes,1,0
3,153729,46,Lower,4,14,15,4,Yes,1,0
4,103976,23,Lower,4,16,18,0,No,1,0


In [4]:
#Get one hot encoding of column Income
one_hot = pd.get_dummies(data['Income'])
# Drop column B as it is now encoded
data = data.drop('Income',axis = 1)
# Join the encoded df
data= data.join(one_hot)
data.head()

Unnamed: 0,CustID,Age,FamilySize,Education,Calls,Visits,Churn,Female,Male,Lower,Upper
0,123251,34,4,16,14,5,Yes,0,1,1,0
1,188922,20,5,14,49,1,No,0,1,1,0
2,145322,30,4,20,19,4,Yes,1,0,1,0
3,153729,46,4,14,15,4,Yes,1,0,1,0
4,103976,23,4,16,18,0,No,1,0,1,0


In [5]:
#Changing order of the columns for convenience
data=data[['Churn','CustID','Age','FamilySize','Education','Calls', 'Visits','Female','Male','Lower','Upper',]]
data.head()

Unnamed: 0,Churn,CustID,Age,FamilySize,Education,Calls,Visits,Female,Male,Lower,Upper
0,Yes,123251,34,4,16,14,5,0,1,1,0
1,No,188922,20,5,14,49,1,0,1,1,0
2,Yes,145322,30,4,20,19,4,1,0,1,0
3,Yes,153729,46,4,14,15,4,1,0,1,0
4,No,103976,23,4,16,18,0,1,0,1,0


In [6]:
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})
data.head()

Unnamed: 0,Churn,CustID,Age,FamilySize,Education,Calls,Visits,Female,Male,Lower,Upper
0,1,123251,34,4,16,14,5,0,1,1,0
1,0,188922,20,5,14,49,1,0,1,1,0
2,1,145322,30,4,20,19,4,1,0,1,0
3,1,153729,46,4,14,15,4,1,0,1,0
4,0,103976,23,4,16,18,0,1,0,1,0


In [7]:

#Get predictors - all non-Buy columns (Buy is column 0)
data_x = data[list(data)[1:]]

#Get target variable y - Buy column
data_y = data['Churn']

#Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.3, random_state=4)

### Validation data Preprocessing


In [8]:
#Reading in of the Churn_validation data. 
df=pd.read_csv(r'C:\Users\student\Documents\Demo_data\churn_validation.csv')
df.head()




Unnamed: 0,CustID,Gender,Age,Income,FamilySize,Education,Calls,Visits,Churn
0,102522,Male,54,Upper,4,18,48,3,Yes
1,108050,Male,21,Lower,4,19,44,2,Yes
2,108118,Female,22,Lower,3,16,22,5,Yes
3,109501,Male,27,Upper,3,13,19,2,Yes
4,109782,Male,18,Lower,2,14,6,3,No


In [9]:
#Get one hot encoding of column Gender
one_hot = pd.get_dummies(df['Gender'])
# Drop column B as it is now encoded
df = df.drop('Gender',axis = 1)
# Join the encoded df
df= df.join(one_hot)

In [10]:
#Get one hot encoding of column Income
one_hot = pd.get_dummies(df['Income'])
# Drop column B as it is now encoded
df = df.drop('Income',axis = 1)
# Join the encoded df
df= df.join(one_hot)

In [11]:
#Changing order of the columns for convenience
df=df[['Churn','CustID','Age','FamilySize','Education','Calls', 'Visits','Female','Male','Lower','Upper',]]

In [12]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df.head()

Unnamed: 0,Churn,CustID,Age,FamilySize,Education,Calls,Visits,Female,Male,Lower,Upper
0,1,102522,54,4,18,48,3,0,1,0,1
1,1,108050,21,4,19,44,2,0,1,1,0
2,1,108118,22,3,16,22,5,1,0,1,0
3,1,109501,27,3,13,19,2,0,1,0,1
4,0,109782,18,2,14,6,3,0,1,1,0


In [13]:
#Get predictors - all non-Buy columns (Buy is column 0)
x = df[list(df)[1:]]

#Get target variable y - Buy column
y = df['Churn']

**1. Logistic Regression Example**

In [14]:
#Build a logisitc regression model
log_mod = linear_model.LogisticRegression()
log_mod.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
#Make predictions
preds = log_mod.predict(x)  #Get predicted labels
pred_probs = log_mod.predict_proba(x)   #Get predicted probabilities/ Each observation is a 2-element array.
pred_pos = pred_probs.transpose()[1]  #P(X = 1) is column 1
pred_neg = pred_probs.transpose()[1]  #P(X = 0) is column 0

In [16]:
print(pred_probs)


[[0.05407694 0.94592306]
 [0.05776435 0.94223565]
 [0.18134014 0.81865986]
 [0.31058155 0.68941845]
 [0.39923822 0.60076178]
 [0.28905714 0.71094286]
 [0.36485984 0.63514016]
 [0.31092881 0.68907119]
 [0.16890942 0.83109058]
 [0.32173951 0.67826049]
 [0.13759194 0.86240806]
 [0.35066062 0.64933938]
 [0.35781336 0.64218664]
 [0.39959246 0.60040754]
 [0.14520423 0.85479577]
 [0.55951567 0.44048433]
 [0.3047329  0.6952671 ]
 [0.59108639 0.40891361]
 [0.3980342  0.6019658 ]
 [0.70359376 0.29640624]
 [0.67290078 0.32709922]
 [0.58639159 0.41360841]
 [0.5154894  0.4845106 ]
 [0.73700572 0.26299428]
 [0.4516491  0.5483509 ]
 [0.53184128 0.46815872]
 [0.71521546 0.28478454]
 [0.41447804 0.58552196]
 [0.6142951  0.3857049 ]
 [0.58105605 0.41894395]
 [0.6485691  0.3514309 ]
 [0.75733976 0.24266024]]


In [17]:
#Look at results
pred_df = pd.DataFrame({"Actual": y, "Predicted Class": preds, "P(X=1)" : pred_pos, "P(x=1)": pred_neg})
pred_df.head(15)

Unnamed: 0,Actual,Predicted Class,P(X=1),P(x=1)
0,1,1,0.945923,0.945923
1,1,1,0.942236,0.942236
2,1,1,0.81866,0.81866
3,1,1,0.689418,0.689418
4,0,1,0.600762,0.600762
5,1,1,0.710943,0.710943
6,0,1,0.63514,0.63514
7,1,1,0.689071,0.689071
8,1,1,0.831091,0.831091
9,0,1,0.67826,0.67826


In [18]:
#Look at error metrics
print("Accuracy:  " + str(accuracy_score(y, preds)))
print("Precision:  " + str(precision_score(y, preds)))
print("Recall:  " + str(recall_score(y, preds)))
print("F1:  " + str(f1_score(y, preds)))
print("ROC AUC:  " + str(roc_auc_score(y, preds)))
print("Confusion Matrix: \n" + str(confusion_matrix(y, preds)))

Accuracy:  0.75
Precision:  0.631578947368421
Recall:  0.9230769230769231
F1:  0.7499999999999999
ROC AUC:  0.7773279352226721
Confusion Matrix: 
[[12  7]
 [ 1 12]]


**2. K-Nearest Neighbors Example**

In [19]:
#Build a sequence of models for a set of different k values.
ks = [2, 3, 7, 11, 13, 15, 17, 19, 21]
for k in ks:
    #Create and fit a KNN model 
    mod = neighbors.KNeighborsClassifier(n_neighbors = k)
    mod.fit(x_train, y_train)
    
    #Make predictions and evaluate
    preds = mod.predict(x)
    print('-------------------EVALUATING MODEL k = ' + str(k) + '---------------------------')
    #Look at error metrics
    print("Accuracy:  " + str(accuracy_score(y, preds)))
    print("Precision:  " + str(precision_score(y, preds)))
    print("Recall:  " + str(recall_score(y, preds)))
    print("F1:  " + str(f1_score(y, preds)))
    print("ROC AUC:  " + str(roc_auc_score(y, preds)))
    print("Confusion Matrix: \n" + str(confusion_matrix(y, preds)))



-------------------EVALUATING MODEL k = 2---------------------------
Accuracy:  0.53125
Precision:  0.4
Recall:  0.3076923076923077
F1:  0.34782608695652173
ROC AUC:  0.4959514170040486
Confusion Matrix: 
[[13  6]
 [ 9  4]]
-------------------EVALUATING MODEL k = 3---------------------------
Accuracy:  0.40625
Precision:  0.3125
Recall:  0.38461538461538464
F1:  0.3448275862068966
ROC AUC:  0.40283400809716596
Confusion Matrix: 
[[ 8 11]
 [ 8  5]]
-------------------EVALUATING MODEL k = 7---------------------------
Accuracy:  0.4375
Precision:  0.391304347826087
Recall:  0.6923076923076923
F1:  0.5
ROC AUC:  0.47773279352226716
Confusion Matrix: 
[[ 5 14]
 [ 4  9]]
-------------------EVALUATING MODEL k = 11---------------------------
Accuracy:  0.40625
Precision:  0.375
Recall:  0.6923076923076923
F1:  0.48648648648648646
ROC AUC:  0.451417004048583
Confusion Matrix: 
[[ 4 15]
 [ 4  9]]
-------------------EVALUATING MODEL k = 13---------------------------
Accuracy:  0.4375
Precision:  

### Naive Bayes Classifier


In [20]:
gnb_mod=naive_bayes.GaussianNB()
gnb_mod.fit(x_train,y_train)
preds=gnb_mod.predict(x)
#Look at error metrics
print("Accuracy:  " + str(accuracy_score(y, preds)))
print("Precision:  " + str(precision_score(y, preds)))
print("Recall:  " + str(recall_score(y, preds)))
print("F1:  " + str(f1_score(y, preds)))
print("ROC AUC:  " + str(roc_auc_score(y, preds)))
print("Confusion Matrix: \n" + str(confusion_matrix(y, preds)))


Accuracy:  0.71875
Precision:  0.6
Recall:  0.9230769230769231
F1:  0.7272727272727274
ROC AUC:  0.751012145748988
Confusion Matrix: 
[[11  8]
 [ 1 12]]


### Random forest model

In [21]:
#Build a sequence of random forest models for different numbers of estimators and tree depths
n_est=[10]#I found 10 to work the best here
depths=[3]#max_depth of 3 gave best results
for n in n_est:
    for depth in depths:
        mod = ensemble.RandomForestClassifier(n_estimators=n,max_depth=depth)
        mod.fit(x_train,y_train)
        preds=mod.predict(x)
print('-------Evaluating Model: n_estimators=' + str(n)+",maxdepth=" +str(depth)+"-----------")       
print("Accuracy:  " + str(accuracy_score(y, preds)))
print("Precision:  " + str(precision_score(y, preds)))
print("Recall:  " + str(recall_score(y, preds)))
print("F1:  " + str(f1_score(y, preds)))
print("ROC AUC:  " + str(roc_auc_score(y, preds)))
print("Confusion Matrix: \n" + str(confusion_matrix(y, preds)))

-------Evaluating Model: n_estimators=10,maxdepth=3-----------
Accuracy:  0.75
Precision:  0.6666666666666666
Recall:  0.7692307692307693
F1:  0.7142857142857142
ROC AUC:  0.7530364372469636
Confusion Matrix: 
[[14  5]
 [ 3 10]]
