In [60]:
import pandas as pd
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

## Load and prepare dataset

In [41]:
data = pd.read_csv('./data/selfeat_customer_churn.csv')
data = data.replace({'Churn?':{'True.':1, 'False.':0}})
X = data.drop(['Churn?'], axis=1)
y = data['Churn?']
del data

## Build prediction models

In [46]:
#prepare classifiers with default setting, offcourse we can tune the parameters for better performance
rf = RandomForestClassifier()
ab = AdaBoostClassifier()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()

estimators = [rf, ab, knn, dt]
names = ['Random Forest', 'Ada Boost', 'kNN', 'Decision Tree']

In [59]:
#split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

#evaluate the performance of the prediction models
for i,e in enumerate(estimators):
    scores = cross_val_score(estimator=e, X=X_train, y=y_train, cv=5, scoring='f1')
    print('f1 scores of 5 cross validation:',['%0.3f' %(s) for s in scores])
    print(names[i] + " mean f1: %0.3f with standard deviation: %0.3f" %( scores.mean(),  scores.std()))
    


f1 scores of 5 cross validation: ['0.752', '0.803', '0.848', '0.853', '0.792']
Random Forest mean f1: 0.809 with standard deviation: 0.038
f1 scores of 5 cross validation: ['0.740', '0.811', '0.839', '0.789', '0.770']
Ada Boost mean f1: 0.790 with standard deviation: 0.034
f1 scores of 5 cross validation: ['0.545', '0.557', '0.619', '0.617', '0.584']
kNN mean f1: 0.585 with standard deviation: 0.030
f1 scores of 5 cross validation: ['0.745', '0.755', '0.805', '0.766', '0.779']
Decision Tree mean f1: 0.770 with standard deviation: 0.021


From several prediction models, Random Forest achieves the best performance. Therefore, the random Forest prediction model will be used to predict data test, whether a customer will churn or not

## Predict customer in data test  

In [64]:
model = rf.fit(X_train, y_train)
predict_val = model.predict(X_test)
predict_prob = model.predict_proba(X_test)

In [91]:
#print the first 10 prediction result
y_test = np.asarray(pd.DataFrame(y_test).replace({0:'stay', 1:'churn'}))
predict_val = np.asarray(pd.DataFrame(predict_val).replace({0:'stay', 1:'churn'}))

row_val = [[str(y_test[i]),str(predict_val[i]),str(predict_prob[i][0]), str(predict_prob[i][1])] for i in range(10)]

header = ["Real condition", "Predicted as", "prob. stay", "prob. churn"]
row_format ="{:<20}" * (len(header)+1)

print("                    ========================================================================")
print(row_format.format("", *header))
print("                    ========================================================================")
for row in row_val:
    print(row_format.format("",*row))
print("                    ========================================================================")


                    Real condition      Predicted as        prob. stay          prob. churn         
                    ['churn']           ['churn']           0.2                 0.8                 
                    ['churn']           ['stay']            0.7                 0.3                 
                    ['churn']           ['churn']           0.0                 1.0                 
                    ['stay']            ['churn']           0.4                 0.6                 
                    ['churn']           ['churn']           0.0                 1.0                 
                    ['stay']            ['stay']            0.8                 0.2                 
                    ['stay']            ['stay']            0.9                 0.1                 
                    ['stay']            ['stay']            0.9                 0.1                 
                    ['stay']            ['stay']            0.6                 0.4        

## Notes

There are four condition regarding the prediction result:
1. TP : a customer is predicted as churn and in the reality he/she is churn
2. TN : a customer is predicted as stay and in the reality he/she is stay
3. FP : a customer is predicted as churn but in the reality he/she is stay
4. FN : a customer is predicted as stay but in the reality he/she is churn

If it is assumed that a customer who want to churn will stay if they receive discount price, then a prevention action can be performed by offering discount price. In this case, the discount will be offered to predicted-churn customer (case number 1 and 3). 

Case number 1, if the discounted price succeed to retain customer, then money loss is avoided.

Case number 3, if the loyal customer use the discount, it seems that the company spend unnecessary cost (discounted price), but it may strengthen the customer loyalty.

In case number 4, we loose the opportunity to prevent money loss.

According to those cases, offcourse it would be important to improve the model performance. Since the purpose of this project is for gaining preliminary insight about problem-solving in telco churn prediction, I simply did undersampling for the imbalanced dataset. It would be better if oversampling is chosen or other approaches for imbalanced dataset. In addition, it is very important to cooperate with other departments (for example, Marketing dept.) to optimise the effectiveness of the discount price.
