In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.compose import make_column_transformer

In [13]:
data = pd.read_csv('data.csv')

Data Preprocessing: 
Only the nominal categorical data needs to be one-hot-encoded.
All other numerical data does not need scaling (normalising).
The target data yes/no will needs to be encoded 0/1

In [14]:
# Data processing
data = data.iloc[:, 1: ] # removes first column with id's
X = data.iloc[:,:-1] # creates feature matrix without churn
#X = X.drop(['tenure'],axis = 1)

In [15]:
# One hot encoding
column_trans = make_column_transformer((OneHotEncoder(), ['gender', 'SeniorCitizen','Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']), remainder = 'passthrough')

# NEW feature matrix
X = column_trans.fit_transform(X)


In [16]:
# Binary encode churn
target = data.iloc[:,-1:]
y = target.apply(LabelEncoder().fit_transform)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [18]:
# Hyperprameter tuning
params = {'n_estimators' : (100, 110, 120, 130, 140),
            'max_features': ['auto', 'sqrt']}

gridsearch = GridSearchCV(RandomForestClassifier(random_state=2, oob_score=True), params,verbose=1, cv=5, n_jobs=-1)


In [19]:
clf = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.6s finished


In [20]:
#clf = gridsearch.best_estimator_
#clf.oob_score_

In [21]:
# Check overfitting by predicting the training set. Single iteration.
predict_train = clf.predict(X_train) # predict for training set
acc = accuracy_score(y_train, predict_train)
pre = precision_score(y_train, predict_train)
rec = recall_score(y_train, predict_train)
print('TRAIN SET SINGLE prediction scores:\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TRAIN SET SINGLE prediction scores:
Accuracy score = 0.9987809833401057
Precision score = 0.9984591679506933
Recall score = 0.9969230769230769


In [22]:
# Scores for single iteration of test data
predict = clf.predict(X_test) 
acc = accuracy_score(y_test, predict)
pre = precision_score(y_test, predict)
rec = recall_score(y_test, predict)
print('TEST SET SINGLE:\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TEST SET SINGLE:
Accuracy score = 0.8530805687203792
Precision score = 0.7524366471734892
Recall score = 0.6783831282952548


In [23]:
# 10 fold Cross validation predict scores on hold out dataset 
a = clf.best_estimator_

y_predict = cross_val_predict(a, X_test,y_test, cv =10)
acc = accuracy_score(y_test, y_predict)
pre = precision_score(y_test, y_predict)
rec = recall_score(y_test, y_predict)
print('TEST SET 10 FOLD CV (hold out data set)\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TEST SET 10 FOLD CV (hold out data set)
Accuracy score = 0.8478672985781991
Precision score = 0.7572614107883817
Recall score = 0.6414762741652021


In [None]:
precision_score(y_train, predict)

In [None]:
recall_score(y_train, predict)

In [None]:
predict2 = clf.predict(X_test)
precision_score(y_test, predict2)
#accuracy_score(y_test, predict2)

In [None]:
recall_score(y_test, predict2)