In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.compose import make_column_transformer

In [3]:
# Load the raw dataset; the path is resolved relative to the notebook's working directory.
DATA_PATH = 'data.csv'
data = pd.read_csv(DATA_PATH)

In [4]:
# Data processing
# NOTE: `data` is rebound here, so running this cell a second time without
# re-running the load cell strips one more leading column.
without_first_col = slice(1, None)   # skip the first column (ids)
without_last_col = slice(None, -1)   # skip the last column (churn)

data = data.iloc[:, without_first_col]
X = data.iloc[:, without_last_col]   # feature matrix without churn

In [5]:
# One hot encoding
# Columns that get expanded by OneHotEncoder; every column not listed here is
# passed through untouched (remainder='passthrough').
categorical_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_cols),
    remainder='passthrough',
)

# NEW feature matrix
# NOTE(review): X is overwritten with the transformer output, which presumably no
# longer carries the column names above, so this cell is not safe to re-run on
# its own - confirm.
X = column_trans.fit_transform(X)


In [6]:
# Binary encode churn
# The label lives in the last column of `data`. It is encoded to integer class
# codes and kept as a 1-D Series (not an (n, 1) DataFrame): scikit-learn
# classifiers expect a 1-D target and emit a DataConversionWarning when handed
# a column vector.
target = data.iloc[:, -1:]
y = pd.Series(
    LabelEncoder().fit_transform(target.iloc[:, 0]),
    index=target.index,        # keep row alignment with `data`
    name=target.columns[0],    # keep the original label column name
)

In [7]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so every Restart & Run All yields the same split
# (42 matches the seed already used by the randomized search in this notebook);
# stratify=y keeps the class proportions of the label equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [20]:
# Hyperparameter tuning
# Randomized search (50 sampled candidates, 3-fold CV, scored by ROC AUC) over an
# AdaBoost ensemble whose weak learner is a decision tree.
DTC = DecisionTreeClassifier()

# random_state seeds the boosting procedure (and the trees it builds) so that the
# search results are repeatable; without it only the candidate sampling below
# was seeded and the scores still varied between runs.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`
# (and the `base_estimator__*` keys to `estimator__*`) - confirm the installed
# version before upgrading.
ABC = AdaBoostClassifier(base_estimator=DTC, random_state=42)

# Search space: tree shape (depth / leaf size) and boosting schedule
# (number of rounds / learning rate).
random_grid = {
    'base_estimator__max_depth': [1, 2, 3, 4, 5, 7],
    'base_estimator__min_samples_leaf': [30, 100, 150, 200, 300],
    'n_estimators': [50, 80, 100, 150, 200],
    'learning_rate': [0.05, 0.1, 0.5, 1.0],
}

gridsearch = RandomizedSearchCV(
    estimator=ABC,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='roc_auc',
)


In [21]:
# Run the randomized search on the training split. `fit` returns the search
# object itself, so `clf` is just a second name for the fitted `gridsearch`.
gridsearch.fit(X_train, y_train)
clf = gridsearch

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   34.0s finished


In [22]:
# Show the hyperparameter combination selected by the randomized search.
clf.best_params_

{'n_estimators': 150,
 'learning_rate': 0.05,
 'base_estimator__min_samples_leaf': 200,
 'base_estimator__max_depth': 2}

In [23]:
# Overfitting check: score the fitted search object on the same rows it was
# trained on (one predict pass, no resampling) for comparison with the test scores.
predict_train = clf.predict(X_train)

acc = accuracy_score(y_true=y_train, y_pred=predict_train)
pre = precision_score(y_true=y_train, y_pred=predict_train)
rec = recall_score(y_true=y_train, y_pred=predict_train)

print('TRAIN SET SINGLE prediction scores:\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TRAIN SET SINGLE prediction scores:
Accuracy score = 0.8677366924014628
Precision score = 0.7994555353901996
Recall score = 0.6720061022120518


In [24]:
# Hold-out evaluation: a single predict pass over the test split, summarised
# with accuracy, precision and recall.
predict = clf.predict(X_test)

acc, pre, rec = (
    accuracy_score(y_test, predict),
    precision_score(y_test, predict),
    recall_score(y_test, predict),
)

print('TEST SET SINGLE:\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TEST SET SINGLE:
Accuracy score = 0.8587677725118483
Precision score = 0.7653061224489796
Recall score = 0.6720430107526881


In [25]:
# 10-fold cross-validated predictions on the hold-out split, using the best
# estimator found by the search.
# NOTE(review): cross_val_predict refits copies of `a` on folds of the test
# split, so these scores presumably describe models trained on test-set folds
# rather than the model fitted during the search - confirm this is intended.
a = clf.best_estimator_

y_predict = cross_val_predict(estimator=a, X=X_test, y=y_test, cv=10)

acc = accuracy_score(y_true=y_test, y_pred=y_predict)
pre = precision_score(y_true=y_test, y_pred=y_predict)
rec = recall_score(y_true=y_test, y_pred=y_predict)

print('TEST SET 10 FOLD CV (hold out data set)\nAccuracy score =', acc)
print('Precision score =', pre)
print('Recall score =', rec)

TEST SET 10 FOLD CV (hold out data set)
Accuracy score = 0.8497630331753554
Precision score = 0.7464212678936605
Recall score = 0.6541218637992832
