In [3]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

import jb_helper_functions_prep
from jb_helper_functions_prep import create_enc

import prep_telco
from prep_telco import prep_telco_df

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [4]:
# Load the Telco customer table through the project helper prep_telco_df (imported from prep_telco).
df = prep_telco_df()
# Preview the first rows; the frame holds the raw fields alongside their *_enc counterparts.
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,onlinesecurity_enc,onlinebackup_enc,deviceprotection_enc,techsupport_enc,streamingtv_enc,streamingmovies_enc,contract_enc,paperlessbilling_enc,paymentmethod_enc,churn_enc
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,0,2,0,0,0,0,0,1,2,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,2,0,2,0,0,0,1,0,3,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,2,2,0,0,0,0,0,1,3,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,2,0,2,2,0,0,1,0,0,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,0,0,0,0,0,0,0,1,2,1


In [5]:
# Hold out 30% of the rows as the test set. Stratifying on churn_enc keeps the
# churn / non-churn ratio aligned across both splits; the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.3,
    stratify=df[['churn_enc']],
    random_state=123,
)

In [22]:
# Class balance of the training target: share of each churn_enc value among the non-null rows.
churn_share = train.churn_enc.value_counts(normalize=True)
print('Percent of non-churn: ' + str(churn_share[0]))
print('Percent of churn: ' + str(churn_share[1]))

Percent of non-churn: 0.7342543681430312
Percent of churn: 0.2657456318569687


### In our training set, 73.4% of customers did not churn. A model that always predicts "no churn" would therefore be 73.4% accurate, so this is our baseline accuracy to beat for now.

In [6]:
# Target labels for each split. The double brackets keep them as single-column DataFrames
# rather than Series.
y_train = train[['churn_enc']]
y_test = test[['churn_enc']]

In [7]:
# List all column names of the prepared frame (raw fields plus their *_enc counterparts).
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn',
       'gender_enc', 'partner_enc', 'dependents_enc', 'phoneservice_enc',
       'multiplelines_enc', 'internetservice_enc', 'onlinesecurity_enc',
       'onlinebackup_enc', 'deviceprotection_enc', 'techsupport_enc',
       'streamingtv_enc', 'streamingmovies_enc', 'contract_enc',
       'paperlessbilling_enc', 'paymentmethod_enc', 'churn_enc'],
      dtype='object')

In [36]:
# Predictor columns used by every model below. Selecting both splits with the same list
# guarantees they carry identical features in identical order.
feature_cols = [
    'tenure',
    'monthlycharges',
    'internetservice_enc',
    'techsupport_enc',
    'contract_enc',
    'phoneservice_enc',
]
X_train = train[feature_cols]
X_test = test[feature_cols]

### Let's try a decision tree.

In [51]:
# Entropy-based decision tree, capped at depth 5, fit on the training features.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=123)
clf.fit(X_train, y_train)

# In-sample class predictions and class probabilities.
y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

# Training accuracy only; this is not an estimate of performance on unseen data.
tree_train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.6f}'.format(tree_train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.795815


In [52]:
# Per-class precision, recall and F1 of the current y_pred against the training labels
# (in file order, y_pred holds the decision tree predictions from the previous cell).
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      3614
           1       0.67      0.45      0.54      1308

   micro avg       0.80      0.80      0.80      4922
   macro avg       0.75      0.68      0.70      4922
weighted avg       0.78      0.80      0.78      4922



In [53]:
# Training-set confusion matrix as a labelled frame: rows are actual classes, columns are predicted classes.
cm_counts = confusion_matrix(y_train, y_pred)
cm = pd.DataFrame(
    cm_counts,
    index=['Actual -', 'Actual +'],
    columns=['Pred -', 'Pred +'],
)

cm

Unnamed: 0,Pred -,Pred +
Actual -,3331,283
Actual +,722,586


### Now, let's try logistic regression.

In [54]:
# Logistic regression on the same training features, optimized with the SAGA solver.
# NOTE(review): SAGA is sensitive to feature scale; whether X_train is standardized at this point
# depends on the order in which the cells were executed — confirm.
log_reg = LogisticRegression(solver='saga', random_state=123)
log_reg.fit(X_train, y_train)

# Rebinds y_pred to this model's in-sample predictions.
y_pred = log_reg.predict(X_train)

log_reg_train_accuracy = log_reg.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.6f}'.format(log_reg_train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.797643


In [55]:
# Per-class precision, recall and F1 of the current y_pred against the training labels
# (in file order, y_pred holds the logistic regression predictions from the previous cell).
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      3614
           1       0.65      0.52      0.58      1308

   micro avg       0.80      0.80      0.80      4922
   macro avg       0.74      0.71      0.72      4922
weighted avg       0.79      0.80      0.79      4922



In [56]:
# Confusion matrix of the current y_pred on the training set; rows are actual classes,
# columns are predicted classes.
row_labels = ['Actual -', 'Actual +']
col_labels = ['Pred -', 'Pred +']
cm = pd.DataFrame(confusion_matrix(y_train, y_pred), index=row_labels, columns=col_labels)

cm

Unnamed: 0,Pred -,Pred +
Actual -,3246,368
Actual +,628,680


### Let's see how these compare to a keras ANN model.

In [45]:
from sklearn.preprocessing import StandardScaler

import keras
from keras.models import Sequential
from keras.layers import Dense

# Standardize the features: the scaler is fit on the training features only, and the same
# fitted transform is then applied to the test features.
sc = StandardScaler()
# NOTE(review): these assignments rebind X_train / X_test to the scaled outputs, so every cell run
# after this one (including log_reg.score(X_test, ...) at the end) sees scaled features, while
# models fit in cells above this one in file order saw the unscaled frames — confirm this is intended.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Using TensorFlow backend.


In [50]:
# Small feed-forward network: two hidden ReLU layers of 4 units each, followed by a single
# sigmoid output unit for the binary churn label.
# Uses the Keras 2 argument names (units / kernel_initializer / epochs) in place of the
# deprecated Keras 1 aliases (output_dim / init / nb_epoch).
n_features = X_train.shape[1]  # derive the input width from the data instead of hard-coding 6

cf = Sequential()
cf.add(Dense(units=4, kernel_initializer='uniform', activation='relu', input_dim=n_features))
cf.add(Dense(units=4, kernel_initializer='uniform', activation='relu'))
cf.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

cf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cf.fit(X_train, y_train, epochs=100, batch_size=30)

# Training-set evaluation; index 1 of the result is the accuracy metric requested in compile().
scores = cf.evaluate(X_train, y_train)
print('%s: %.2f%%' % (cf.metrics_names[1], scores[1]*100))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
acc: 79.58%


### Let's run our logistic regression model on the test set.

In [57]:
# Hold-out evaluation of the logistic regression model (log_reg). The message previously
# said "Decision Tree", which mislabelled the model being scored.
# NOTE(review): X_test is rebound to standardized values in the ANN section; log_reg must have been
# fit on equally scaled training features for this score to be meaningful — confirm cell execution order.
print('Accuracy of Logistic Regression classifier on test set: {:.6f}'
     .format(log_reg.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.793839
