In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

import jb_helper_functions_prep
from jb_helper_functions_prep import create_enc

import prep_telco
from prep_telco import prep_telco_df

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [2]:
df = prep_telco_df()
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,onlinesecurity_enc,onlinebackup_enc,deviceprotection_enc,techsupport_enc,streamingtv_enc,streamingmovies_enc,contract_enc,paperlessbilling_enc,paymentmethod_enc,churn_enc
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,0,2,0,0,0,0,0,1,2,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,2,0,2,0,0,0,1,0,3,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,2,2,0,0,0,0,0,1,3,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,2,0,2,2,0,0,1,0,0,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,0,0,0,0,0,0,0,1,2,1


In [3]:
df.internetservice.unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [4]:
fiber_df = df[df.internetservice == 'Fiber optic']

In [5]:
fiber_train, fiber_test = train_test_split(fiber_df, test_size=.3, random_state=123, stratify=fiber_df[['churn_enc']])

In [6]:
print('Percent of fiber_non-churn: ' + str(fiber_train.churn_enc.value_counts()[0]/fiber_train.churn_enc.count()))
print('Percent of fiber_churn: ' + str(fiber_train.churn_enc.value_counts()[1]/fiber_train.churn_enc.count()))

Percent of fiber_non-churn: 0.5809875403784033
Percent of fiber_churn: 0.41901245962159667


### We see that our train set has 58.1% of customers that did not churn.  This is our benchmark for now.  

In [7]:
fiber_y_train = fiber_train[['churn_enc']]
fiber_y_test = fiber_test[['churn_enc']]

In [8]:
fiber_X_train = fiber_train[['tenure', 'monthlycharges', 'internetservice_enc', 'techsupport_enc', 'contract_enc', 'phoneservice_enc']]
fiber_X_test = fiber_test[['tenure', 'monthlycharges', 'internetservice_enc', 'techsupport_enc', 'contract_enc', 'phoneservice_enc']]

### Let's try a decision tree.

In [12]:
fiber_clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=123).fit(fiber_X_train, fiber_y_train)
fiber_y_pred = fiber_clf.predict(fiber_X_train)
fiber_y_pred_proba = fiber_clf.predict_proba(fiber_X_train)
print('Accuracy of Decision Tree classifier on fiber_training set: {:.6f}'
     .format(fiber_clf.score(fiber_X_train, fiber_y_train)))


Accuracy of Decision Tree classifier on fiber_training set: 0.713429


In [13]:
print(classification_report(fiber_y_train, fiber_y_pred))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1259
           1       0.66      0.64      0.65       908

   micro avg       0.71      0.71      0.71      2167
   macro avg       0.71      0.70      0.70      2167
weighted avg       0.71      0.71      0.71      2167



In [14]:
fiber_cm = pd.DataFrame(confusion_matrix(fiber_y_train, fiber_y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

fiber_cm

Unnamed: 0,Pred -,Pred +
Actual -,965,294
Actual +,327,581


### Now, let's try logistic regression.

In [15]:
fiber_log_reg = LogisticRegression(random_state=123, solver='saga').fit(fiber_X_train, fiber_y_train)
fiber_y_pred = fiber_log_reg.predict(fiber_X_train)
print('Accuracy of Logistic Regression classifier on training set: {:.6f}'
     .format(fiber_log_reg.score(fiber_X_train, fiber_y_train)))

Accuracy of Logistic Regression classifier on training set: 0.701892


In [16]:
print(classification_report(fiber_y_train, fiber_y_pred))

              precision    recall  f1-score   support

           0       0.76      0.72      0.74      1259
           1       0.63      0.68      0.66       908

   micro avg       0.70      0.70      0.70      2167
   macro avg       0.70      0.70      0.70      2167
weighted avg       0.71      0.70      0.70      2167



In [17]:
fiber_cm = pd.DataFrame(confusion_matrix(fiber_y_train, fiber_y_pred),
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

fiber_cm

Unnamed: 0,Pred -,Pred +
Actual -,903,356
Actual +,290,618


### Let's see how these compare to a keras ANN model.

In [18]:
from sklearn.preprocessing import StandardScaler

import keras
from keras.models import Sequential
from keras.layers import Dense

sc = StandardScaler()
fiber_X_train = sc.fit_transform(fiber_X_train)
fiber_X_test = sc.transform(fiber_X_test)

Using TensorFlow backend.


In [20]:
cf = Sequential()
cf.add(Dense(output_dim=4, init='uniform', activation='relu', input_dim=6))
cf.add(Dense(output_dim=4, init='uniform', activation='relu'))
cf.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))

cf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cf.fit(fiber_X_train, fiber_y_train, nb_epoch=100, batch_size=30)

scores = cf.evaluate(fiber_X_train, fiber_y_train)
print('%s: %.2f%%' % (cf.metrics_names[1], scores[1]*100))

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
acc: 71.11%


### Let's run our logistic regression model on the test set.

In [21]:
print('Accuracy of Decision Tree classifier on test set: {:.6f}'
     .format(fiber_clf.score(fiber_X_test, fiber_y_test)))

Accuracy of Decision Tree classifier on test set: 0.645856
