In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
import acquire
from acquire import get_telco_data

import prepare
from prepare import prep_telco
from prepare import split

In [3]:
df = get_telco_data()

df.head()

Unnamed: 0.1,Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,2,1,1,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,2,1,1,1,0015-UOCOJ,Female,1,No,No,7,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,3,1,1,1,0023-HGHWL,Male,1,No,No,1,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


In [4]:
df = prep_telco(df)

df.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
0,0,9,59.9,539.1,1,0,0,1,1,0,...,0,0,0,0,0,0,1,0,True,False
1,0,9,69.4,624.6,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,False,False
2,1,7,48.2,337.4,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,True,False
3,1,1,25.1,25.1,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,True,True
4,0,1,30.5,30.5,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,False,True


In [5]:
train, validate, test = split(df)

In [6]:
train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
6058,0,58,19.55,1133.9,1,1,1,1,0,1,...,0,1,0,0,0,1,0,0,False,False
4085,1,17,92.55,1573.35,0,1,0,1,1,0,...,0,0,0,1,0,1,0,0,False,False
3497,0,27,94.55,2552.85,0,0,0,1,1,0,...,1,0,0,1,0,0,1,0,True,False
4271,1,37,76.25,2821.25,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False
2672,0,7,73.6,515.2,1,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False


In [7]:
train.shape

(3943, 26)

In [8]:
validate.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
4111,0,18,85.2,1533.6,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False
616,0,31,54.35,1684.85,1,1,1,1,0,0,...,0,0,0,0,0,0,1,0,False,False
6074,0,69,20.35,1404.15,0,1,1,1,0,1,...,0,1,0,0,1,0,0,0,False,False
1737,0,26,61.55,1600.3,1,0,0,1,1,0,...,0,1,0,0,0,1,0,1,True,False
97,0,5,53.85,269.25,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,True,True


In [9]:
validate.shape

(1691, 26)

In [10]:
test.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
2317,0,47,73.6,3459.2,1,1,0,1,0,0,...,0,0,1,0,0,1,0,1,False,False
1512,0,50,80.05,4002.5,0,1,1,1,1,0,...,0,1,0,0,0,0,0,1,False,False
3533,0,54,89.4,4827.6,1,0,0,1,1,0,...,0,0,0,1,1,0,0,0,True,False
2078,0,60,37.7,2262.0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,False,False
4254,0,3,73.6,220.8,1,0,0,1,0,0,...,0,0,0,1,1,0,0,0,True,True


In [11]:
test.shape

(1409, 26)

In [12]:
train.churn.value_counts()

# largest group is customers who have not churned, that will the the baseline

0    2897
1    1046
Name: churn, dtype: int64

In [13]:
# train['baseline'] = 0

# train.head()

In [14]:
# baseline_accuracy = (train.churn == train.baseline).mean()
# baseline_accuracy

# baseline accuracy is 73%

In [15]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

# Decision Tree 

In [16]:
clf = DecisionTreeClassifier(max_depth=3, random_state=319)

In [17]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [18]:
import graphviz
from graphviz import Graph

In [19]:
#dot_data = export_graphviz(clf, feature_names= X_train.columns, class_names=clf.classes_, rounded=True, filled=True, out_file=None)
#graph = graphviz.Source(dot_data) 

#graph.render('telco_decision_tree', view=True)

In [20]:
# make prediction on train obeservations

y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 1], dtype=uint8)

In [21]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [22]:
# make probability

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.87184284, 0.12815716],
       [0.60355987, 0.39644013],
       [0.60355987, 0.39644013],
       [0.60355987, 0.39644013],
       [0.35681818, 0.64318182]])

## Evaluate Model

In [23]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

# this model is only 6% better than baseline on train data

Accuracy of Decision Tree classifier on training set: 0.79


In [24]:
# confusion matrix

confusion_matrix(y_train, y_pred)

array([[2591,  306],
       [ 516,  530]])

In [25]:
y_train.value_counts()

0    2897
1    1046
Name: churn, dtype: int64

In [26]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2591,306
1,516,530


In [27]:
print(classification_report(y_train, y_pred))

# model does well when not churn is the positive, poorly when predicting actual churn

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      2897
           1       0.63      0.51      0.56      1046

    accuracy                           0.79      3943
   macro avg       0.73      0.70      0.71      3943
weighted avg       0.78      0.79      0.78      3943



# Decision Tree 2

In [28]:
clf2 = DecisionTreeClassifier(max_depth=5, random_state=319)

# increasing the max depth from 3 to 5

In [29]:
# model.fit(X, y)

clf2 = clf2.fit(X_train, y_train)

In [30]:
# make prediction on train obeservations

y_pred = clf2.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 1, 1], dtype=uint8)

In [31]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [32]:
# make probability

y_pred_proba = clf2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.94799054, 0.05200946],
       [0.455     , 0.545     ],
       [0.73305085, 0.26694915],
       [0.455     , 0.545     ],
       [0.17117117, 0.82882883]])

## Evaluate Model

In [33]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

# this model performs 8% better than baseline on test data

Accuracy of Decision Tree classifier on training set: 0.81


In [34]:
# confusion matrix

confusion_matrix(y_train, y_pred)



array([[2621,  276],
       [ 482,  564]])

In [35]:
y_train.value_counts()

0    2897
1    1046
Name: churn, dtype: int64

In [36]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2621,276
1,482,564


In [37]:
print(classification_report(y_train, y_pred))

# this model does barely better on predicting churn

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      2897
           1       0.67      0.54      0.60      1046

    accuracy                           0.81      3943
   macro avg       0.76      0.72      0.74      3943
weighted avg       0.80      0.81      0.80      3943



# Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [39]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=319)

In [40]:
# fit model

rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=319)

In [41]:
print(rf.feature_importances_)

[1.42422658e-02 1.89213823e-01 7.40442336e-02 8.94407617e-02
 7.32653418e-05 6.84234234e-03 2.31269146e-03 1.80131535e-03
 1.59909626e-03 7.57238741e-02 6.36953300e-03 4.27071431e-04
 1.00419023e-02 2.92484559e-03 1.90448694e-03 1.84491721e-02
 2.36649665e-02 1.36695315e-01 1.52865050e-01 6.76523979e-04
 9.37411629e-02 9.00874557e-03 4.50759610e-04 1.24355300e-03
 8.62432430e-02]


In [42]:
# make predictions

y_pred = rf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 1], dtype=uint8)

In [43]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [44]:
# make probabilities 

y_pred_proba = rf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.87116716, 0.12883284],
       [0.55359739, 0.44640261],
       [0.69012759, 0.30987241],
       [0.60791167, 0.39208833],
       [0.48932675, 0.51067325]])

## Evaluate Model

In [45]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

# this model is only 5% more accurate than baseline

Accuracy of random forest classifier on training set: 0.78


In [46]:
print(confusion_matrix(y_train, y_pred))

[[2797  100]
 [ 758  288]]


In [47]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2797,100
1,758,288


In [48]:
print(classification_report(y_train, y_pred))

# once again this model does well on not churn, but poorly on prediction actual churn

              precision    recall  f1-score   support

           0       0.79      0.97      0.87      2897
           1       0.74      0.28      0.40      1046

    accuracy                           0.78      3943
   macro avg       0.76      0.62      0.63      3943
weighted avg       0.77      0.78      0.74      3943



# Random Forest 2

In [49]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=319)

# increasing the max depth from 3 to 5

# decreasing min samples leaf to 1

In [50]:
# fit model

rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=319)

In [51]:
print(rf2.feature_importances_)

[0.01438008 0.18309855 0.08668078 0.09630564 0.00208259 0.00577958
 0.00257101 0.00625919 0.00679347 0.06096092 0.01189158 0.00262881
 0.01666048 0.00542137 0.00585107 0.01692517 0.03219332 0.11964261
 0.15742166 0.00167311 0.08033083 0.00737107 0.00456037 0.00273923
 0.06977752]


In [52]:
# make predictions

y_pred = rf2.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 0, 1], dtype=uint8)

In [53]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [54]:
# make probabilities 

y_pred_proba = rf2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.9436172 , 0.0563828 ],
       [0.49007266, 0.50992734],
       [0.62531331, 0.37468669],
       [0.53150998, 0.46849002],
       [0.40276028, 0.59723972]])

## Evaluate Model

In [55]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

# this model is only 7% more accurate than baseline

Accuracy of random forest classifier on training set: 0.80


In [56]:
print(confusion_matrix(y_train, y_pred))

[[2737  160]
 [ 614  432]]


In [57]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2737,160
1,614,432


In [58]:
print(classification_report(y_train, y_pred))

# once again this model does well on not churn, but poorly on prediction actual churn, however this perfoms better
# than random forest 1 on train data

              precision    recall  f1-score   support

           0       0.82      0.94      0.88      2897
           1       0.73      0.41      0.53      1046

    accuracy                           0.80      3943
   macro avg       0.77      0.68      0.70      3943
weighted avg       0.79      0.80      0.78      3943



# KNN

In [59]:
from sklearn.neighbors import KNeighborsClassifier

In [60]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [61]:
# fit model

knn.fit(X_train, y_train)

KNeighborsClassifier()

In [62]:
# make predictions

y_pred = knn.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 0], dtype=uint8)

In [63]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [64]:
# make probabilities

y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

array([[1. , 0. ],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4]])

## Evaluate Model

In [65]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

# this model is 10% more accurate than baseline on train data

Accuracy of KNN classifier on training set: 0.83


In [66]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[2673  224]
 [ 453  593]]


In [67]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2673,224
1,453,593


In [68]:
print(classification_report(y_train, y_pred))

# does great on not churn, not significant increase on actual churn prediction

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      2897
           1       0.73      0.57      0.64      1046

    accuracy                           0.83      3943
   macro avg       0.79      0.74      0.76      3943
weighted avg       0.82      0.83      0.82      3943



# KNN 2

In [69]:
# weights = ['uniform', 'density']
knn2 = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# decreased n_neighbors from 5 to 3

In [70]:
# fit model

knn2.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [71]:
# make predictions

y_pred = knn2.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 0], dtype=uint8)

In [72]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [73]:
# make probabilities

y_pred_proba = knn2.predict_proba(X_train)
y_pred_proba[0:5]

array([[1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.66666667, 0.33333333]])

## Evaluate Model

In [74]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn2.score(X_train, y_train)))

# this model is 13% more accurate than baseline on train data

Accuracy of KNN classifier on training set: 0.86


In [75]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[2696  201]
 [ 352  694]]


In [76]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2696,201
1,352,694


In [77]:
print(classification_report(y_train, y_pred))

# does well on on not churn, so far best actual churn prediction on train data

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      2897
           1       0.78      0.66      0.72      1046

    accuracy                           0.86      3943
   macro avg       0.83      0.80      0.81      3943
weighted avg       0.86      0.86      0.86      3943



# Logistic Regression

In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=319, intercept_scaling=1, solver='lbfgs')

In [80]:
# fit model

logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=319)

In [81]:
# feature importance

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 8.40553635e-01 -1.91295786e-02  4.85095930e-02 -6.08504045e-05
  -2.24983596e-01  3.23400598e-01  7.72694531e-02 -8.34936564e-01
   3.27119935e-02  8.94675283e-02 -3.57473032e-01 -2.12375133e-01
  -6.01594522e-01 -9.36040704e-02 -1.43092433e-01 -7.37080174e-02
  -4.17513219e-01 -1.39076053e+00 -2.52347895e-01  3.50759155e-01
   6.66598289e-01 -1.48521466e-01 -1.91825008e-01  4.90440806e-01
   1.29767821e+00]]
Intercept: 
 [1.46125366]


In [82]:
# make predictions

y_pred = logit.predict(X_train)
y_pred[0:5]

array([1, 1, 1, 1, 1], dtype=uint8)

In [83]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [84]:
# make probabilities

y_pred_proba = logit.predict_proba(X_train)
y_pred_proba

array([[0.30223092, 0.69776908],
       [0.00248387, 0.99751613],
       [0.01270545, 0.98729455],
       ...,
       [0.12194129, 0.87805871],
       [0.01602493, 0.98397507],
       [0.02122215, 0.97877785]])

## Evaluate Model

In [85]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

# very poor accuracy, directly inverse of baseline

Accuracy of Logistic Regression classifier on training set: 0.34


In [86]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[ 286 2611]
 [   2 1044]]


In [87]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,286,2611
1,2,1044


In [88]:
print(classification_report(y_train, y_pred))

# very poor model

              precision    recall  f1-score   support

           0       0.99      0.10      0.18      2897
           1       0.29      1.00      0.44      1046

    accuracy                           0.34      3943
   macro avg       0.64      0.55      0.31      3943
weighted avg       0.81      0.34      0.25      3943



# Logistic Regression 2

In [89]:
logit2 = LogisticRegression(C=.1, class_weight={0:1, 1:99}, random_state=319, intercept_scaling=1, solver='lbfgs')

# decreased C from 1 to .1

In [90]:
# fit model 
logit2.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, random_state=319)

In [91]:
# feature importance
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[ 7.90466908e-01 -1.92804006e-02  4.92441768e-02 -6.94067477e-05
  -2.37865915e-01  3.03977203e-01  7.72863241e-02 -8.06714831e-01
   4.28184085e-02  6.89822506e-02 -3.34105954e-01 -2.06445064e-01
  -5.69451386e-01 -9.84192216e-02 -1.44591620e-01 -9.69402719e-02
  -4.06330302e-01 -1.33686340e+00 -2.38247078e-01  3.50388861e-01
   6.13320230e-01 -1.58770657e-01 -1.93030638e-01  4.54844815e-01
   1.22145798e+00]]
Intercept: 
 [1.42535728]


In [92]:
# moke predictions
y_pred2 = logit2.predict(X_train)
y_pred2[0:5]

array([1, 1, 1, 1, 1], dtype=uint8)

In [93]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [94]:
# make probabilities
y_pred_proba2 = logit2.predict_proba(X_train)
y_pred_proba2[0:5]

array([[0.33029921, 0.66970079],
       [0.00270052, 0.99729948],
       [0.01289906, 0.98710094],
       [0.00649256, 0.99350744],
       [0.00894954, 0.99105046]])

## Evaluate Model

In [95]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))
# very poor accuracy once again

Accuracy of Logistic Regression classifier on training set: 0.34


In [96]:
# confusion matrix
print(confusion_matrix(y_train, y_pred2))

[[ 284 2613]
 [   2 1044]]


In [97]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,286,2611
1,2,1044


In [98]:
print(classification_report(y_train, y_pred2))

# practically identical to linear regression 1

              precision    recall  f1-score   support

           0       0.99      0.10      0.18      2897
           1       0.29      1.00      0.44      1046

    accuracy                           0.34      3943
   macro avg       0.64      0.55      0.31      3943
weighted avg       0.81      0.34      0.25      3943



# Validate

## Decision Tree 2

In [None]:
## decision tree 2 performed best on train data

In [106]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf2.score(X_validate, y_validate)))

# slightly worse than on train data, but the same as decision tree 1 on validate

Accuracy of Decision Tree classifier on validate set: 0.79


In [107]:
# Produce y_predictions that come from the X_validate
y_pred = clf2.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1242
           1       0.63      0.52      0.57       449

    accuracy                           0.79      1691
   macro avg       0.74      0.70      0.72      1691
weighted avg       0.78      0.79      0.79      1691



## Random Forest 2

In [None]:
## random forest 2 performed best on train data

In [110]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

# roughly the same as train data

Accuracy of random forest classifier on test set: 0.80


In [120]:
# Produce y_predictions that come from the X_validate
y_pred = rf2.predict(X_validate)

# Confusion Matrix
print(confusion_matrix(y_validate, y_pred))

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

[[1158   84]
 [ 259  190]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1242
           1       0.69      0.42      0.53       449

    accuracy                           0.80      1691
   macro avg       0.76      0.68      0.70      1691
weighted avg       0.78      0.80      0.78      1691



## KNN 1

In [None]:
## knn 1 performed best on train data

In [111]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

# marginally worse than on train data

Accuracy of KNN classifier on test set: 0.76


In [121]:
# Produce y_predictions that come from the X_validate
y_pred = knn.predict(X_validate)

# Confusion Matrix
print(confusion_matrix(y_validate, y_pred))

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(classification_report(y_validate, y_pred))

[[1074  168]
 [ 246  203]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.84      1242
           1       0.55      0.45      0.50       449

    accuracy                           0.76      1691
   macro avg       0.68      0.66      0.67      1691
weighted avg       0.74      0.76      0.75      1691



## Logistic Regression 1

In [None]:
## logistic regression 1 performed best on train data

In [113]:
# make predictions

y_pred1 = logit.predict(X_validate)


print("Model 1: solver = lbfgs, c = 1")

# accuracy of model 1
print('Accuracy: {:.2f}'.format(logit.score(X_validate, y_validate)))

# confusion matrix of model 1
print(confusion_matrix(y_validate, y_pred1))

# classification report of model 1
print(classification_report(y_validate, y_pred1))

print("Model 2: solver = lbfgs, c = .1")

# slightly worse than on train data, stil dismal

Model 1: solver = lbfgs, c = 1
Accuracy: 0.33
[[ 116 1126]
 [   2  447]]
              precision    recall  f1-score   support

           0       0.98      0.09      0.17      1242
           1       0.28      1.00      0.44       449

    accuracy                           0.33      1691
   macro avg       0.63      0.54      0.31      1691
weighted avg       0.80      0.33      0.24      1691

Model 2: solver = lbfgs, c = .1


# Test

## Random Forest 2

In [123]:
## Random Forest 2 performed best on validate data out of all models

In [130]:
def get_metrics_binary(rf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = rf.score(X_test, y_test)
    class_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_test, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [126]:
# Produce y_predictions that come from the X_validate
y_pred = rf2.predict(X_test)
y_pred[0:5]

array([0, 0, 0, 0, 1], dtype=uint8)

In [127]:
y_test.head()

2317    0
1512    0
3533    0
2078    0
4254    0
Name: churn, dtype: uint8

In [128]:
# make probabilities 

y_pred_proba = rf2.predict_proba(X_test)
y_pred_proba[0:5]

array([[0.90441984, 0.09558016],
       [0.88597867, 0.11402133],
       [0.697062  , 0.302938  ],
       [0.96069067, 0.03930933],
       [0.38494458, 0.61505542]])

In [131]:
get_metrics_binary(rf2)

# final product is 6% higher than baseline prediction


    The accuracy for our model is 0.7942
    The True Positive Rate is 0.404, The False Positive Rate is 0.0647,
    The True Negative Rate is 0.935, and the False Negative Rate is 0.596
    


Unnamed: 0,precision,recall,f1-score,support
0,0.812762,0.935266,0.869721,1035.0
1,0.692661,0.403743,0.510135,374.0
accuracy,0.79418,0.79418,0.79418,0.79418
macro avg,0.752711,0.669505,0.689928,1409.0
weighted avg,0.780883,0.79418,0.774274,1409.0
