In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
import acquire
from acquire import get_telco_data

import prepare
from prepare import prep_telco
from prepare import split

In [3]:
df = get_telco_data()

df.head()

Unnamed: 0.1,Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,2,1,1,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
1,1,4,1,1,0013-MHZWF,Female,0,No,Yes,9,...,Yes,Yes,Yes,Yes,69.4,571.45,No,Month-to-month,DSL,Credit card (automatic)
2,2,1,1,1,0015-UOCOJ,Female,1,No,No,7,...,No,No,No,Yes,48.2,340.35,No,Month-to-month,DSL,Electronic check
3,3,1,1,1,0023-HGHWL,Male,1,No,No,1,...,No,No,No,Yes,25.1,25.1,Yes,Month-to-month,DSL,Electronic check
4,4,3,1,1,0032-PGELS,Female,0,Yes,Yes,1,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)


In [4]:
df = prep_telco(df)

df.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
0,0,9,59.9,539.1,1,0,0,1,1,0,...,0,0,0,0,0,0,1,0,True,False
1,0,9,69.4,624.6,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,False,False
2,1,7,48.2,337.4,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,True,False
3,1,1,25.1,25.1,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,True,True
4,0,1,30.5,30.5,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,False,True


In [5]:
train, validate, test = split(df)

In [6]:
train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
6058,0,58,19.55,1133.9,1,1,1,1,0,1,...,0,1,0,0,0,1,0,0,False,False
4085,1,17,92.55,1573.35,0,1,0,1,1,0,...,0,0,0,1,0,1,0,0,False,False
3497,0,27,94.55,2552.85,0,0,0,1,1,0,...,1,0,0,1,0,0,1,0,True,False
4271,1,37,76.25,2821.25,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False
2672,0,7,73.6,515.2,1,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False


In [7]:
train.shape

(3943, 26)

In [8]:
validate.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
4111,0,18,85.2,1533.6,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,True,False
616,0,31,54.35,1684.85,1,1,1,1,0,0,...,0,0,0,0,0,0,1,0,False,False
6074,0,69,20.35,1404.15,0,1,1,1,0,1,...,0,1,0,0,1,0,0,0,False,False
1737,0,26,61.55,1600.3,1,0,0,1,1,0,...,0,1,0,0,0,1,0,1,True,False
97,0,5,53.85,269.25,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,True,True


In [9]:
validate.shape

(1691, 26)

In [10]:
test.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,churn,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer
2317,0,47,73.6,3459.2,1,1,0,1,0,0,...,0,0,1,0,0,1,0,1,False,False
1512,0,50,80.05,4002.5,0,1,1,1,1,0,...,0,1,0,0,0,0,0,1,False,False
3533,0,54,89.4,4827.6,1,0,0,1,1,0,...,0,0,0,1,1,0,0,0,True,False
2078,0,60,37.7,2262.0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,False,False
4254,0,3,73.6,220.8,1,0,0,1,0,0,...,0,0,0,1,1,0,0,0,True,True


In [11]:
test.shape

(1409, 26)

In [12]:
train.churn.value_counts()

# largest group is customers who have not churned, that will the the baseline

0    2897
1    1046
Name: churn, dtype: int64

In [13]:
train['baseline'] = 0

train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,male,partner,dependents,phone_service,multiple_lines,no_internet_service,...,one_year,two_year,fiber_optic,credit_card,electric_check,mailed_check,online_backup,live_alone,new_customer,baseline
6058,0,58,19.55,1133.9,1,1,1,1,0,1,...,1,0,0,0,1,0,0,False,False,0
4085,1,17,92.55,1573.35,0,1,0,1,1,0,...,0,0,1,0,1,0,0,False,False,0
3497,0,27,94.55,2552.85,0,0,0,1,1,0,...,0,0,1,0,0,1,0,True,False,0
4271,1,37,76.25,2821.25,0,0,0,1,1,0,...,0,0,1,0,1,0,0,True,False,0
2672,0,7,73.6,515.2,1,0,0,1,1,0,...,0,0,1,0,1,0,0,True,False,0


In [14]:
baseline_accuracy = (train.churn == train.baseline).mean()
baseline_accuracy

# baseline accuracy is 73%

0.7347197565305605

In [15]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['churn'])
y_train = train.churn

X_validate = validate.drop(columns=['churn'])
y_validate = validate.churn

X_test = test.drop(columns=['churn'])
y_test = test.churn

# Decision Tree 

In [16]:
clf = DecisionTreeClassifier(max_depth=3, random_state=319)

In [17]:
train.isna().sum()

senior_citizen         0
tenure                 0
monthly_charges        0
total_charges          0
male                   0
partner                0
dependents             0
phone_service          0
multiple_lines         0
no_internet_service    0
online_security        0
device_protection      0
tech_support           0
streaming_tv           0
streaming_movies       0
paperless              0
churn                  0
one_year               0
two_year               0
fiber_optic            0
credit_card            0
electric_check         0
mailed_check           0
online_backup          0
live_alone             0
new_customer           0
baseline               0
dtype: int64

In [18]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [19]:
import graphviz
from graphviz import Graph

In [20]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, class_names=clf.classes_, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('telco_decision_tree', view=True)

TypeError: can only concatenate str (not "numpy.uint8") to str

In [21]:
# make prediction on train obeservations

y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 1], dtype=uint8)

In [22]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [23]:
# make probability

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.87184284, 0.12815716],
       [0.60355987, 0.39644013],
       [0.60355987, 0.39644013],
       [0.60355987, 0.39644013],
       [0.35681818, 0.64318182]])

## Evaluate Model

In [24]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

# this model is only 6% better than baseline on train data

Accuracy of Decision Tree classifier on training set: 0.79


In [25]:
# confusion matrix

confusion_matrix(y_train, y_pred)

array([[2591,  306],
       [ 516,  530]])

In [26]:
y_train.value_counts()

0    2897
1    1046
Name: churn, dtype: int64

In [27]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2591,306
1,516,530


In [28]:
print(classification_report(y_train, y_pred))

# model does well when not churn is the positive, poorly when predicting actual churn

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      2897
           1       0.63      0.51      0.56      1046

    accuracy                           0.79      3943
   macro avg       0.73      0.70      0.71      3943
weighted avg       0.78      0.79      0.78      3943



# Decision Tree 2

In [31]:
clf2 = DecisionTreeClassifier(max_depth=5, random_state=319)

# increasing the max depth from 3 to 5

In [32]:
# model.fit(X, y)

clf2 = clf2.fit(X_train, y_train)

In [36]:
# make prediction on train obeservations

y_pred = clf2.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 1, 1], dtype=uint8)

In [37]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [38]:
# make probability

y_pred_proba = clf2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.94799054, 0.05200946],
       [0.455     , 0.545     ],
       [0.73305085, 0.26694915],
       [0.455     , 0.545     ],
       [0.17117117, 0.82882883]])

## Evaluate Model

In [40]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

# this model performs 8% better than baseline on test data

Accuracy of Decision Tree classifier on training set: 0.81


In [43]:
# confusion matrix

confusion_matrix(y_train, y_pred)



array([[2621,  276],
       [ 482,  564]])

In [44]:
y_train.value_counts()

0    2897
1    1046
Name: churn, dtype: int64

In [45]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2621,276
1,482,564


In [46]:
print(classification_report(y_train, y_pred))

# this model does barely better on predicting churn

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      2897
           1       0.67      0.54      0.60      1046

    accuracy                           0.81      3943
   macro avg       0.76      0.72      0.74      3943
weighted avg       0.80      0.81      0.80      3943



# Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=319)

In [99]:
# fit model

rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=319)

In [100]:
print(rf.feature_importances_)

[0.01477973 0.20074457 0.07822204 0.06760072 0.00025121 0.00461544
 0.00596327 0.00270687 0.00562703 0.04872635 0.00788538 0.00316212
 0.0161871  0.00321462 0.0040342  0.01274351 0.02781933 0.12530479
 0.12184311 0.00190414 0.13840112 0.00486091 0.00224873 0.00484619
 0.09630752 0.        ]


In [101]:
# make predictions

y_pred = rf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 0, 0], dtype=uint8)

In [102]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [103]:
# make probabilities 

y_pred_proba = rf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.85227274, 0.14772726],
       [0.53361067, 0.46638933],
       [0.67764149, 0.32235851],
       [0.59702649, 0.40297351],
       [0.50058982, 0.49941018]])

## Evaluate Model

In [104]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

# this model is only 6% more accurate than baseline

Accuracy of random forest classifier on training set: 0.79


In [105]:
print(confusion_matrix(y_train, y_pred))

[[2804   93]
 [ 754  292]]


In [106]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2804,93
1,754,292


In [107]:
print(classification_report(y_train, y_pred))

# once again this model does well on not churn, but poorly on prediction actual churn

              precision    recall  f1-score   support

           0       0.79      0.97      0.87      2897
           1       0.76      0.28      0.41      1046

    accuracy                           0.79      3943
   macro avg       0.77      0.62      0.64      3943
weighted avg       0.78      0.79      0.75      3943



# Random Forest 2

In [108]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=319)

# increasing the max depth from 3 to 5

# decreasing min samples leaf to 1

In [109]:
# fit model

rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, random_state=319)

In [110]:
print(rf2.feature_importances_)

[0.01637743 0.18711647 0.09925649 0.08551863 0.00334335 0.00377875
 0.00631879 0.00541901 0.00914385 0.04593024 0.01307205 0.00549366
 0.0219625  0.00741651 0.00536805 0.01334691 0.0385116  0.09744266
 0.11930906 0.00208421 0.11430314 0.0072297  0.00375168 0.00469356
 0.08381172 0.        ]


In [111]:
# make predictions

y_pred = rf2.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 0, 1], dtype=uint8)

In [112]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [113]:
# make probabilities 

y_pred_proba = rf2.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.93255756, 0.06744244],
       [0.46174227, 0.53825773],
       [0.62730627, 0.37269373],
       [0.54282829, 0.45717171],
       [0.42197603, 0.57802397]])

## Evaluate Model

In [114]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

# this model is only 8% more accurate than baseline

Accuracy of random forest classifier on training set: 0.81


In [70]:
print(confusion_matrix(y_train, y_pred))

[[2725  172]
 [ 605  441]]


In [115]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2727,170
1,597,449


In [116]:
print(classification_report(y_train, y_pred))

# once again this model does well on not churn, but poorly on prediction actual churn, however this perfoms better
# than random forest 1 on train data

              precision    recall  f1-score   support

           0       0.82      0.94      0.88      2897
           1       0.73      0.43      0.54      1046

    accuracy                           0.81      3943
   macro avg       0.77      0.69      0.71      3943
weighted avg       0.80      0.81      0.79      3943



# KNN

In [73]:
from sklearn.neighbors import KNeighborsClassifier

In [74]:
# weights = ['uniform', 'density']
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [75]:
# fit model

knn.fit(X_train, y_train)

KNeighborsClassifier()

In [76]:
# make predictions

y_pred = knn.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 0], dtype=uint8)

In [77]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [78]:
# make probabilities

y_pred_proba = knn.predict_proba(X_train)
y_pred_proba[0:5]

array([[1. , 0. ],
       [0.6, 0.4],
       [0.8, 0.2],
       [0.4, 0.6],
       [0.6, 0.4]])

## Evaluate Model

In [79]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

# this model is 10% more accurate than baseline on train data

Accuracy of KNN classifier on training set: 0.83


In [80]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[2673  224]
 [ 453  593]]


In [81]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2673,224
1,453,593


In [82]:
print(classification_report(y_train, y_pred))

# does great on not churn, not significant increase on actual churn prediction

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      2897
           1       0.73      0.57      0.64      1046

    accuracy                           0.83      3943
   macro avg       0.79      0.74      0.76      3943
weighted avg       0.82      0.83      0.82      3943



# KNN 2

In [83]:
# weights = ['uniform', 'density']
knn2 = KNeighborsClassifier(n_neighbors=3, weights='uniform')

# decreased n_neighbors from 5 to 3

In [84]:
# fit model

knn2.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [90]:
# make predictions

y_pred = knn2.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 0], dtype=uint8)

In [86]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [91]:
# make probabilities

y_pred_proba = knn2.predict_proba(X_train)
y_pred_proba[0:5]

array([[1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.66666667, 0.33333333]])

## Evaluate Model

In [88]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn2.score(X_train, y_train)))

# this model is 13% more accurate than baseline on train data

Accuracy of KNN classifier on training set: 0.86


In [92]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[2696  201]
 [ 352  694]]


In [93]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2696,201
1,352,694


In [94]:
print(classification_report(y_train, y_pred))

# does well on on not churn, so far best actual churn prediction on train data

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      2897
           1       0.78      0.66      0.72      1046

    accuracy                           0.86      3943
   macro avg       0.83      0.80      0.81      3943
weighted avg       0.86      0.86      0.86      3943



# Logistic Regression

In [95]:
from sklearn.linear_model import LogisticRegression

In [97]:
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=319, intercept_scaling=1, solver='lbfgs')

In [117]:
# fit model

logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=319)

In [118]:
# feature importance

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 8.39626553e-01 -1.90513394e-02  4.85218636e-02 -6.17124076e-05
  -2.25033360e-01  3.22400673e-01  7.72355287e-02 -8.34382439e-01
   3.31359066e-02  8.94670157e-02 -3.56891657e-01 -2.12515220e-01
  -6.00804766e-01 -9.39511405e-02 -1.43409652e-01 -7.45894327e-02
  -4.17711916e-01 -1.39005072e+00 -2.52379814e-01  3.51094737e-01
   6.65190082e-01 -1.48500685e-01 -1.92154121e-01  4.90756832e-01
   1.29682197e+00  0.00000000e+00]]
Intercept: 
 [1.46086714]


In [119]:
# make predictions

y_pred = logit.predict(X_train)
y_pred[0:5]

array([1, 1, 1, 1, 1], dtype=uint8)

In [120]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [121]:
# make probabilities

y_pred_proba = logit.predict_proba(X_train)
y_pred_proba

array([[0.30214752, 0.69785248],
       [0.00249184, 0.99750816],
       [0.01270015, 0.98729985],
       ...,
       [0.12184524, 0.87815476],
       [0.01601162, 0.98398838],
       [0.02120514, 0.97879486]])

## Evaluate Model

In [122]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

# very poor accuracy, directly inverse of baseline

Accuracy of Logistic Regression classifier on training set: 0.34


In [123]:
# confusion matrix

print(confusion_matrix(y_train, y_pred))

[[ 286 2611]
 [   2 1044]]


In [124]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,286,2611
1,2,1044


In [125]:
print(classification_report(y_train, y_pred))

# very poor model

              precision    recall  f1-score   support

           0       0.99      0.10      0.18      2897
           1       0.29      1.00      0.44      1046

    accuracy                           0.34      3943
   macro avg       0.64      0.55      0.31      3943
weighted avg       0.81      0.34      0.25      3943



# Logistic Regression 2

In [128]:
logit2 = LogisticRegression(C=.1, class_weight={0:1, 1:99}, random_state=319, intercept_scaling=1, solver='lbfgs')

# decreased C from 1 to .1

In [129]:
# fit model 
logit2.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, random_state=319)

In [130]:
# feature importance
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[ 7.97117772e-01 -2.43220341e-02  4.75830183e-02 -2.53281115e-05
  -1.42732962e-01  2.85659355e-01  5.45200514e-02 -7.35193979e-01
   1.15278094e-02  1.29615161e-01 -3.60077654e-01 -2.01565996e-01
  -5.84807064e-01 -7.56050162e-02 -1.25614171e-01 -5.71824621e-03
  -3.70134312e-01 -1.27563732e+00 -2.36906866e-01  2.87320813e-01
   6.62157251e-01 -8.98614557e-02 -1.63633267e-01  5.13013660e-01
   1.24928915e+00  0.00000000e+00]]
Intercept: 
 [1.40705283]


In [133]:
# moke predictions
y_pred2 = logit2.predict(X_train)
y_pred2[0:5]

array([1, 1, 1, 1, 1], dtype=uint8)

In [132]:
# check actual results

y_train.head()

6058    0
4085    0
3497    1
4271    1
2672    1
Name: churn, dtype: uint8

In [134]:
# make probabilities
y_pred_proba2 = logit2.predict_proba(X_train)
y_pred_proba2[0:5]

array([[0.31516629, 0.68483371],
       [0.00263835, 0.99736165],
       [0.01165479, 0.98834521],
       [0.00624392, 0.99375608],
       [0.00818101, 0.99181899]])

## Evaluate Model

In [135]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))
# very poor accuracy once again

Accuracy of Logistic Regression classifier on training set: 0.34


In [136]:
# confusion matrix
print(confusion_matrix(y_train, y_pred2))

[[ 288 2609]
 [   2 1044]]


In [137]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,286,2611
1,2,1044


In [138]:
print(classification_report(y_train, y_pred2))

# practically identical to linear regression 1

              precision    recall  f1-score   support

           0       0.99      0.10      0.18      2897
           1       0.29      1.00      0.44      1046

    accuracy                           0.34      3943
   macro avg       0.64      0.55      0.31      3943
weighted avg       0.81      0.34      0.25      3943



# Validate