# Intro

> Name: Jack Risse

> School: Flatiron School

> Date: 11.12.2019

> Instructors: Howard Smith & Amber Yandow

> Data: Telecom Customer Churn Prediction

# Imports

In [4]:
import datetime
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import tensorflow as tf
from tensorflow.keras import metrics

from sklearn.metrics import mean_absolute_error, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression as Logit
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
 

ModuleNotFoundError: No module named 'plotly.graph_objects'

# Loading Data

In [None]:
tlc = pd.read_csv('/content/Mod4_Project/WA_Fn-UseC_-Telco-Customer-Churn.csv')
tlc.head()

In [None]:
tlc.info()

# Functions

In [None]:
def binary(x):
  """
  A simple function to turn categorical data
  to binary data meant to be passed in map(),
  applymap(), or apply().
  """

  if x == 'Yes':
    x = 1
  else:
    x = 0

  return x

def thresh_pred(model, x, threshold=.5):
  """
  Utilizes the sklearn predict_proba() to predict 
  classes with different thresholds.

  Parameters
  ----------
  model : fitted model
    Any kind of model from sklearn

  x : list, numpy array of ints or floats
    The input data that was used to train the model

  threshold : float 
    Needs to be a float between 0,1 because dealing 
    with probablities

  Returns
  ---------
  a list of the predicted classes for each instance
  """
  predicts = []
  for x in model.predict_proba(x):
    if x[1] >= threshold:
      predicts.append(1)
    else:
      predicts.append(0)

  return predicts

def model_report(ytrue, ypred):
  """
  Simple function to print out a few metrics 
  specific to a classification model.
  """
  print('Accuracy: ', accuracy_score(ytrue, ypred))
  print('-----------------')
  print('Confusion Matrix: \n', confusion_matrix(ytrue, ypred))
  print('-----------------')
  print('Classification Report: \n', classification_report(ytrue, ypred))
  print('-----------------')
  print('MAE Score: ', mean_absolute_error(ytrue, ypred))
  print('_________________')

def plot_roc_curve(y, pred):
  """
  Function that plots the ROC curve and prints
  out the area under the curve (AUC) score.

  Parameters
  ----------
  y : numpy array
    Data used to train/validate/test your model

  pred : numpy array
    Predicted values
  """

  fpr, tpr, thresholds = roc_curve(y, pred)
 
  layout = {"title": "ROC Curve Churn Classificaiton", 
            "xaxis": {"title": "False Positive Rate"}, 
            "yaxis": {"title": "True Positive Rate"}
            }

  fig = go.Figure(layout=layout)

  fig.add_trace(go.Scatter(x=fpr,
                           y=tpr,
                           name='ROC Curve',
                           line={'dash': 'solid',
                                 'color': 'red',
                                 'width': 2}))

  fig.add_trace(go.Scatter(x=[0,1], 
                           y=[0,1],
                           name='No name',
                           line={'dash': 'dash',
                                 'color': 'black',
                                 'width': 2}))     
  fig.show()

  print('AUC Score: ', roc_auc_score(y, pred))


# EDA & Data Cleaning

In [None]:
fig = go.Figure()
fig.addtrace(go.Bar)

In [0]:
binary_fts = ['Partner',            # columns with only two categories
              'Dependents', 
              'PhoneService', 
              'OnlineSecurity', 
              'DeviceProtection', 
              'TechSupport', 
              'StreamingTV', 
              'StreamingMovies', 
              'PaperlessBilling', 
              'Churn',
              'gender',
              'OnlineBackup']

In [0]:
tlc[binary_fts] = tlc[binary_fts].applymap(binary) # changes columns with two classes to binary data

In [0]:
tlc.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,No,DSL,1,0,1,0,0,0,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,No,DSL,1,1,0,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,0,0,0,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1


In [0]:
tlc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null int64
SeniorCitizen       7043 non-null int64
Partner             7043 non-null int64
Dependents          7043 non-null int64
tenure              7043 non-null int64
PhoneService        7043 non-null int64
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null int64
OnlineBackup        7043 non-null int64
DeviceProtection    7043 non-null int64
TechSupport         7043 non-null int64
StreamingTV         7043 non-null int64
StreamingMovies     7043 non-null int64
Contract            7043 non-null object
PaperlessBilling    7043 non-null int64
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null int64
dtypes: float64(1), int64(14), object(6)
memo

In [0]:
target = tlc['Churn'] # the target variable 

> Pie chart showing imbalance of the data given at hand which needs to handled at some point in time.

In [2]:
trace = go.Pie(labels=['Non_churn', 'Churn'], values=target.value_counts(), hole=.5)
layout = go.Layout(title='Class Embalance of the Target')
fig = go.Figure(data=trace, layout=layout)
fig.show()

NameError: name 'go' is not defined

In [0]:
mon_churn = tlc['MonthlyCharges'].loc[tlc['Churn']==1] # Monthly Charges for customers who churned
mon_no_churn = tlc['MonthlyCharges'].loc[tlc['Churn']==0] # Monthly Charfes for custers who haven't churned

> A histogram showing tenure and monthly charges of customers that churned and haven't churned.

In [0]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=tlc['tenure'], name='Tenure'))
fig.add_trace(go.Histogram(x=mon_churn, name='Monthly Charges of Churn'))
fig.add_trace(go.Histogram(x=mon_no_churn, name='Monthly Charges of No Churn'))
fig.update_layout(title_text='Tenure & Monthly Charges Distribution')

> One Hot Encoding the features with more than two categories. 

In [0]:
tlc['TotalCharges'] = pd.to_numeric(tlc['TotalCharges'], errors='coerce') # TotalCharges needed to be a float
fts = tlc[[col for col in tlc.columns if tlc[col].dtype == int or tlc[col].dtype == float]]
tlc_dum = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod']
dummies = pd.get_dummies(tlc, columns=tlc_dum, drop_first=True)
dummies.drop('Churn', axis=1, inplace=True)

> TotalCharges had 11 missing values. Of those missing values the tenure was 0 meaning that the customer was new to the company so the missing values were filled with 0 since they haven't had any charges yet.

In [0]:
feats = dummies.drop(columns='customerID', axis=1) # customerID was an unecessary feature 
feats['TotalCharges']= feats['TotalCharges'].fillna(0)

In [0]:
columns = [col for col in feats.columns] # creating columns variable for future manipulation

# Models
> Problem: Predict if a customer has churned or not

> Algorithm: Logistic Regression 
* Logistic Regression is best used for binary classification

> Split of Data:
* Train: 70%
* Test: 30%

> Thresholds:
* Thersholds help with balancing how accurate and sensitivity of a model.
* Thresholds were compared at .25, .30, .35, .40, .45, adn .50 but when running the models .45 and .50 showed the best metrics for the model.


> Metrics:
* Accuracy
* Confusion Matrix
* Precision
* Recall 
* F1 Score
* Mean Absolute Error (MAE)
* ROC Curve 
* Area Under the Curve (AUC)

## Baseline Model
> The baseline model is for comparing with other models to see if the changes that were being made were improving the model. The only data engineering that was completed for this model was One Hot Encoding and Binarizing the categorical features.

### Training


In [0]:
xtrain, xtest, ytrain, ytest = train_test_split(feats, target, random_state=0) 

model = Logit(C=1.0, fit_intercept=True, solver='liblinear')
results = model.fit(xtrain, ytrain)

> Model Summary

In [0]:
thresholds = [.45, .50] # variable to calculate churn predictions at different thresholds
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytrain, pred)


Thershold @  0.45 

Accuracy:  0.8023475956077244
-----------------
Confusion Matrix: 
 [[3376  500]
 [ 544  862]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.87      0.87      3876
           1       0.63      0.61      0.62      1406

    accuracy                           0.80      5282
   macro avg       0.75      0.74      0.74      5282
weighted avg       0.80      0.80      0.80      5282

-----------------
MAE Score:  0.19765240439227566
_________________

Thershold @  0.5 

Accuracy:  0.8082165846270352
-----------------
Confusion Matrix: 
 [[3484  392]
 [ 621  785]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.90      0.87      3876
           1       0.67      0.56      0.61      1406

    accuracy                           0.81      5282
   macro avg       0.76      0.73      0.74      5282
weighted avg    

> ROC Curves

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytrain, pred)


Thershold @  0.45 



AUC Score:  0.742043901486626

Thershold @  0.5 



AUC Score:  0.7285931442278191


### Testing

In [0]:
xtrain, xtest, ytrain, ytest = train_test_split(feats, target, random_state=0)

model = Logit(C=1.0, fit_intercept=True, solver='liblinear')
results = model.fit(xtest, ytest)

> Model Summary



In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytest, pred)


Thershold @  0.45 

Accuracy:  0.7955706984667802
-----------------
Confusion Matrix: 
 [[1136  162]
 [ 198  265]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.88      0.86      1298
           1       0.62      0.57      0.60       463

    accuracy                           0.80      1761
   macro avg       0.74      0.72      0.73      1761
weighted avg       0.79      0.80      0.79      1761

-----------------
MAE Score:  0.20442930153321975
_________________

Thershold @  0.5 

Accuracy:  0.80465644520159
-----------------
Confusion Matrix: 
 [[1175  123]
 [ 221  242]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1298
           1       0.66      0.52      0.58       463

    accuracy                           0.80      1761
   macro avg       0.75      0.71      0.73      1761
weighted avg      

> ROC Curves

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytest, pred)


Thershold @  0.45 



AUC Score:  0.7237734078346151

Thershold @  0.5 



AUC Score:  0.7139585073563914


### Reflection
> When first running the model I was surprised to get such an high accuracy but then I remembered that class imbalance hadn't been taken care of.



---



---



# SMOTE
> Synthetic Minority Oversampling Technique
* Uses an over-sampling techinque but doesn't reuse existing observations but instead it generates new observations based on the previous observation
* Fixes the problem of class imbalance

In [0]:
smote = SMOTE(random_state=0) # initializing the SMOTE object
feats, target = smote.fit_sample(feats, target)
print(pd.Series(target).value_counts(normalize=True))

1    0.5
0    0.5
dtype: float64


> Pie Charts shows that SMOTE technique worked in creating a class balance.

In [0]:
trace = go.Pie(labels=['Non_churn', 'Churn'], values=pd.Series(target).value_counts(), hole=.5)
layout = go.Layout(title='Class Embalance of the Target')
fig = go.Figure(data=trace, layout=layout)
fig.show()

In [0]:
feats = pd.DataFrame(data=feats, columns=columns) # convert to a dataframe
feats.head() # the SMOTE technique doesn't leave the data in a dataframe

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,29.85,29.85,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,34.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,56.95,1889.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,53.85,108.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,45.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,42.3,1840.75,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,70.7,151.65,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Scaled Model

> Algorithm: Logistic Regression

> The numerical features were scaled between 0 and 1 because the categorical features were being represented by 0s and 1s so intuitively the numerical features needed to be scaled between 0 and 1 creating normalization through out the dataset.

In [0]:
scaler = MinMaxScaler()
feats[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(feats[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [0]:
feats.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0.0,0.0,1.0,0.0,0.013889,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.115423,0.003437,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.472222,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.385075,0.217564,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.027778,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.354229,0.012453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.625,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.521891,0.017462,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Training

In [0]:
xtrain, xtest, ytrain, ytest = train_test_split(feats, target, random_state=0)

results = model.fit(xtrain, ytrain)

> Model Summary

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytrain, pred)


Thershold @  0.45 

Accuracy:  0.7689730704806081
-----------------
Confusion Matrix: 
 [[2682 1202]
 [ 591 3286]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.69      0.75      3884
           1       0.73      0.85      0.79      3877

    accuracy                           0.77      7761
   macro avg       0.78      0.77      0.77      7761
weighted avg       0.78      0.77      0.77      7761

-----------------
MAE Score:  0.23102692951939183
_________________

Thershold @  0.5 

Accuracy:  0.7697461667310913
-----------------
Confusion Matrix: 
 [[2831 1053]
 [ 734 3143]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.73      0.76      3884
           1       0.75      0.81      0.78      3877

    accuracy                           0.77      7761
   macro avg       0.77      0.77      0.77      7761
weighted avg    

> ROC Curves

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytrain, pred)


Thershold @  0.45 



AUC Score:  0.769043890041006

Thershold @  0.5 



AUC Score:  0.7697830520747805


### Testing

In [0]:
results = model.fit(xtest, ytest)

> Model Summary

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytest, pred)


Thershold @  0.45 

Accuracy:  0.7750289911093932
-----------------
Confusion Matrix: 
 [[ 906  384]
 [ 198 1099]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.70      0.76      1290
           1       0.74      0.85      0.79      1297

    accuracy                           0.78      2587
   macro avg       0.78      0.77      0.77      2587
weighted avg       0.78      0.78      0.77      2587

-----------------
MAE Score:  0.2249710088906069
_________________

Thershold @  0.5 

Accuracy:  0.7738693467336684
-----------------
Confusion Matrix: 
 [[ 950  340]
 [ 245 1052]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.74      0.76      1290
           1       0.76      0.81      0.78      1297

    accuracy                           0.77      2587
   macro avg       0.78      0.77      0.77      2587
weighted avg     

> ROC Curves

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytest, pred)


Thershold @  0.45 



AUC Score:  0.7748327984077746

Thershold @  0.5 



AUC Score:  0.7737683264301041


### Reflection
> 



---



---



# Feature Engineering

> Interactions
* the feature tenure and total charges have a relationship with one each other so I want to see if adding an interaction feature of the two would be beneficial.

In [0]:
feats['TotalChargesXtenure'] = np.multiply(feats['TotalCharges'], feats['tenure'])

In [0]:
feats['TotalChargesXtenure']

0        0.000048
1        0.102739
2        0.000346
3        0.132469
4        0.000485
           ...   
10343    0.000498
10344    0.032796
10345    0.029060
10346    0.038059
10347    0.000291
Name: TotalChargesXtenure, Length: 10348, dtype: float64



---



---



## Feature Engineered Model
> Algorithm: Logistic Regression

### Training

In [0]:
xtrain, xtest, ytrain, ytest = train_test_split(feats, target, random_state=0)

results = model.fit(xtrain, ytrain)

> Model Summary

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytrain, pred)


Thershold @  0.45 

Accuracy:  0.7693596186058498
-----------------
Confusion Matrix: 
 [[2686 1198]
 [ 592 3285]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.69      0.75      3884
           1       0.73      0.85      0.79      3877

    accuracy                           0.77      7761
   macro avg       0.78      0.77      0.77      7761
weighted avg       0.78      0.77      0.77      7761

-----------------
MAE Score:  0.23064038139415025
_________________

Thershold @  0.5 

Accuracy:  0.7723231542327019
-----------------
Confusion Matrix: 
 [[2843 1041]
 [ 726 3151]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.73      0.76      3884
           1       0.75      0.81      0.78      3877

    accuracy                           0.77      7761
   macro avg       0.77      0.77      0.77      7761
weighted avg    

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytrain, pred)


Thershold @  0.45 



AUC Score:  0.7694298574045833

Thershold @  0.5 



AUC Score:  0.7723595768118883


### Testing

> Model Summary

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytest, pred)


Thershold @  0.45 

Accuracy:  0.7676845767298028
-----------------
Confusion Matrix: 
 [[ 890  400]
 [ 201 1096]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.69      0.75      1290
           1       0.73      0.85      0.78      1297

    accuracy                           0.77      2587
   macro avg       0.77      0.77      0.77      2587
weighted avg       0.77      0.77      0.77      2587

-----------------
MAE Score:  0.23231542327019714
_________________

Thershold @  0.5 

Accuracy:  0.7750289911093932
-----------------
Confusion Matrix: 
 [[ 947  343]
 [ 239 1058]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.73      0.76      1290
           1       0.76      0.82      0.78      1297

    accuracy                           0.78      2587
   macro avg       0.78      0.77      0.77      2587
weighted avg    

In [0]:
for x in thresholds:
  pred = thresh_pred(model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytest, pred)


Thershold @  0.45 



AUC Score:  0.7674747329854824

Thershold @  0.5 



AUC Score:  0.7749185658018205


### Reflection

> No metrics were improved so adding the interaction feature with tenure and total charges didn't seem to help.

## Hyperparameter Tuning Model
> Algorithm: Logistic Regression

> To find the best hyperparameters I used a grid search which is an exhaustive iteravtive approach to find the best model with the combination of hyperparameters.

### Training

In [0]:
C = np.arange(.000001, 2, .01) # interval of learning rates for regularization
params = {'solver': ['liblinear'],
          'penalty': ['l1','l2'], # lasso or ridge regularization
          'C':C,
          'class_weight': ['balanced'],
          }

gsm = GridSearchCV(estimator=Logit(), param_grid=params, cv=5)

best_model = gsm.fit(xtrain, ytrain)

In [0]:
for x in thresholds:
  pred = thresh_pred(best_model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytrain, pred)


Thershold @  0.45 

Accuracy:  0.7691019198556887
-----------------
Confusion Matrix: 
 [[2684 1200]
 [ 592 3285]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.69      0.75      3884
           1       0.73      0.85      0.79      3877

    accuracy                           0.77      7761
   macro avg       0.78      0.77      0.77      7761
weighted avg       0.78      0.77      0.77      7761

-----------------
MAE Score:  0.2308980801443113
_________________

Thershold @  0.5 

Accuracy:  0.7720654554825409
-----------------
Confusion Matrix: 
 [[2841 1043]
 [ 726 3151]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.73      0.76      3884
           1       0.75      0.81      0.78      3877

    accuracy                           0.77      7761
   macro avg       0.77      0.77      0.77      7761
weighted avg     

In [0]:
for x in thresholds:
  pred = thresh_pred(best_model, xtrain, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytrain, pred)


Thershold @  0.45 



AUC Score:  0.7691723908752321

Thershold @  0.5 



AUC Score:  0.7721021102825372


### Testing

In [0]:
best_model = gsm.fit(xtest, ytest)

In [0]:
for x in thresholds:
  pred = thresh_pred(best_model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  model_report(ytest, pred)


Thershold @  0.45 

Accuracy:  0.7645921917278701
-----------------
Confusion Matrix: 
 [[ 893  397]
 [ 212 1085]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.69      0.75      1290
           1       0.73      0.84      0.78      1297

    accuracy                           0.76      2587
   macro avg       0.77      0.76      0.76      2587
weighted avg       0.77      0.76      0.76      2587

-----------------
MAE Score:  0.23540780827212987
_________________

Thershold @  0.5 

Accuracy:  0.7754155392346347
-----------------
Confusion Matrix: 
 [[ 968  322]
 [ 259 1038]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.75      0.77      1290
           1       0.76      0.80      0.78      1297

    accuracy                           0.78      2587
   macro avg       0.78      0.78      0.78      2587
weighted avg    

In [0]:
for x in thresholds:
  pred = thresh_pred(best_model, xtest, threshold=x)
  print('\nThershold @ ', x, '\n')
  plot_roc_curve(ytest, pred)


Thershold @  0.45 



AUC Score:  0.76439696855594

Thershold @  0.5 



AUC Score:  0.7753480004542385


### Reflection
> Since there was no huge change in accuracy from the previous model I can safely say that the model isn't overfitting. If the model was overfitting the accuracy of the previous model would have been really high for the training data and low for the testing data and the accuracy of hyperparameters tuned model would've changed significantlly from the previous model.

# Logistic Regression using Deep Learning
> First Attempt at modeling with neural networks

> TensorBoard Configurations

In [0]:
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y.%m.%d")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

> Creating the Neural Network

In [0]:
l1 = tf.keras.regularizers.l1(.001)
lrnn = Sequential()
lrnn.add(Dense(16, input_dim=xtrain.shape[1], activation='relu', kernel_regularizer=l1))
lrnn.add(Dense(8, activation='relu'))
lrnn.add(Dense(1, activation='sigmoid'))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


> Training the Network

In [0]:
lrnn.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['binary_accuracy',])

lrnn.fit(xtrain, 
         ytrain, 
         epochs=150, 
         batch_size=10,
         callbacks=[tensorboard_callback],
         verbose=0)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tensorflow.python.keras.callbacks.History at 0x7f6433f2f7f0>

In [0]:
%tensorboard --logdir logs/fit

In [0]:
pred = lrnn.predict_classes(xtrain)
model_report(ytrain, pred)

Accuracy:  0.8406133230253833
-----------------
Confusion Matrix: 
 [[3215  669]
 [ 568 3309]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.83      0.84      3884
           1       0.83      0.85      0.84      3877

    accuracy                           0.84      7761
   macro avg       0.84      0.84      0.84      7761
weighted avg       0.84      0.84      0.84      7761

-----------------
MAE Score:  0.15938667697461667
_________________


In [0]:
plot_roc_curve(ytrain, pred)

AUC Score:  0.8406249311009738


In [57]:
pred = lrnn.predict_classes(xtest)
model_report(ytest, pred)

Accuracy:  0.8144568998840356
-----------------
Confusion Matrix: 
 [[1017  273]
 [ 207 1090]]
-----------------
Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.79      0.81      1290
           1       0.80      0.84      0.82      1297

    accuracy                           0.81      2587
   macro avg       0.82      0.81      0.81      2587
weighted avg       0.82      0.81      0.81      2587

-----------------
MAE Score:  0.18554310011596445
_________________


In [58]:
plot_roc_curve(ytest, pred)

AUC Score:  0.8143865091176419
