**Build Simple Model**

Now that I have investigated the use of demographics, this notebook builds my final 'simple' models using the fuller dataset. I will save my tuned predictions so that they can be combined with the output of the other models in the group!

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Path of the filtered dataset, relative to the notebook's working directory.
filtered_csv = "data/filtered_data.csv"
data = pd.read_csv(filtered_csv)

# Show the first rows as a quick sanity check of the load.
data.head()

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
# Separate the label column ("target") from the remaining feature columns.
y = data["target"]
X = data.drop(columns="target")

In [None]:
#Create a train_test_split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is stratified on y and the
# seed is fixed so that re-running the notebook yields the same partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so it can be chained onto the constructor.
lr = LogisticRegression().fit(X_train, y_train)

# Hard class predictions for both partitions.
pred_train = lr.predict(X_train)
pred_test = lr.predict(X_test)

# Training accuracy: 95% - high, but it is a biased dataset.
train_accuracy = lr.score(X_train, y_train)
print("Score on training data: {}".format(train_accuracy))

# Testing accuracy: also 95%.
test_accuracy = lr.score(X_test, y_test)
print("Score on testing data: {}".format(test_accuracy))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Per-row class probabilities on the training data; column 1 is the second
# class (assumed to be the positive label 1 - confirm against lr.classes_).
y_train_probs = lr.predict_proba(X_train)

# Label a row 1 only when that probability is strictly above 0.5, else 0.
y_train_pred = [int(row[1] > 0.5) for row in y_train_probs]

# Headline metrics for the positive class, then the full per-class breakdown.
print(f"Precision: {precision_score(y_train, y_train_pred):.3f}")
print(f"Recall: {recall_score(y_train, y_train_pred):.3f}")
print(f"F1 score: {f1_score(y_train, y_train_pred):.3f}")

print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Per-row class probabilities on the test data; column 1 is the second
# class (assumed to be the positive label 1 - confirm against lr.classes_).
y_test_probs = lr.predict_proba(X_test)

# Label a row 1 only when that probability is strictly above 0.5, else 0.
y_test_pred = [int(row[1] > 0.5) for row in y_test_probs]

# Headline metrics for the positive class, then the full per-class breakdown.
print(f"Precision: {precision_score(y_test, y_test_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.3f}")
print(f"F1 score: {f1_score(y_test, y_test_pred):.3f}")

print(classification_report(y_test, y_test_pred))


In [None]:
import matplotlib.pyplot as plt

#Use ROC curve to find the right threshold value
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Scores for the second class (column 1 of predict_proba) on the test set.
test_scores = y_test_probs[:, 1]

# Compute the ROC curve once; the same arrays feed the plot below.
fpr, tpr, thresholds = roc_curve(y_test, test_scores)
# Same 3-tuple the original second roc_curve call produced, kept so that any
# code indexing ROC[0] / ROC[1] keeps working.
ROC = (fpr, tpr, thresholds)
print(roc_auc_score(y_test, test_scores))

# Default colours are kept: the model curve is drawn first (blue) and the
# chance diagonal second (orange).
fig_roc, ax_roc = plt.subplots()
ax_roc.plot(fpr, tpr, label="Logistic regression")
ax_roc.plot([0, 1], [0, 1], label="Random guess")
ax_roc.set_xlabel("False positive rate")
ax_roc.set_ylabel("True positive rate")
ax_roc.set_title("ROC curve (test set)")
ax_roc.legend();

# We have a ROC AUC score of 0.895, which indicates the model is better than
# random at guessing (closer to 1 is better).
# TODO: try plotting precision versus recall for different thresholds.

We can potentially improve performance by choosing the probability cut-off (threshold) that separates positive from negative predictions, instead of using the 0.5 cut-off applied above.

In [None]:
#The 'optimal' value is essentially the point where our blue line (performance) is furthest from the orange line
import math
def performance(fpr, tpr):
    """Return how far the ROC point (fpr, tpr) lies from the chance diagonal.

    The point is compared with (m, m), where m is the larger of the two
    coordinates. One of the two offsets is therefore always zero, and the
    result equals the absolute difference between tpr and fpr.

    Parameters:
        fpr: false positive rate of the operating point.
        tpr: true positive rate of the operating point.

    Returns:
        The Euclidean distance between (fpr, tpr) and (m, m), as a float.
    """
    anchor = max(fpr, tpr)
    squared_offsets = [(coord - anchor) ** 2 for coord in (fpr, tpr)]
    return math.sqrt(sum(squared_offsets))
    
# Pick the cut-off on the TRAINING data. Choosing it from the test-set ROC
# curve and then scoring that same test set would leak test labels into the
# model and make the reported test metrics optimistic.
train_scores = lr.predict_proba(X_train)[:, 1]
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, train_scores)

# Separation from the chance diagonal at every candidate threshold.
dists = [performance(f, t) for f, t in zip(train_fpr, train_tpr)]
best = max(dists)
# list.index returns the first position that attains the maximum; look it up
# once instead of rescanning the list for every print.
best_idx = dists.index(best)

threshold = train_thresholds[best_idx]

print("Threshold: {}".format(threshold))
print("FPR: {}".format(train_fpr[best_idx]))
print("TPR: {}".format(train_tpr[best_idx]))

#The above gave a threshold that is good for the recall but suffers on precision

In [None]:
# Re-score the training data with the tuned cut-off instead of 0.5.
y_train_probs = lr.predict_proba(X_train)

# roc_curve defines each operating point as "score >= threshold", so use >=
# here as well; a strict > would mislabel the sample sitting exactly on the
# cut-off and shift the realised TPR/FPR away from the selected point.
y_train_pred = [1 if x[1] >= threshold else 0 for x in y_train_probs]

print(f"Precision: {precision_score(y_train, y_train_pred):.3f}")
print(f"Recall: {recall_score(y_train, y_train_pred):.3f}")
print(f"F1 score: {f1_score(y_train, y_train_pred):.3f}")

print(classification_report(y_train, y_train_pred))

In [None]:
# Re-score the test data with the tuned cut-off instead of 0.5.
y_test_probs = lr.predict_proba(X_test)

# roc_curve defines each operating point as "score >= threshold", so use >=
# here as well; a strict > would mislabel the sample sitting exactly on the
# cut-off and shift the realised TPR/FPR away from the selected point.
y_test_pred = [1 if x[1] >= threshold else 0 for x in y_test_probs]

print(f"Precision: {precision_score(y_test, y_test_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.3f}")
print(f"F1 score: {f1_score(y_test, y_test_pred):.3f}")

print(classification_report(y_test, y_test_pred))

In [None]:
import pickle

# Persist the train and test predictions so they can be combined with the
# output of the other models.
# pickle.dump needs the open file handle as its second argument; without it
# the call raises TypeError and leaves an empty file behind.
with open("data/lr_train_pred.pkl", "wb") as f:
    pickle.dump(y_train_pred, f)

with open("data/lr_test_pred.pkl", "wb") as f:
    pickle.dump(y_test_pred, f)