# Final Models

This notebook focuses on the baseline and final models used for this classification project.
<br>
<br>
For this project, I have focused on maximizing the model's accuracy score because I consider a false positive and a false negative to be equally "bad". In this specific data set, a false positive means that the model misclassified a book as a bestseller, and a false negative means that the model misclassified a book as NOT a bestseller.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import itertools

In [None]:
# Load the prepared modelling data; column 0 of the CSV is used as the row index.
df = pd.read_csv(filepath_or_buffer='final_dataframe.csv', index_col=0)

In [None]:
# Show up to 500 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 500

In [None]:
# select the columns to be features in the model
feats = ['series','rating','publish_month','publish_company','genre','top_authors']

# One-hot encode the categorical features, dropping the first level of each so the
# dummies are not perfectly collinear. dtype=float keeps the dummies numeric: recent
# pandas versions default to boolean dummies, which do not support the subtraction
# used by the min-max scaling further down.
X = pd.get_dummies(df[feats], drop_first=True, dtype=float)

# target labels the models are trained to predict
Y = df['NYT_bestseller']

In [None]:
# Inspect the feature names produced by the dummy encoding
X.columns

In [None]:
# For the publisher, genre, and publish-month dummy variables, keep only the "important"
# ones (i.e. those that can actually influence the models): any column whose values
# sum to less than min_feature_count is dropped. For a 0/1 dummy column the sum is the
# number of books in that category.
min_feature_count = 15

col_totals = X.sum()
# Boolean indexing replaces Series.iteritems(), which was removed in pandas 2.0.
sparse_cols = col_totals[col_totals < min_feature_count].index

# Re-assign instead of inplace=True so the cell reads as an explicit transformation.
X = X.drop(columns=sparse_cols)

In [None]:
# Min-max normalize every feature to the 0-1 range: subtract the column minimum and
# divide by the column range.
# NOTE(review): the minimum and range are computed on the full data set, before the
# train/test split, so the test rows influence the scaling.

# Cast first: boolean dummy columns do not support subtraction.
features = X.astype(float)
col_min = features.min()
col_range = features.max() - col_min

# A constant column has a range of 0; dividing by it would fill the column with NaN
# and break model fitting, so divide by 1 instead (the column becomes all zeros).
col_range = col_range.replace(0, 1)

X = (features - col_min) / col_range

X.head()

In [None]:
# split into training (80%) and testing (20%) sets
# random_state pins the shuffle so the split -- and every score computed from it --
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Baseline: Dummy Classifier

The Dummy Classifier employs a strategy of selecting the most frequent class for each observation. Given the data set has more non-bestsellers than bestsellers, the model consistently predicts non-bestsellers with an accuracy score of ~68%. 

In [None]:
# Baseline: always predict the most frequent class seen in the training labels.
dum_clf = DummyClassifier(strategy='most_frequent')
dum_clf.fit(X_train, y_train)
dum_model = dum_clf  # fit() returns the estimator itself, so both names are one model

# Baseline predictions for each split
y_hat_test = dum_model.predict(X_test)
y_hat_train = dum_clf.predict(X_train)

In [None]:
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Both arguments are forwarded unchanged to the sklearn metric functions as
    (labels, preds): pass the true labels first and the predicted labels second.
    Nothing is returned; the four scores are written to stdout.
    """
    precision = precision_score(labels, preds)
    print("Precision Score: {}".format(precision))

    recall = recall_score(labels, preds)
    print("Recall Score: {}".format(recall))

    accuracy = accuracy_score(labels, preds)
    print("Accuracy Score: {}".format(accuracy))

    f1 = f1_score(labels, preds)
    print("F1 Score: {}".format(f1))

In [None]:
# print_metrics takes (labels, preds): true labels first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print_metrics(y_test, y_hat_test)

In [None]:
# Confusion matrix for the baseline on the test set.
# NOTE: plot_confusion_matrix is defined in the "Helper functions" section at the
# bottom of this notebook; that cell must be run before this one.
cnf_matrix = confusion_matrix(y_test, y_hat_test)

# Create exactly one figure of the desired size (an additional bare plt.figure()
# call would only open an extra, empty figure).
plt.figure(figsize=(7,5))

plot_confusion_matrix(cnf_matrix, classes=[0,1],normalize=False,
                      title='Confusion matrix')
plt.show()

# Final: Logistic Regression

The model that was most accurate in predicting bestselling books is Logistic Regression. Using sklearn's default parameters, the model has a 75% accuracy score (about 7 percentage points higher than the baseline).

In [None]:
# Logistic regression with sklearn defaults, apart from naming the solver explicitly.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

# fit() returns the estimator itself, so model_log and logreg are the same fitted model.
model_log = logreg
model_log

In [None]:
# Predicted labels (y_hat) from the fitted logistic regression, one array per split
y_hat_train_log = logreg.predict(X_train)
y_hat_test_log = logreg.predict(X_test)

In [None]:
# metrics: training set first, then test set
# print_metrics takes (labels, preds): true labels first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print_metrics(y_train, y_hat_train_log)
print('----------')
print_metrics(y_test, y_hat_test_log)

In [None]:
# Confusion matrix for the logistic regression on the test set.
# NOTE: plot_confusion_matrix is defined in the "Helper functions" section at the
# bottom of this notebook; that cell must be run before this one.
cnf_matrix = confusion_matrix(y_test, y_hat_test_log)

plt.figure()
# plot_confusion_matrix prints the matrix itself, so no separate print is needed.
# normalize=False plots raw counts, so the title must not claim normalization.
plot_confusion_matrix(cnf_matrix, normalize=False,classes=[0,1],
                      title='Confusion matrix')
plt.show()


In [None]:
# Map each feature name to its fitted logistic-regression coefficient.
# Coefficients with the largest absolute values have the strongest impact on
# predicting bestsellers; the list below is sorted by signed value, so the strongest
# negative effects appear first and the strongest positive effects last.
coef_dict = dict(zip(X.columns, model_log.coef_[0, :]))
sorted(coef_dict.items(), key=lambda pair: pair[1])

In [None]:
# Decision-function scores from the fitted logistic regression, and the ROC curve
# points derived from them, for the test and training sets.
# The test scores come from model_log: no model named model_log_2 is defined anywhere
# in this notebook, so referencing it raises a NameError.
y_test_score = model_log.decision_function(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_test_score)

y_train_score = model_log.decision_function(X_train)

# Separate name so the test-set thresholds above are not overwritten.
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_score)

In [None]:
# Plot the test-set ROC curve and print its AUC.
# NOTE: plot_AUC_ROC is defined in the "Helper functions" section at the bottom of
# this notebook; that cell must be run before this one.
plot_AUC_ROC(y_score=y_test_score, fpr=fpr, tpr=tpr)

## Grid Search

Grid search evaluates every combination of the candidate parameters you provide and reports the best-scoring one. Based on its output, the default parameters for sklearn's Logistic Regression are the parameters for which the model returns the highest accuracy score.

In [None]:
# Candidate regularization strengths: 10 values log-spaced between 10**0 and 10**4
C = np.logspace(0, 4, 10)

# Candidate regularization penalties
penalty = ['l1', 'l2']

# Grid of hyperparameter options: every (C, penalty) combination is evaluated
hyperparameters = {'C': C, 'penalty': penalty}

# 5-fold cross-validated search, scored on accuracy
clf_2 = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
    verbose=0,
)

In [None]:
# Run the cross-validated search on the training data. fit() returns the search
# object itself, so best_model and clf_2 are the same fitted GridSearchCV.
clf_2.fit(X_train, y_train)
best_model = clf_2

In [None]:
# Show the estimator configured with the best-scoring parameter combination
best_model.best_estimator_

In [None]:
# Report the two tuned hyperparameters of the winning estimator
best_params = best_model.best_estimator_.get_params()
for label, key in (('Best Penalty:', 'penalty'), ('Best C:', 'C')):
    print(label, best_params[key])

In [None]:
# Keep a handle on the best estimator found by the grid search
logreg_2 = best_model.best_estimator_

In [None]:
# NOTE(review): GridSearchCV refits best_estimator_ on the full data passed to fit()
# when refit=True (the sklearn default; no refit argument is passed where clf_2 is
# built), so this second fit on the same training data looks redundant -- confirm.
logreg_2.fit(X_train, y_train)

In [None]:
# Test-set predictions from the grid-search-selected model
y_hat2 = logreg_2.predict(X_test)

In [None]:
# print_metrics takes (labels, preds): true labels first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print_metrics(y_test, y_hat2)

## Helper functions

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; rows are drawn as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels for both axes, in matrix order.
    normalize : bool, default False
        If True, each row is divided by its row sum before printing and plotting.
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Side effects
    ------------
    Prints the matrix to stdout, draws on the current figure, and saves the
    figure to a file named 'confusion' (matplotlib adds the default extension).
    """
    # Add Normalization Option
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Raw counts are integers and should read "12", not "12.00"; only fractional
    # (normalized or otherwise float) matrices get two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Switch the annotation colour at half the maximum so text stays readable on
    # both light and dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fitted area
    # and in the saved image.
    plt.tight_layout()
    plt.savefig('confusion')

In [None]:
def print_metrics(labels, preds):
    """Write precision, recall, accuracy and F1 scores to stdout.

    `labels` and `preds` are passed straight through to each sklearn metric as
    (labels, preds), i.e. true labels first and predicted labels second.
    Returns None.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, scorer in report:
        print(template.format(scorer(labels, preds)))

In [None]:
def plot_AUC_ROC(y_score,fpr,tpr):
    """Print the AUC and draw the ROC curve for precomputed curve points.

    Parameters
    ----------
    y_score :
        Accepted for the caller's convenience but not used by this function.
    fpr, tpr :
        False-positive and true-positive rates of the curve; they are passed to
        `auc` and plotted against each other.

    Side effects: sets the seaborn "darkgrid" style, prints the AUC, and shows a
    new 10x8 figure.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})
    print('AUC: {}'.format(auc(fpr, tpr)))

    line_width = 2
    # Tick marks every 0.05 from 0 to 1 on both axes
    ticks = [step / 20.0 for step in range(21)]

    plt.figure(figsize=(10,8))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label='ROC curve')
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(list(ticks))
    plt.xticks(list(ticks))

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()