In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
import time
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import roc_curve, auc
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image  
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree 
import itertools

# Baseline: Dummy Classifier

In [None]:
# Load the prepared book data; the first CSV column is used as the index.
df = pd.read_csv('final_dataframe.csv', index_col=0)
feats = ['series','rating','publish_month','length_of_title',
         'publish_company','genre','top_authors','top5']

# One-hot encode the non-numeric feature columns (first level of each dropped).
# dtype=float keeps the indicator columns numeric: newer pandas versions emit
# bool dummies by default, which cannot be subtracted during min-max scaling.
X = pd.get_dummies(df[feats], drop_first=True, dtype=float)

# Keep only the columns whose values sum to at least 10.
# (Series.iteritems() was removed in pandas 2.0; a boolean column mask works
# on every version and avoids mutating X in place.)
X = X.loc[:, X.sum() >= 10]

# Target column.
Y = df['NYT_bestseller']

In [None]:
# normalize the features
#
# Min-max scaling: subtract each column's minimum and divide by its range,
# forcing a scale of 0 to 1 for each feature.
# NOTE(review): the minima/ranges are computed on the full data set, i.e.
# before the train/test split performed in a later cell.

# Work on floats so boolean indicator columns can be subtracted.
X = X.astype(float)

col_min = X.min()
col_range = X.max() - col_min

# A constant column has a range of 0; dividing by 1 instead maps it to 0
# rather than filling the column with NaN from a 0/0 division.
X = (X - col_min) / col_range.replace(0, 1)

X.head()

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [None]:
# Baseline classifier using the 'most_frequent' strategy.
dum_clf = DummyClassifier(strategy='most_frequent')
dum_model = dum_clf.fit(X_train, y_train)  # fit() returns the estimator itself

# Baseline predictions for the training split and the held-out split.
y_hat_train, y_hat_test = (
    dum_model.predict(split) for split in (X_train, X_test)
)

In [None]:
def print_metrics(labels, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Parameters
    ----------
    labels : array-like
        Passed as the first argument to each scorer.
    preds : array-like
        Passed as the second argument to each scorer.

    Returns
    -------
    None
        The four scores are written to stdout, one per line.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, scorer in report:
        print(template.format(scorer(labels, preds)))

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true labels on the rows.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, each row is divided by its total so cells show the share of
        each true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colour map used for the heat map.
    """
    cm = np.asarray(cm, dtype=float)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any samples stay all-zero instead of dividing by zero.
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes)
    plt.yticks(ticks, classes)

    cell_format = '.2f' if normalize else '.0f'
    # Switch the text colour on dark cells so the numbers stay readable.
    threshold = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, format(cm[row, col], cell_format),
                 ha='center', va='center',
                 color='white' if cm[row, col] > threshold else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Confusion matrix of the dummy baseline on the held-out split. This uses the
# baseline predictions (y_hat_test); the logistic-regression predictions do not
# exist yet at this point of the notebook.
cnf_matrix = confusion_matrix(y_test, y_hat_test)

plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0,1],normalize=True,
                      title='Confusion matrix')
plt.show()

# Final: Logistic Regression

In [None]:
# Logistic regression solved with liblinear and no intercept term.
logreg = LogisticRegression(
    solver='liblinear',
    fit_intercept=False,
)

# Fit on the training split; the fitted estimator is displayed as cell output.
model_log = logreg.fit(X_train, y_train)
model_log

In [None]:
# Predicted class labels from the fitted logistic regression:
# held-out split first, then the training split.
y_hat_test_log, y_hat_train_log = (
    logreg.predict(split) for split in (X_test, X_train)
)

In [None]:
# metrics
# print_metrics takes (labels, preds): ground truth first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print_metrics(y_train, y_hat_train_log)
print('----------')
print_metrics(y_test, y_hat_test_log)

In [None]:
# Confusion matrix of the logistic regression on the held-out split:
# raw counts are printed, the row-normalised version is plotted.
cnf_matrix = confusion_matrix(y_test, y_hat_test_log)
print(cnf_matrix)

plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=[0, 1],
    normalize=True,
    title='Normalized confusion matrix',
)
plt.show()


In [None]:
# Confidence scores of the fitted model for each split.
y_test_score = model_log.decision_function(X_test)
y_train_score = model_log.decision_function(X_train)

# ROC curve points for the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, y_test_score)

# ROC curve points for the training split. The thresholds get their own name
# so they no longer overwrite the test thresholds that belong to fpr/tpr.
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_score)

In [None]:
def plot_AUC_ROC(y_score, fpr, tpr):
    """Print the area under a ROC curve and plot the curve.

    Parameters
    ----------
    y_score : array-like
        Scores the curve was computed from. Not needed for drawing; accepted
        so the existing three-argument call keeps working.
    fpr : array-like
        False positive rates, as returned by ``roc_curve``.
    tpr : array-like
        True positive rates, as returned by ``roc_curve``.
    """
    roc_auc = auc(fpr, tpr)
    print('AUC: {}'.format(roc_auc))

    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label='ROC curve (AUC = {:.3f})'.format(roc_auc))
    # Diagonal reference line: the curve of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
             label='Chance')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) curve')
    plt.legend(loc='lower right')
    plt.show()


plot_AUC_ROC(y_test_score,fpr,tpr)

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space: 10 values of C, log-spaced
# between 10**0 and 10**4.
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# 5-fold grid search. The base estimator uses fit_intercept=False like the
# other logistic regressions in this notebook, so the selected hyperparameters
# are tuned for the same model configuration they are later applied to.
clf_2 = GridSearchCV(
    LogisticRegression(fit_intercept=False, solver='liblinear'),
    hyperparameters, cv=5, verbose=0)

In [None]:
# Run the grid search on the training split. fit() returns the search object
# itself, so best_model and clf_2 refer to the same fitted GridSearchCV.
clf_2.fit(X_train, y_train)
best_model = clf_2

In [None]:
# Report the winning hyperparameter combination found by the grid search.
print('Best Penalty:', best_model.best_params_['penalty'])
print('Best C:', best_model.best_params_['C'])

In [None]:
# Grid Search
# Refit a no-intercept model with the hyperparameters the grid search selected.
# They are read from best_params_ (keys: 'C' and 'penalty') instead of being
# hard-coded, so this cell stays in sync with whatever the search found.
logreg_2 = LogisticRegression(fit_intercept = False, solver='liblinear',
                              **best_model.best_params_)

# NOTE: this rebinds model_log, which earlier cells used for the untuned model.
model_log = logreg_2.fit(X_train, y_train)
model_log

In [None]:
# Held-out predictions from the estimator the grid search refit with its best
# hyperparameters.
# NOTE(review): this evaluates the grid search's own estimator, not the
# logreg_2 / model_log model fitted in the previous cell — confirm intended.
y_hat_test_2 = best_model.best_estimator_.predict(X_test)

In [None]:
# print_metrics takes (labels, preds): ground truth first, predictions second.
# Passing them the other way round swaps the reported precision and recall.
print_metrics(y_test, y_hat_test_2)

In [None]:
model_log.coef_

In [None]:
model_log.coef_

## Test new data

In [None]:
# TODO: score new, unseen books with the fitted model.
# The placeholders are kept as comments so this cell is valid Python and the
# notebook still runs from top to bottom.
# Book1:
# Book2: