In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from mlxtend.plotting import plot_confusion_matrix
from IPython.display import display, Markdown
%matplotlib inline

In [None]:
def printm(input_str):
    """Render ``input_str`` as Markdown in the notebook output area."""
    rendered = Markdown(input_str)
    display(rendered)

# 4.6.1 The stock market data

In [None]:
# Fetch the "Smarket" dataset from the ISLR R package via statsmodels
# (cache=True asks statsmodels to cache the download) and keep only its DataFrame.
smarket = sm.datasets.get_rdataset("Smarket", "ISLR", cache=True)
df = smarket.data

In [None]:
# Preview the first rows of the Smarket frame.
df.head()

In [None]:
# Summary statistics for the Smarket frame.
df.describe()

In [None]:
# Pairwise correlations between the numeric columns, shaded with a viridis gradient.
# The frame still contains the string column "Direction"; pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr(), so select the numeric
# columns explicitly. This form also works on older pandas releases.
df.select_dtypes(include="number").corr().style.background_gradient(cmap='viridis')

In [None]:
# Plot Volume against the row index; label the axes so the figure stands on its own.
ax = df["Volume"].plot()
# The trailing semicolon suppresses the repr of the value returned by ax.set().
ax.set(xlabel="Observation (row index)", ylabel="Volume", title="Smarket: Volume");

# 4.6.2 Logistic Regression

In [None]:
# Response: 1 when Direction is "Up", 0 otherwise. Encoded explicitly as integers
# because the dtype returned by pd.get_dummies depends on the pandas version
# (uint8 historically, bool from pandas 2.0). The Series keeps the name "Up".
y = (df["Direction"] == "Up").astype(int).rename("Up")
# Predictors: every column except the response ("Direction"), "Today" and "Year".
X = df.drop(columns=["Direction", "Today", "Year"])

In [None]:
# Logistic regression of y on X using all observations.
# sm.add_constant adds the intercept column, which sm.Logit does not add itself.
X_design = sm.add_constant(X)
logit = sm.Logit(y, X_design).fit()
print(logit.summary())

Access the coefficients and p-values

In [None]:
# Fitted coefficients, including the constant term added via sm.add_constant.
logit.params

In [None]:
# p-values for the fitted coefficients.
logit.pvalues

Display the first 10 predicted probabilities from the training data

In [None]:
# Predicted probabilities for the same observations the model was fitted on.
predict_prob = logit.predict(sm.add_constant(X))
# Show the first ten probabilities.
predict_prob.iloc[:10]

Note to self: don't use `fittedvalues` for logistic regression. It returns the linear predictor (the dot product of the training exogenous variables and the coefficients, i.e. the log-odds), not the predicted probabilities; use `predict` instead.

Make a confusion matrix

In [None]:
# The response is the "Up" indicator, so class 0 = "Down" and class 1 = "Up".
class_labels = ["Down", "Up"]
# Predict "Up" (1) when the fitted probability exceeds 0.5, otherwise "Down" (0).
predict_class = (predict_prob > 0.5).astype(int)
confusion_mat = confusion_matrix(y, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

Train test split

In [None]:
# Rows with Year before 2005 form the training set; the remaining rows are the test set.
train_mask = df["Year"].lt(2005)
# Copy so the two subsets are independent of df.
train_df = df[train_mask].copy()
test_df = df[~train_mask].copy()

In [None]:
# Response: 1 when Direction is "Up", 0 otherwise. Encoded explicitly as integers
# because the dtype returned by pd.get_dummies depends on the pandas version
# (uint8 historically, bool from pandas 2.0). The Series keep the name "Up".
y_train = (train_df["Direction"] == "Up").astype(int).rename("Up")
y_test = (test_df["Direction"] == "Up").astype(int).rename("Up")
# Predictors: every column except the response ("Direction"), "Today" and "Year".
X_train = train_df.drop(columns=["Direction", "Today", "Year"])
X_test = test_df.drop(columns=["Direction", "Today", "Year"])

In [None]:
# Refit the logistic regression on the training rows only.
# sm.add_constant adds the intercept column, which sm.Logit does not add itself.
X_train_design = sm.add_constant(X_train)
logit = sm.Logit(y_train, X_train_design).fit()
print(logit.summary())

In [None]:
# Predicted probabilities for the held-out test rows (constant column added to match the fit).
predict_prob = logit.predict(sm.add_constant(X_test))

Make a confusion matrix

In [None]:
# The response is the "Up" indicator, so class 0 = "Down" and class 1 = "Up".
class_labels = ["Down", "Up"]
# Predict "Up" (1) when the predicted probability exceeds 0.5, otherwise "Down" (0).
predict_class = (predict_prob > 0.5).astype(int)
confusion_mat = confusion_matrix(y_test, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

In [None]:
# Test error rate: fraction of test rows whose predicted class differs from the true label.
(predict_class != y_test).mean()

# LDA
Have to switch over to sklearn for this

In [None]:
# Response: 1 when Direction is "Up", 0 otherwise. Encoded explicitly as integers
# because the dtype returned by pd.get_dummies depends on the pandas version
# (uint8 historically, bool from pandas 2.0). The Series keep the name "Up".
y_train = (train_df["Direction"] == "Up").astype(int).rename("Up")
y_test = (test_df["Direction"] == "Up").astype(int).rename("Up")
# Only the two most recent lags are used as predictors from here on.
X_train = train_df[["Lag1", "Lag2"]]
X_test = test_df[["Lag1", "Lag2"]]

In [None]:
# Linear discriminant analysis with scikit-learn defaults, fitted on the training data.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [None]:
# Show a Markdown caption, then the class priors stored on the fitted LDA model.
printm("Prior probabilities of the groups: ")
lda.priors_

In [None]:
# Coefficients stored on the fitted LDA model.
lda.coef_

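The coefficients aren't the same as in R; I still need to figure out why. (Possibly R's `lda` reports the coefficients of the linear discriminants, which would correspond to sklearn's `scalings_` rather than `coef_` — to be confirmed.)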

[R lda docs](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/lda.html)

In [None]:
# The response is the "Up" indicator, so class 0 = "Down" and class 1 = "Up".
class_labels = ["Down", "Up"]
# Class predictions from the fitted LDA model on the test predictors.
predict_class = lda.predict(X_test)
confusion_mat = confusion_matrix(y_test, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

It produces exactly the same confusion matrix, though, so I'm not going to worry much about the different coefficients.

# QDA

In [None]:
# Quadratic discriminant analysis with scikit-learn defaults, fitted on the training data.
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

In [None]:
# The response is the "Up" indicator, so class 0 = "Down" and class 1 = "Up".
class_labels = ["Down", "Up"]
# Class predictions from the fitted QDA model on the test predictors.
predict_class = qda.predict(X_test)
confusion_mat = confusion_matrix(y_test, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

# KNN

In [None]:
# K-nearest neighbours with K = 1, fitted on the training data.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# Evaluate on the test data; class_labels is reused from the earlier cells.
predict_class = knn.predict(X_test)
confusion_mat = confusion_matrix(y_test, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

In [None]:
# K-nearest neighbours with K = 3, fitted on the training data.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# Evaluate on the test data; class_labels is reused from the earlier cells.
predict_class = knn.predict(X_test)
confusion_mat = confusion_matrix(y_test, predict_class)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

## KNN on Caravan Insurance

In [None]:
# Fetch the "Caravan" dataset from the ISLR R package via statsmodels
# (cache=True asks statsmodels to cache the download). This rebinds `df`,
# which previously held the Smarket data.
caravan = sm.datasets.get_rdataset("Caravan", "ISLR", cache=True)
df = caravan.data

In [None]:
# Preview the first rows of the Caravan frame.
df.head()

We want to apply a standard scaler to all predictors so that each has a mean of 0 and a standard deviation of 1.

In [None]:
# Response: the "Yes" indicator column of Purchase; predictors: every other column.
y = pd.get_dummies(df["Purchase"])["Yes"]
X = df.drop(columns=["Purchase"])

# Positional split: the first 1000 rows are used for training, the rest for testing.
# NOTE(review): the ISLR lab appears to hold out the first 1000 rows as the *test*
# set and train on the remainder -- confirm this reversed split is intended.
X_train, X_test = X.iloc[:1000], X.iloc[1000:]
y_train, y_test = y.iloc[:1000], y.iloc[1000:]

# Standardise the predictors, then classify with 1-nearest-neighbour. Inside the
# pipeline the scaler is fitted on the training rows only.
scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=1)
pipe = make_pipeline(scaler, knn)
pipe.fit(X_train, y_train)

In [None]:
# Class predictions from the fitted pipeline for the test rows.
y_pred = pipe.predict(X_test)

In [None]:
# Test error rate: fraction of test rows whose prediction differs from the true label.
(y_pred != y_test).mean()

In [None]:
# Share of test rows in the positive ("Yes") class, as a baseline for the error rate above.
(y_test == 1).mean()

In [None]:
# The response is the "Yes" indicator, so class 0 = "No" and class 1 = "Yes".
class_labels = ["No", "Yes"]
confusion_mat = confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()

The model does the same thing for n_neighbors = 3 and 5

In [None]:
# Tune the number of neighbours of the pipeline's KNN step with 5-fold cross-validation.
# make_pipeline names each step after its lowercased class name, hence the
# "kneighborsclassifier__" prefix on the parameter.
param_grid = {'kneighborsclassifier__n_neighbors': [1, 3, 5]}
# The `iid` argument is no longer passed: it was deprecated and then removed from
# GridSearchCV (scikit-learn 0.24), where supplying it raises a TypeError.
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Class predictions for the test rows from the fitted grid search.
y_pred = search.predict(X_test)

In [None]:
# Confusion matrix for the grid-search predictions; class_labels is reused from the
# earlier Caravan cell.
confusion_mat = confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_labels)
# Pin the y-limits explicitly: workaround for the matplotlib issue below, which the
# original note says is needed until matplotlib 3.1.2.
# https://github.com/matplotlib/matplotlib/issues/14751
n_classes = confusion_mat.shape[0]
ax.set_ylim(n_classes - 0.5, -0.5)
plt.show()