In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import datasets 

In [2]:
# Read the Smarket CSV file (CSV-reading reference: https://realpython.com/python-csv/).
# The first selected column (Year) becomes the index and is parsed as dates,
# so the frame gets a date index that later cells slice by year string.
# NOTE(review): pandas is already imported as `pd` in the first cell, so this
# import looks redundant -- kept in case other cells use the `pandas` name.
import pandas
df = pandas.read_csv(
    'Dataset/Smarket.csv',
    usecols=range(1, 10),
    index_col=0,
    parse_dates=True,
)
print(df)

             Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
Year                                                                   
2001-01-01  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
2001-01-01  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
2001-01-01  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
2001-01-01 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
2001-01-01  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
2001-01-01  0.213  0.614 -0.623  1.032  0.959  1.34910  1.392        Up
2001-01-01  1.392  0.213  0.614 -0.623  1.032  1.44500 -0.403      Down
2001-01-01 -0.403  1.392  0.213  0.614 -0.623  1.40780  0.027        Up
2001-01-01  0.027 -0.403  1.392  0.213  0.614  1.16400  1.303        Up
2001-01-01  1.303  0.027 -0.403  1.392  0.213  1.23260  0.287        Up
2001-01-01  0.287  1.303  0.027 -0.403  1.392  1.30900 -0.498      Down
2001-01-01 -0.498  0.287  1.303  0.027 -0.403  1.25800 -0.189   

In [3]:
# Split by year on the date index: everything up to and including 2004 is used
# for training, 2005 onwards is held out for testing.
train_rows = df.loc[:'2004']
test_rows = df.loc['2005':]

# Only the two most recent lags are used as predictors.
X_train = train_rows[['Lag1', 'Lag2']]
y_train = train_rows['Direction']

X_test = test_rows[['Lag1', 'Lag2']]
y_test = test_rows['Direction']

# Fit LDA on the training period, then predict the direction for the test period.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
pred = lda.predict(X_test)


In [4]:
# Estimated class priors (pi_hat_1 and pi_hat_2) of the fitted LDA model.
lda.priors_
# In other words, 49.2% of the training observations correspond to days during which the market went down.

array([0.49198397, 0.50801603])

In [5]:
# Per-class means of the two predictors (columns: Lag1, Lag2); one row per class,
# presumably in the same order as lda.classes_ -- TODO confirm.
lda.means_

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [6]:
# These values do not match the "coefficients of linear discriminants" in the book's R output.
# NOTE(review): coef_ presumably holds the weights of the fitted linear decision function,
# whereas the R values likely correspond to lda.scalings_ -- TODO confirm.
lda.coef_

array([[-0.05544078, -0.0443452 ]])

In [7]:
# Confusion matrix of the LDA predictions on the test set, transposed so that
# rows are predicted classes and columns are true classes.
lda_cm = confusion_matrix(y_test, pred)
lda_cm.T

array([[ 35,  35],
       [ 76, 106]], dtype=int64)

In [8]:
# Per-class precision / recall / F1 of the LDA test-set predictions, to 3 decimals.
lda_report = classification_report(y_test, pred, digits=3)
print(lda_report)

              precision    recall  f1-score   support

        Down      0.500     0.315     0.387       111
          Up      0.582     0.752     0.656       141

   micro avg      0.560     0.560     0.560       252
   macro avg      0.541     0.534     0.522       252
weighted avg      0.546     0.560     0.538       252



In [9]:
# Posterior class probabilities for every test day; one column per class,
# ordered as in lda.classes_.
pred_p = lda.predict_proba(X_test)

# Look the 'Up' column up by label instead of hard-coding its position.
up_col = list(lda.classes_).index('Up')

# 50% threshold: allows us to recreate the predictions.
# Notice that the column thresholded here is the posterior probability that the
# market will INCREASE ('Up'), so the True count is the number of days predicted 'Up'.
np.unique(pred_p[:, up_col] > 0.5, return_counts=True)

(array([False,  True]), array([ 70, 182], dtype=int64))

In [10]:
# 90% threshold on the posterior probability of a market DECREASE.
# The 'Down' column is looked up by label so the check matches the comment below
# (thresholding column 1 would test the probability of an increase instead).
down_col = list(lda.classes_).index('Down')
np.unique(pred_p[:, down_col] > 0.9, return_counts=True)

# No days in 2005 meet that threshold! In fact, the greatest posterior probability of decrease in all of 2005 was 52.02%.

(array([False]), array([252], dtype=int64))