# Chapter 4 AP

Jack Krebsbach | Statistics for Data Science | Nov 2023


## # 13
Using the ```Weekly``` data set.

In [146]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize)

In [147]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
(LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
 

##### Load Data
We load the Weekly data set from ISLP.

In [148]:
Weekly = load_data('Weekly')
Weekly.shape

(1089, 9)

(b) Use the full data set to perform a logistic regression with
Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?

None of the 5 lag variables or volume appear to be statistically significant. The variable with associated with the lowest P value is Lag1 with a p-value of 0.145.

In [149]:
design = MS([ 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume'])
X = design.fit_transform(Weekly)
y = Weekly.Direction == "Up"
glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,0.2669,0.086,3.106,0.002
Lag1,-0.0413,0.026,-1.563,0.118
Lag2,0.0584,0.027,2.175,0.03
Lag3,-0.0161,0.027,-0.602,0.547
Lag4,-0.0278,0.026,-1.05,0.294
Lag5,-0.0145,0.026,-0.549,0.583
Volume,-0.0227,0.037,-0.616,0.538


(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

From the confusion matrix, the logistic model is having a hard time predicting when the market will go down. It has an easier time prediciting if the market will go up.

In [150]:
## Get the probabilities
probs = results.predict() 
probs [:10]

array([0.60862494, 0.60103144, 0.58756995, 0.48164156, 0.61690129,
       0.56841902, 0.57860971, 0.51519724, 0.57151998, 0.55542873])

In [151]:
labels = np.array(['Down']*1089) 
labels[probs>0.5] = "Up"

In [152]:
confusion_table(labels, Weekly.Direction)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,54,48
Up,430,557


(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
)

In [153]:
train = (Weekly.Year < 2009) & (Weekly.Year > 1989) 
Weekly_train = Weekly.loc[train]
Weekly_test = Weekly.loc[~train] 
print(f'Testing Shape: {Weekly_test.shape}')
print(f'Training Shape: {Weekly_train.shape}')
Weekly_test

Testing Shape: (104, 9)
Training Shape: (985, 9)


Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
985,2009,6.760,-1.698,0.926,0.418,-2.251,3.793110,-4.448,Down
986,2009,-4.448,6.760,-1.698,0.926,0.418,5.043904,-4.518,Down
987,2009,-4.518,-4.448,6.760,-1.698,0.926,5.948758,-2.137,Down
988,2009,-2.137,-4.518,-4.448,6.760,-1.698,6.129763,-0.730,Down
989,2009,-0.730,-2.137,-4.518,-4.448,6.760,5.602004,5.173,Up
...,...,...,...,...,...,...,...,...,...
1084,2010,-0.861,0.043,-2.173,3.599,0.015,3.205160,2.969,Up
1085,2010,2.969,-0.861,0.043,-2.173,3.599,4.242568,1.281,Up
1086,2010,1.281,2.969,-0.861,0.043,-2.173,4.835082,0.283,Up
1087,2010,0.283,1.281,2.969,-0.861,0.043,4.454044,1.034,Up


In [154]:
X_train, X_test = X.loc[train], X.loc[~train] 
y_train, y_test = y.loc[train], y.loc[~train]
design = MS(['Lag2'])
X = design.fit_transform(X_train)
glm_train = sm.GLM(y_train, X_train , family=sm.families.Binomial()) 
results = glm_train.fit()

probs = results.predict(exog=X_test)

In [155]:
D = Weekly.Direction
L_train, L_test = D.loc[train], D.loc[~train]

In [156]:
labels = np.array(['Down']*104)
labels[probs>0.5] = 'Up' 
confusion_table(labels, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,31,44
Up,12,17


In [157]:
np.mean(labels == L_test), np.mean(labels != L_test)

(0.46153846153846156, 0.5384615384615384)

(e) Repeat (d) using LDA.

In [158]:
lda = LDA(store_covariance=True)
lda.fit(X_train, L_train)

In [159]:
lda.means_

array([[ 1.        ,  0.28944444, -0.03568254,  0.17080045,  0.15925624,
         0.21409297,  1.26696554],
       [ 1.        , -0.00921324,  0.26036581,  0.08404044,  0.09220956,
         0.04548897,  1.15652914]])

In [160]:
lda.classes_

array(['Down', 'Up'], dtype='<U4')

In [161]:
lda.priors_

array([0.44771574, 0.55228426])

In [162]:
lda.scalings_

array([[ 0.        ],
       [-0.27269007],
       [ 0.19316443],
       [-0.06828419],
       [-0.13646358],
       [-0.16316423],
       [-0.39859766]])

(f) Repeat (d) using QDA.

In [163]:
qda = QDA(store_covariance=True) 
qda.fit(X_train, L_train)



In [164]:
qda.means_, qda.priors_

(array([[ 1.        ,  0.28944444, -0.03568254,  0.17080045,  0.15925624,
          0.21409297,  1.26696554],
        [ 1.        , -0.00921324,  0.26036581,  0.08404044,  0.09220956,
          0.04548897,  1.15652914]]),
 array([0.44771574, 0.55228426]))

In [165]:
qda.covariance_[0]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  4.89171103, -0.79291936, -0.05611882, -0.51970121,
         0.30028217, -0.03055177],
       [ 0.        , -0.79291936,  4.83781758, -0.18804763,  0.79573313,
        -0.30630458, -0.65752451],
       [ 0.        , -0.05611882, -0.18804763,  4.96604353, -1.02101318,
         0.45582361, -0.35893653],
       [ 0.        , -0.51970121,  0.79573313, -1.02101318,  5.76020274,
        -0.40363573, -0.42847083],
       [ 0.        ,  0.30028217, -0.30630458,  0.45582361, -0.40363573,
         5.61038878, -0.50475747],
       [ 0.        , -0.03055177, -0.65752451, -0.35893653, -0.42847083,
        -0.50475747,  1.742802  ]])

In [166]:
qda_pred = qda.predict(X_test) 
confusion_table(qda_pred, L_test)

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,43,61
Up,0,0


In [167]:
np.mean(qda_pred == L_test)

0.41346153846153844

(g) Repeat (d) using KNN with K = 1.

(h) Repeat (d) using naive Bayes.

(i) Which of these methods appears to provide the best results on this data?

(j) Experiment with different combinations of predictors, includ- ing possible transformations and interactions, for each of the methods. Report the variables, method, and associated confu- sion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

## 2 

#### Load the data

In [168]:
from sklearn import datasets
iris = datasets.load_iris()

The features are stored in iris['data'] and the labels are in iris['target'].

### Afterwards, standardize the features and apply 2D PCA to the standardized data. Plot the first two principal components of the data, color coded by the true labels.


### (2) Now focus on the two classes, 'versicolor' and 'virginica', and fit a binary logistic regression model. What is the training error? Plot also the decision boundary.


### (3) For the above two iris classes, fit two more models: LDA and QDA. What are their training error rates? Plot their decision boundaries together with the binary logistic regression model. Which model do you think is the most appropriate for these two classes? 

### (4) Apply the one-versus-rest multiclass logistic regression classifier to all three classes of the iris data (using the two dimensional principal components obtained above). Display the confusion matrix and comment on it. What is the overall training error?

### (5)  Repeat (4) with the multinomial logistic regression classifier instead. How does it compare with the one-versus-rest extension?

### (6) Repeat (4) with each of the LDA and QDA classifiers. How do they compare with logistic regression in terms of training error? Which one will generalize the best to test data (when they become available)?