In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

In [2]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
Smarket = load_data('Smarket')
Smarket

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [4]:
Smarket.columns

Index(['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today',
       'Direction'],
      dtype='object')

In [5]:
Smarket.corr(numeric_only=True)


Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


In [6]:
allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
design = MS(allvars)
X = design.fit_transform(Smarket)
y = Smarket.Direction == 'Up'
glm = sm.GLM(y,
             X,
             family=sm.families.Binomial())
results = glm.fit()
summarize(results)

  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  results_table = pd.read_html(tab.as_html(),


Unnamed: 0,coef,std err,z,P>|z|
intercept,-0.126,0.241,-0.523,0.601
Lag1,-0.0731,0.05,-1.457,0.145
Lag2,-0.0423,0.05,-0.845,0.398
Lag3,0.0111,0.05,0.222,0.824
Lag4,0.0094,0.05,0.187,0.851
Lag5,0.0103,0.05,0.208,0.835
Volume,0.1354,0.158,0.855,0.392


In [7]:
results.params


intercept   -0.126000
Lag1        -0.073074
Lag2        -0.042301
Lag3         0.011085
Lag4         0.009359
Lag5         0.010313
Volume       0.135441
dtype: float64

In [8]:
results.pvalues

intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64

In [9]:
probs = results.predict()
probs[:10]

array([0.50708413, 0.48146788, 0.48113883, 0.51522236, 0.51078116,
       0.50695646, 0.49265087, 0.50922916, 0.51761353, 0.48883778])

In [10]:
labels = np.array(['Down']*1250)
labels[probs>0.5] = "Up" 

In [11]:
confusion_table(labels, Smarket.Direction)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,145,141
Up,457,507


In [12]:
((145+507)/(145+141+457+507), np.mean(labels==Smarket.Direction))

(0.5216, 0.5216)

In [13]:
train = (Smarket.Year < 2005)
Smarket_train = Smarket.loc[train]
Smarket_test = Smarket.loc[~train]
Smarket_test.shape

(252, 9)

In [14]:
X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]
glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)

In [15]:
D = Smarket.Direction
L_train, L_test = D.loc[train], D.loc[~train]

In [16]:
labels = np.array(['Down']*252)
labels[probs>0.5] = 'Up'
confusion_table(labels, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [17]:
np.mean(labels == L_test), np.mean(labels != L_test)


(0.4801587301587302, 0.5198412698412699)

In [18]:
# model = MS(['Lag1', 'Lag2']).fit(Smarket)
# X = model.transform(Smarket)
# X_train, X_test = X.loc[train], X.loc[~train]
# glm_train = sm.GLM(y_train,
#                    X_train,
#                    family=sm.families.Binomial())
# results = glm_train.fit()
# probs = results.predict(exog=X_test)
# labels = np.array(['Down']*252)
# labels[probs>0.5] = 'Up'
# confusion_table(labels, L_test)

model = MS(['Lag1', 'Lag2']).fit(Smarket)
X = model.transform(Smarket)
glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
labels = np.array(['Down']*252)
labels[probs>0.5] = 'Up'
confusion_table(labels, L_test)

  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:
  if is_categorical[i]:


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [19]:
(35+106)/252,106/(106+76)


(0.5595238095238095, 0.5824175824175825)

In [20]:
newdata = pd.DataFrame({'Lag1':[1.2, 1.5],
                        'Lag2':[1.1, -0.8]});
newX = model.transform(newdata)
print(newX)
# results.predict(newX)


   intercept  Lag1  Lag2
0        1.0   1.2   1.1
1        1.0   1.5  -0.8


In [21]:
lda = LDA(store_covariance=True)


In [22]:
X_train, X_test = [M.drop(columns=['intercept'])
                   for M in [X_train, X_test]]
lda.fit(X_train, L_train)

In [23]:
lda.means_

array([[ 4.27902240e-02,  3.38940937e-02, -9.80651731e-03,
        -1.05987780e-02,  4.36659878e-03,  1.37184259e+00],
       [-3.95463511e-02, -3.13254438e-02,  5.83431953e-03,
         3.11045365e-03, -6.50887574e-04,  1.36320990e+00]])

In [24]:
lda.classes_

array(['Down', 'Up'], dtype='<U4')

In [25]:
lda.priors_


array([0.49198397, 0.50801603])

In [26]:
lda.scalings_


array([[-0.58081056],
       [-0.49111007],
       [ 0.07707664],
       [ 0.06904095],
       [-0.04549853],
       [-1.24678716]])

In [27]:
lda_pred = lda.predict(X_test)

In [28]:
confusion_table(lda_pred, L_test)


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [29]:
lda_prob = lda.predict_proba(X_test)
np.all(
       np.where(lda_prob[:,1] >= 0.5, 'Up','Down') == lda_pred
       )


True

In [30]:
np.all(
       [lda.classes_[i] for i in np.argmax(lda_prob, 1)] == lda_pred
       )

True

In [31]:
np.sum(lda_prob[:,0] > 0.9)


0

In [32]:
qda = QDA(store_covariance=True)
qda.fit(X_train, L_train)

In [33]:
qda.means_, qda.priors_


(array([[ 4.27902240e-02,  3.38940937e-02, -9.80651731e-03,
         -1.05987780e-02,  4.36659878e-03,  1.37184259e+00],
        [-3.95463511e-02, -3.13254438e-02,  5.83431953e-03,
          3.11045365e-03, -6.50887574e-04,  1.36320990e+00]]),
 array([0.49198397, 0.50801603]))

In [34]:
qda.covariance_[0]


array([[ 1.50662277, -0.03924806, -0.1161981 ,  0.02773081,  0.10375151,
         0.01264993],
       [-0.03924806,  1.53559498, -0.05837697,  0.00911436,  0.00535896,
        -0.02853927],
       [-0.1161981 , -0.05837697,  1.5261648 ,  0.02759497, -0.0866616 ,
        -0.02793508],
       [ 0.02773081,  0.00911436,  0.02759497,  1.60391547, -0.08697959,
        -0.03358478],
       [ 0.10375151,  0.00535896, -0.0866616 , -0.08697959,  1.44963901,
         0.00287333],
       [ 0.01264993, -0.02853927, -0.02793508, -0.03358478,  0.00287333,
         0.0758683 ]])

In [35]:
qda_pred = qda.predict(X_test)
confusion_table(qda_pred, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,82,111
Up,29,30


In [36]:
np.mean(qda_pred == L_test)


0.4444444444444444

In [37]:
NB = GaussianNB()
NB.fit(X_train, L_train)

In [38]:
NB.classes_

array(['Down', 'Up'], dtype='<U4')

In [39]:
NB.class_prior_

array([0.49198397, 0.50801603])

In [40]:
NB.theta_

array([[ 4.27902240e-02,  3.38940937e-02, -9.80651731e-03,
        -1.05987780e-02,  4.36659878e-03,  1.37184259e+00],
       [-3.95463511e-02, -3.13254438e-02,  5.83431953e-03,
         3.11045365e-03, -6.50887574e-04,  1.36320990e+00]])

In [41]:
NB.var_

array([[1.50355429, 1.53246749, 1.52305652, 1.60064884, 1.44668659,
        0.07571378],
       [1.51401364, 1.48732877, 1.51198994, 1.43804198, 1.63638575,
        0.06747517]])

In [42]:
NB?

In [43]:
X_train[L_train == 'Down'].mean()


Lag1      0.042790
Lag2      0.033894
Lag3     -0.009807
Lag4     -0.010599
Lag5      0.004367
Volume    1.371843
dtype: float64

In [44]:
X_train[L_train == "Down"].var(ddof = 0)

Lag1      1.503554
Lag2      1.532467
Lag3      1.523057
Lag4      1.600649
Lag5      1.446687
Volume    0.075714
dtype: float64

In [45]:
nb_labels = NB.predict(X_test)
confusion_table(nb_labels, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,85,111
Up,26,30


In [46]:
NB.predict_proba(X_test)[:5]

array([[0.52512367, 0.47487633],
       [0.46691112, 0.53308888],
       [0.48813902, 0.51186098],
       [0.50349077, 0.49650923],
       [0.50231369, 0.49768631]])

In [47]:
knn1 = KNeighborsClassifier(n_neighbors=1)
X_train, X_test = [np.asarray(X) for X in [X_train, X_test]]
knn1.fit(X_train, L_train)
knn1_pred = knn1.predict(X_test)
confusion_table(knn1_pred, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,50,62
Up,61,79


In [48]:
(83+43)/252, np.mean(knn1_pred == L_test)

(0.5, 0.5119047619047619)

In [49]:
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3_pred = knn3.fit(X_train, L_train).predict(X_test)
np.mean(knn3_pred == L_test)

0.503968253968254

In [58]:
Caravan = load_data('Caravan')
Purchase = Caravan.Purchase
Purchase.value_counts()


Purchase
No     5474
Yes     348
Name: count, dtype: int64

In [51]:
348 / 5822


0.05977327378907592

In [59]:
feature_df = Caravan.drop(columns=['Purchase'])

In [60]:
scaler = StandardScaler(with_mean=True,
                        with_std=True,
                        copy=True)

In [61]:
scaler.fit(feature_df)
X_std = scaler.transform(feature_df)

In [62]:
feature_std = pd.DataFrame(
                 X_std,
                 columns=feature_df.columns);
feature_std.std()


MOSTYPE     1.000086
MAANTHUI    1.000086
MGEMOMV     1.000086
MGEMLEEF    1.000086
MOSHOOFD    1.000086
              ...   
AZEILPL     1.000086
APLEZIER    1.000086
AFIETS      1.000086
AINBOED     1.000086
ABYSTAND    1.000086
Length: 85, dtype: float64

In [63]:
(X_train,
 X_test,
 y_train,
 y_test) = train_test_split(np.asarray(feature_std),
                            Purchase,
                            test_size=1000,
                            random_state=0)

In [72]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1_pred = knn1.fit(X_train, y_train).predict(np.asarray(X_test))
np.mean(y_test != knn1_pred), np.mean(y_test != "No")

AttributeError: 'NoneType' object has no attribute 'split'

In [71]:
confusion_table(knn1_pred, y_test)


TypeError: 'KNeighborsClassifier' object is not iterable

In [None]:
5/(59+5)

In [None]:
for K in range(1,6):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn_pred = knn.fit(X_train, y_train).predict(X_test)
    C = confusion_table(knn_pred, y_test)
    templ = ('K={0:d}: # predicted to rent: {1:>2},' +
            '  # who did rent {2:d}, accuracy {3:.1%}')
    pred = C.loc['Yes'].sum()
    did_rent = C.loc['Yes','Yes']
    print(templ.format(
          K,
          pred,
          did_rent,
          did_rent / pred))
