# Classification

### Loading Libraries

In [None]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
from matplotlib .pyplot import subplots

# StatsModel
import statsmodels .api as sm

# Scikit-Learn
from sklearn. discriminant_analysis import \
( LinearDiscriminantAnalysis as LDA ,
QuadraticDiscriminantAnalysis as QDA)
from sklearn. naive_bayes import GaussianNB
from sklearn. preprocessing import StandardScaler
from sklearn. neighbors import KNeighborsClassifier
from sklearn. linear_model import LogisticRegression
from sklearn. model_selection import train_test_split

In [None]:
# Custom Libraries
from ISLP import load_data
from ISLP import confusion_table
from ISLP.models import contrast
from ISLP.models import ( ModelSpec as MS ,
summarize )

## Logistic Regression, LDA, QDA, & KNN

### The Stock Market Data

In [None]:
Smarket = load_data ('Smarket')

Smarket

In [None]:
# Checking column's Name
Smarket.columns

In [None]:
# Correlation matrix of every column except Direction, which is dropped
# before corr() is called.
corr = (
    Smarket
    .drop(columns="Direction")
    .corr()
)
print(corr)

In [None]:
Smarket2 = Smarket.copy()
Smarket2["Direction"] = Smarket2["Direction"].map({"Down": 0, "Up": 1})

corr = Smarket2.corr(numeric_only=True)
print(corr)

In [None]:
Smarket2.corr()

In [None]:
# Line plot of the Volume column against the row index, with a grid.
volume_ax = Smarket2.plot(y='Volume', color="tomato")
volume_ax.grid(True)
plt.show()

### Logistic Regression

In [None]:
# Predictors: every column except Today, Direction and Year.
allvars = Smarket2.columns.drop(['Today', 'Direction', 'Year'])
design = MS(allvars)

# Design matrix built from the original frame; the response is True on 'Up' days.
X = design.fit_transform(Smarket)
y = Smarket.Direction == 'Up'

# Binomial GLM on all rows, then a coefficient summary.
glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()
summarize(results)

In [None]:
results.params

In [None]:
results.pvalues

In [None]:
probs = results.predict()
probs [:10]

In [None]:
labels = np.array (['Down']*1250)

labels[probs >0.5] = "Up"

In [None]:
confusion_table(labels, Smarket.Direction )

In [None]:
(507+145) /1250, np.mean(labels == Smarket. Direction )

In [None]:
# Boolean mask: rows with Year before 2005 are used for training, the rest for testing.
train = Smarket2["Year"] < 2005

Smarket_train, Smarket_test = Smarket2.loc[train], Smarket2.loc[~train]

Smarket_test.shape

In [None]:
# Split the design matrix and the response with the same boolean mask.
X_train, X_test = X.loc[train], X.loc[~train]
y_train, y_test = y.loc[train], y.loc[~train]

# Fit on the training rows only, then predict for the held-out rows.
glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)

In [None]:
# Take the class labels from the original frame. Smarket2.Direction was
# recoded to 0/1, while the predictions and several later comparisons in this
# notebook use the 'Down'/'Up' strings.
D = Smarket.Direction

L_train, L_test = D.loc[train], D.loc[~train]

In [None]:
# labels = np.array (['Down']*252)

# labels[probs >0.5] = 'Up'

# confusion_table(labels, L_test)

In [None]:
# 1) Predicted labels: 'Up' where the predicted probability exceeds 0.5.
labels = np.array(['Down'] * len(probs))
labels[probs > 0.5] = 'Up'

# 2) True labels for the test rows, taken straight from the original frame so
#    they are already 'Down'/'Up' and no recoding of 0/1 values is needed.
true_labels = Smarket.Direction.loc[~train].astype(str)

# 3) Pair predictions with the truth on the test-row index. `labels` is a
#    plain array, so the pairing is positional; this assumes probs was
#    predicted for the ~train rows in their original order.
df_eval = pd.DataFrame({'pred': labels, 'true': true_labels})

# 4) Confusion matrix with the original label names.
cm = confusion_table(df_eval['pred'], df_eval['true'], labels=['Down', 'Up'])

# 5) Accuracy: share of positions where prediction and truth agree.
acc = (df_eval['pred'].to_numpy() == df_eval['true'].to_numpy()).mean()

print(cm)
print("Accuracy:", acc)

In [None]:
# Accuracy of the thresholded predictions on the test rows.
# `labels` holds 'Down'/'Up' strings, so the truth must be the string labels
# from the original frame; a 0/1-coded vector would never compare equal.
labels = labels.reshape(-1)  # flatten to 1D
L_test_values = Smarket.Direction.loc[~train].to_numpy().reshape(-1)

# Fail loudly on a size mismatch instead of silently truncating predictions.
assert labels.shape == L_test_values.shape, (
    "predictions and test labels differ in length"
)

print("Accuracy:", (labels == L_test_values).mean())

In [None]:
model = MS(['Lag1', 'Lag2']).fit(Smarket2)

X = model. transform(Smarket2)
X_train , X_test = X.loc[train], X.loc[~train]

glm_train = sm.GLM(y_train,
                   X_train,
                   family=sm.families.Binomial())

results = glm_train.fit()
probs = results.predict(exog=X_test)
labels = np.array (['Down']*252)
labels[probs >0.5] = 'Up'

confusion_table (pred_str, true_str)

In [None]:
(35+106) /252, 106/(106+76)

In [None]:
# Two new observations, given by their Lag1 and Lag2 values.
newdata = pd.DataFrame({
    'Lag1': [1.2, 1.5],
    'Lag2': [1.1, -0.8],
})

# Push them through the already-fitted model spec, then predict.
newX = model.transform(newdata)

results.predict(newX)

### Linear Discriminant Analysis

In [None]:
lda = LDA( store_covariance =True)

In [None]:
X_train , X_test = [M.drop(columns =['intercept'])
                    
for M in [X_train , X_test ]]
lda.fit(X_train , L_train)

In [None]:
lda.means_

In [None]:
lda. classes_

In [None]:
lda.priors_

In [None]:
lda. scalings_

In [None]:
lda_pred = lda.predict(X_test)

In [None]:
confusion_table (lda_pred, L_test)

In [None]:
lda_prob = lda. predict_proba (X_test)

np.all(
    np.where( lda_prob [: ,1] >= 0.5, 'Up','Down') == lda_pred)

In [None]:
np.all(
    [lda.classes_ [i] for i in np.argmax(lda_prob , 1)] ==
    lda_pred)

In [None]:
np.sum(lda_prob [: ,0] > 0.9)

### Quadratic Discriminant Analysis

In [None]:
qda = QDA( store_covariance =True)

qda.fit(X_train , L_train)

In [None]:
qda.means_ , qda.priors_

In [None]:
qda. covariance_ [0]

In [None]:
qda_pred = qda.predict(X_test)

confusion_table (qda_pred , L_test)

In [None]:
np.mean( qda_pred == L_test)

### Naive Bayes

In [None]:
NB = GaussianNB()

NB.fit(X_train, L_train)

In [None]:
NB.classes_

In [None]:
NB.class_prior_

In [None]:
NB.theta_

In [None]:
NB.var_

In [None]:
X_train[L_train == 'Down'].mean()

In [None]:
X_train[L_train == 'Down ']. var(ddof =0)

In [None]:
nb_labels = NB.predict(X_test)

confusion_table (nb_labels , L_test)

In [None]:
NB. predict_proba (X_test)[:5]

### K-Nearest Neighbors

In [None]:
knn1 = KNeighborsClassifier ( n_neighbors =1)
knn1.fit(X_train , L_train)
knn1_pred = knn1.predict(X_test)

confusion_table (knn1_pred , L_test)

In [None]:
(83+43) /252 , np.mean( knn1_pred == L_test)

In [None]:
knn3 = KNeighborsClassifier ( n_neighbors =3)
knn3_pred = knn3.fit(X_train , L_train).predict(X_test)
np.mean( knn3_pred == L_test)

In [None]:
Caravan = load_data ('Caravan')

Purchase = Caravan. Purchase
Purchase.value_counts ()

In [None]:
348 / 5822

In [None]:
feature_df = Caravan.drop(columns =['Purchase'])

In [None]:
scaler = StandardScaler(with_mean =True,
                        with_std =True,
                        copy=True)

In [None]:
scaler.fit(feature_df)

X_std = scaler.transform(feature_df)

In [None]:
feature_std = pd. DataFrame(X_std,
                            columns= feature_df .columns);

feature_std.std ()

In [None]:
# Hold out 1000 rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    feature_std,
    Purchase,
    test_size=1000,
    random_state=0,
)

In [None]:
knn1 = KNeighborsClassifier(n_neighbors =1)
knn1_pred = knn1.fit(X_train , y_train).predict(X_test)

np.mean(y_test != knn1_pred), np.mean(y_test != "No")

In [None]:
confusion_table(knn1_pred, y_test)

In [None]:
9/(53+9)

In [None]:
# For K = 1..5: fit KNN, tabulate predictions against the truth, and report how
# many rows were predicted 'Yes', how many of those were truly 'Yes', and the ratio.
for K in range(1, 6):
    templ = ('K={0:d}: # predicted to rent: {1: >2} ,' +
             '# who did rent {2:d}, accuracy {3:.1%}')
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_train, y_train)
    knn_pred = knn.predict(X_test)
    C = confusion_table(knn_pred, y_test)
    # Row 'Yes' holds the rows predicted 'Yes'; its 'Yes' column is the correct part.
    pred = C.loc['Yes'].sum()
    did_rent = C.loc['Yes', 'Yes']
    print(templ.format(K, pred, did_rent, did_rent / pred))

In [None]:
logit = LogisticRegression (C=1e10 , solver='liblinear')
logit.fit(X_train , y_train)
logit_pred = logit. predict_proba (X_test)
logit_labels = np.where( logit_pred [: ,1] > 5, 'Yes', 'No')

confusion_table (logit_labels , y_test)

In [None]:
logit_labels = np.where( logit_pred [: ,1] >0.25 , 'Yes', 'No')

confusion_table (logit_labels , y_test)

In [None]:
9/(20+9)

### Linear & Poisson Regression on the Bikeshare Data

In [None]:
Bike = load_data ('Bikeshare')

In [None]:
Bike.shape, Bike.columns

In [None]:
X = MS(['mnth',
        'hr',
        'workingday',
        'temp',
        'weathersit']).fit_transform (Bike)

Y = Bike['bikers']
M_lm = sm.OLS(Y, X).fit ()
summarize (M_lm)

In [None]:
hr_encode = contrast ('hr', 'sum')

mnth_encode = contrast ('mnth', 'sum')

In [None]:
# X2 = MS([ mnth_encode,
#           hr_encode,
#           'workingday',
#           'temp',
#           'weathersit ']). fit_transform (Bike)

# M2_lm = sm.OLS(Y, X2).fit ()
# S2 = summarize (M2_lm)
# S2

In [None]:
# Same predictors as before, but month and hour enter through their 'sum' contrasts.
X2 = MS([mnth_encode, hr_encode, 'workingday', 'temp', 'weathersit']).fit_transform(Bike)

M2_lm = sm.OLS(Y, X2).fit()

# Keep the summary table: later cells select coefficient rows from it.
S2 = summarize(M2_lm)
S2



In [None]:
np.sum((M_lm. fittedvalues - M2_lm. fittedvalues)**2)

In [None]:
np.allclose(M_lm.fittedvalues , M2_lm. fittedvalues)

In [None]:
coef_month = S2[S2.index.str.contains ('mnth')]['coef']

coef_month

In [None]:
months = Bike['mnth'].dtype.categories

coef_month = pd.concat ([
coef_month,
pd.Series ([- coef_month .sum ()],
           index =['mnth[Dec]'])
])

coef_month

In [None]:
# Month coefficients as a connected line plot, one tick per coefficient.
fig_month, ax_month = subplots(figsize=(8, 8))
x_month = np.arange(coef_month.shape[0])

ax_month.plot(x_month, coef_month, marker='o', ms=10)

# Tick label = character at position 5 of each row name (e.g. 'D' in 'mnth[Dec]').
ax_month.set_xticks(x_month)
ax_month.set_xticklabels([name[5] for name in coef_month.index], fontsize=20)

ax_month.set_xlabel('Month', fontsize=20)
ax_month.set_ylabel('Coefficient', fontsize=20)
ax_month.grid(True)

plt.show()

In [None]:
# Rows of the summary whose name contains 'hr', coefficient column only.
coef_hr = S2[S2.index.str.contains('hr')]['coef']

# Order the hour effects 0..22. The labels must match the summary's row names
# exactly (same 'name[level]' form as 'mnth[Dec]'); labels with stray spaces
# match nothing and reindex() would return NaN for every hour.
coef_hr = coef_hr.reindex(['hr[{0}]'.format(h) for h in range(23)])

# Append hour 23 as minus the sum of the listed coefficients.
coef_hr = pd.concat([coef_hr,
                     pd.Series([-coef_hr.sum()], index=['hr[23]'])
])

In [None]:
# Hour coefficients as a connected line plot, with a tick on every second hour.
fig_hr, ax_hr = subplots(figsize=(8, 8))
x_hr = np.arange(coef_hr.shape[0])

ax_hr.plot(x_hr, coef_hr, marker='o', ms=10)

ax_hr.set_xticks(x_hr[::2])
ax_hr.set_xticklabels(range(24)[::2], fontsize=20)

ax_hr.set_xlabel('Hour', fontsize=20)
ax_hr.set_ylabel('Coefficient', fontsize=20)
ax_hr.grid(True)

plt.show()

#### Poisson Regression

In [None]:
M_pois = sm.GLM(Y, X2 , family=sm. families .Poisson()).fit()

In [None]:
S_pois = summarize (M_pois)
coef_month = S_pois[S_pois.index.str.contains ('mnth ')]['coef']
coef_month = pd.concat ([ coef_month ,
pd.Series ([- coef_month .sum ()],
index =['mnth[Dec]'])])
coef_hr = S_pois[S_pois.index.str.contains ('hr')]['coef']
coef_hr = pd.concat ([ coef_hr ,
pd.Series ([- coef_hr.sum ()],
index =['hr [23] '])])

In [None]:
# fig_pois , (ax_month , ax_hr) = subplots (1, 2, figsize =(16 ,8))
# ax_month .plot(x_month , coef_month , marker='o', ms =10)
# ax_month . set_xticks (x_month)
# ax_month . set_xticklabels ([l[5] for l in coef_month .index], fontsize=20)
# ax_month . set_xlabel ('Month', fontsize =20)
# ax_month . set_ylabel ('Coefficient', fontsize =20)
# ax_hr.plot(x_hr , coef_hr , marker='o', ms =10)
# ax_hr. set_xticklabels (range (24) [::2] , fontsize =20)
# ax_hr. set_xlabel ('Hour', fontsize =20)
# ax_hr. set_ylabel ('Coefficient', fontsize =20);
# plt.grid(True)
# plt.show()

In [None]:
# Fitted values of the Poisson model against those of the linear model.
fig, ax = subplots(figsize=(8, 8))
ax.scatter(M2_lm.fittedvalues, M_pois.fittedvalues, s=20)

ax.set_xlabel('Linear Regression Fit', fontsize=20)
ax.set_ylabel('Poisson Regression Fit', fontsize=20)

# Dashed reference line through the origin with slope 1.
ax.axline([0, 0], slope=1, c='black', linewidth=3, linestyle='--')

plt.show()