In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [94]:
Smarket = pd.read_csv("/Users/arpanganguli/Documents/Finance/ISLR/Datasets/Smarket.csv", index_col = 'SlNo')

In [95]:
Smarket.head()

Unnamed: 0_level_0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
SlNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [116]:
Smarket.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 1 to 1250
Data columns (total 9 columns):
Year         1250 non-null int64
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(7), int64(1), object(1)
memory usage: 97.7+ KB


In [117]:
from sklearn.model_selection import train_test_split

In [118]:
X = np.array(Smarket[['Lag1', 'Lag2']])
y = np.array(Smarket['Direction'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2016, random_state=101)

**K-Means without standardisation (K = 1)**

In [119]:
from sklearn.neighbors import KNeighborsClassifier

In [120]:
knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

In [121]:
knn_1_pred = knn_1.predict(X_test)

In [122]:
from sklearn.metrics import classification_report, confusion_matrix

In [123]:
print(confusion_matrix(y_test, knn_1_pred))

[[54 64]
 [68 66]]


In [124]:
print(classification_report(y_test, knn_1_pred))

              precision    recall  f1-score   support

        Down       0.44      0.46      0.45       118
          Up       0.51      0.49      0.50       134

   micro avg       0.48      0.48      0.48       252
   macro avg       0.48      0.48      0.47       252
weighted avg       0.48      0.48      0.48       252



**K-Means without standardisation (K = 3)**

In [125]:
from sklearn.neighbors import KNeighborsClassifier

In [126]:
knn_3 = KNeighborsClassifier().fit(X_train, y_train)

In [127]:
knn_3_pred = knn_3.predict(X_test)

In [128]:
from sklearn.metrics import classification_report, confusion_matrix

In [129]:
print(confusion_matrix(y_test, knn_3_pred))

[[51 67]
 [58 76]]


In [130]:
print(classification_report(y_test, knn_3_pred))

              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       118
          Up       0.53      0.57      0.55       134

   micro avg       0.50      0.50      0.50       252
   macro avg       0.50      0.50      0.50       252
weighted avg       0.50      0.50      0.50       252



*As we can see, increase the number of K marginally improves the precision of the model.*

**K-Means with standardisation (K = 1)**
<br><br>
**Why standardise?** *Because KNN classifier classifies variables of different sizes, in which distances may vary on an 
absolute scale (e.g. we might be classifying a variable based on house prices (where the distances could be in '000s of 
 £ and age, where the distances could be a few years). Standardisation ensures that these distances are accounted for 
and there "standardised".*

In [131]:
from sklearn.preprocessing import StandardScaler

In [132]:
scaler_1 = StandardScaler()

In [161]:
scaler_1.fit(Smarket.drop(columns = 'Direction', axis = 1).astype(float))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [162]:
scaled_features_1 = scaler_1.transform(Smarket.drop(columns = 'Direction', axis = 1).astype(float))

In [163]:
df_1 = pd.DataFrame(scaled_features_1, columns = Smarket.columns[:-1] )

In [164]:
df_1.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
0,-1.431356,0.332058,-0.172491,-2.306806,-0.928243,4.362679,-0.796765,0.841517
1,-1.431356,0.84093,0.331988,-0.170188,-2.306592,-0.924608,-0.504715,0.905784
2,-1.431356,0.905199,0.840869,0.333218,-0.170107,-2.292416,-0.186293,-0.551237
3,-1.431356,-0.551867,0.90514,0.841016,0.333267,-0.17227,-0.561626,0.537787
4,-1.431356,0.537191,-0.55195,0.905149,0.841034,0.327254,-0.756789,0.184757


In [165]:
from sklearn.model_selection import train_test_split

In [166]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_1,Smarket['Direction'],
                                                    test_size=0.30)

In [167]:
from sklearn.neighbors import KNeighborsClassifier

In [168]:
knn_s_1 = KNeighborsClassifier(n_neighbors=1)

In [169]:
knn_s_1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [170]:
knn_s_1_pred = knn_s_1.predict(X_test)

In [171]:
from sklearn.metrics import classification_report, confusion_matrix

In [172]:
print(confusion_matrix(y_test, knn_s_1_pred))

[[147  44]
 [ 34 150]]


In [173]:
print(classification_report(y_test, knn_s_1_pred))

              precision    recall  f1-score   support

        Down       0.81      0.77      0.79       191
          Up       0.77      0.82      0.79       184

   micro avg       0.79      0.79      0.79       375
   macro avg       0.79      0.79      0.79       375
weighted avg       0.79      0.79      0.79       375



**K-Means with standardisation (K = 3)**

In [174]:
from sklearn.preprocessing import StandardScaler

In [175]:
scaler_3 = StandardScaler()

In [177]:
scaler_3.fit(Smarket.drop(columns='Direction', axis = 1).astype(float))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [178]:
scaled_features_3 = scaler_3.transform(Smarket.drop(columns='Direction', axis = 1).astype(float))

In [179]:
df_3 = pd.DataFrame(scaled_features_3, columns = Smarket.columns[:-1] )

In [180]:
df_3.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
0,-1.431356,0.332058,-0.172491,-2.306806,-0.928243,4.362679,-0.796765,0.841517
1,-1.431356,0.84093,0.331988,-0.170188,-2.306592,-0.924608,-0.504715,0.905784
2,-1.431356,0.905199,0.840869,0.333218,-0.170107,-2.292416,-0.186293,-0.551237
3,-1.431356,-0.551867,0.90514,0.841016,0.333267,-0.17227,-0.561626,0.537787
4,-1.431356,0.537191,-0.55195,0.905149,0.841034,0.327254,-0.756789,0.184757


In [181]:
from sklearn.model_selection import train_test_split

In [182]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_3,Smarket['Direction'],
                                                    test_size=0.30)

In [183]:
from sklearn.neighbors import KNeighborsClassifier

In [184]:
knn_s_3 = KNeighborsClassifier(n_neighbors=3)

In [185]:
knn_s_3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [186]:
knn_s_3_pred = knn_s_3.predict(X_test)

In [187]:
from sklearn.metrics import classification_report, confusion_matrix

In [188]:
print(confusion_matrix(y_test, knn_s_3_pred))

[[152  29]
 [ 22 172]]


In [189]:
print(classification_report(y_test, knn_s_3_pred))

              precision    recall  f1-score   support

        Down       0.87      0.84      0.86       181
          Up       0.86      0.89      0.87       194

   micro avg       0.86      0.86      0.86       375
   macro avg       0.86      0.86      0.86       375
weighted avg       0.86      0.86      0.86       375



**As we can see, there is a significant improvement in results with standardisation (precision rate of 85% in models with
standardisation as opposed to 47%-48% in models without standardisation).**