# Machine Learning

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('keystroke.csv')

In [3]:
df = df.drop(['sessionIndex','rep'],axis=1)

In [4]:
df['subject'] = df['subject'].apply(lambda x: x[2:])

## Splitting data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = df.drop('subject',axis=1)
y = df['subject']

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

## Preprocessing

In [9]:
from sklearn.preprocessing import QuantileTransformer

In [10]:
scaler = QuantileTransformer(output_distribution='normal')

In [11]:
X_train = scaler.fit_transform(X_train)

In [12]:
X_test = scaler.transform(X_test)

## K Nearest Neighbor

## Fitting and predicting

In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [14]:
knn = KNeighborsClassifier(n_neighbors=5,p=1,algorithm='auto',weights='distance')

In [15]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                     weights='distance')

In [16]:
knn_pred = knn.predict(X_test)

## Evaluating model

In [17]:
from sklearn.metrics import classification_report

In [18]:
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

          02       0.76      0.86      0.81       116
          03       0.87      0.83      0.85       107
          04       0.87      0.88      0.87       113
          05       0.89      0.96      0.92       121
          07       0.81      0.89      0.85        99
          08       0.91      0.85      0.88       124
          10       0.99      0.99      0.99       135
          11       0.98      0.88      0.93       116
          12       0.98      0.93      0.96       129
          13       0.93      0.90      0.92       114
          15       0.97      0.89      0.93       122
          16       0.81      0.99      0.89       112
          17       0.97      0.96      0.97       118
          18       0.99      0.79      0.88       115
          19       0.98      0.98      0.98       122
          20       0.97      0.80      0.88       124
          21       0.91      0.93      0.92       127
          22       0.98    

## Grid search

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
test_params = {'n_neighbors':[1,5,10,50,100],'weights':['uniform','distance'],
               'algorithm':['auto','ball_tree','kd_tree','brute'],'p':[1,2]}

In [21]:
grid = GridSearchCV(KNeighborsClassifier(),test_params,verbose=1)

In [22]:
grid.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  8.5min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 5, 10, 50, 100], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [23]:
grid.best_params_

{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [24]:
grid_pred = grid.predict(X_test)

In [25]:
from sklearn.metrics import classification_report

In [26]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

          02       0.76      0.86      0.81       116
          03       0.87      0.83      0.85       107
          04       0.87      0.88      0.87       113
          05       0.89      0.96      0.92       121
          07       0.81      0.89      0.85        99
          08       0.91      0.85      0.88       124
          10       0.99      0.99      0.99       135
          11       0.98      0.88      0.93       116
          12       0.98      0.93      0.96       129
          13       0.93      0.90      0.92       114
          15       0.97      0.89      0.93       122
          16       0.81      0.99      0.89       112
          17       0.97      0.96      0.97       118
          18       0.99      0.79      0.88       115
          19       0.98      0.98      0.98       122
          20       0.97      0.80      0.88       124
          21       0.91      0.93      0.92       127
          22       0.98    

## Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(KNeighborsClassifier(algorithm='auto',n_neighbors=5,p=1,weights='distance'), X, y, cv=20)
print(scores.mean())

0.8142647058823531


## Support Vector Classifier

## Fitting and predicting

In [28]:
from sklearn.svm import SVC

In [29]:
svc = SVC()

In [30]:
svc.fit(X_train,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [31]:
svc_pred = svc.predict(X_test)

## Evaluating model

In [32]:
from sklearn.metrics import classification_report

In [33]:
print(classification_report(y_test,svc_pred))

              precision    recall  f1-score   support

          02       0.76      0.87      0.81       116
          03       0.89      0.90      0.89       107
          04       0.90      0.91      0.91       113
          05       0.97      0.95      0.96       121
          07       0.85      0.90      0.87        99
          08       0.96      0.84      0.90       124
          10       0.96      0.99      0.97       135
          11       0.92      0.93      0.93       116
          12       0.97      0.89      0.93       129
          13       0.94      0.95      0.94       114
          15       0.93      0.94      0.94       122
          16       0.89      0.98      0.94       112
          17       0.99      0.97      0.98       118
          18       0.95      0.85      0.90       115
          19       1.00      0.97      0.98       122
          20       0.90      0.90      0.90       124
          21       0.89      0.91      0.90       127
          22       1.00    

## Grid search

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
test_params = {'C':[0.1,1,50,100],'gamma':[1,0.1,0.01,0.001],'kernel':['rbf','linear','poly']}

In [36]:
grid = GridSearchCV(SVC(),test_params,verbose=1)

In [37]:
grid.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 14.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 50, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'linear', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [38]:
grid.best_params_

{'C': 50, 'gamma': 0.1, 'kernel': 'rbf'}

In [39]:
grid_pred = grid.predict(X_test)

In [40]:
from sklearn.metrics import classification_report

In [41]:
print(classification_report(y_test,grid_pred))

              precision    recall  f1-score   support

          02       0.85      0.90      0.87       116
          03       0.90      0.90      0.90       107
          04       0.93      0.93      0.93       113
          05       0.97      0.95      0.96       121
          07       0.85      0.88      0.87        99
          08       0.97      0.90      0.93       124
          10       0.97      0.97      0.97       135
          11       0.93      0.91      0.92       116
          12       0.97      0.91      0.94       129
          13       0.97      0.94      0.96       114
          15       0.86      0.96      0.91       122
          16       0.93      0.98      0.96       112
          17       1.00      0.97      0.98       118
          18       0.94      0.86      0.90       115
          19       0.99      0.98      0.98       122
          20       0.80      0.95      0.87       124
          21       0.91      0.91      0.91       127
          22       1.00    

## Cross validation

In [42]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(SVC(C=50,gamma=0.1, kernel='rbf'), X, y, cv=20)
print(scores.mean())

0.8478921568627451
