In [253]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [254]:
# The first CSV column is an unnamed row counter (it loads as "Unnamed: 0").
# Read it as the index so it is not treated as a feature when the non-`user`
# columns are used as predictors.
actions = pd.read_csv("../../data/processed_balabit_data.csv", index_col=0)
actions.head()

Unnamed: 0,traveled_distance_pixel,elapsed_time,straightness,num_points,sum_of_angles,mean_curv,sd_curv,max_curv,min_curv,mean_omega,...,action_3,action_4,direction_1,direction_2,direction_3,direction_4,direction_5,direction_6,direction_7,user
0,2596.537181,9.329,0.198032,82,56.525861,-0.085272,0.527422,1.091459,-3.141593,15.669106,...,0,0,0,0,1,0,0,0,0,12
1,179.260212,1.919,0.959645,14,11.255258,-0.235679,0.916836,0.314159,-3.141593,-10.058717,...,0,1,0,1,0,0,0,0,0,12
2,887.903498,2.137,0.97954,21,-14.268377,-0.078308,0.112436,0.261799,-1.570796,-5.221251,...,0,0,0,0,0,0,0,0,1,12
3,75.035669,1.358,0.999613,7,-0.071307,-0.00065,0.00258,0.001828,-0.00508,0.05271,...,0,1,0,0,0,0,0,0,1,12
4,1078.67394,3.541,0.949591,27,-24.025555,-0.040986,0.174415,0.249828,-0.62839,4.743248,...,1,0,0,0,0,1,0,0,0,12


# Three users only

A multiclass K-Nearest Neighbors classification for three users.

Steps:

- Process the data.
- Run PCA on the scaled data.
- Run GridSearchCV on the unscaled data.
- Make predictions on the unscaled data.

## Processing

In [255]:
# NOTE(review): this user filter is disabled, so `actions` keeps every user in the CSV.
# The section heading says three users, yet the filter below names only two (7 and 29)
# -- confirm the intended subset before re-enabling it.
# actions = actions[(actions["user"] == 7) | (actions["user"] == 29)]
# actions.reset_index(drop=True, inplace=True)

In [256]:
# X = actions.drop("user", axis=1)
# y = actions["user"]

In [257]:
# scaler = StandardScaler()

In [258]:
# X_scale = scaler.fit_transform(X)

## PCA

In [259]:
# pca = PCA(n_components=3)

In [260]:
# pca.fit(X_scale)

In [261]:
# pca.explained_variance_ratio_

In [262]:
# X_reduced = pca.fit_transform(X_scale)

In [263]:
# def biplot(score, coeff, labels=None):
#     xs = score[:,0]
#     ys = score[:,1]
#     n = coeff.shape[0]
#     scalex = 1.0 / (xs.max() - xs.min())
#     scaley = 1.0 / (ys.max() - ys.min())
#     plt.figure(figsize=(16,8))
#     plt.scatter(xs * scalex, ys * scaley, c = y)
    
#     for i in range(n):
#         plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        
#         if labels is None:
#             plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
#         else:
#             plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')
    
#     plt.xlim(-1,1)
#     plt.ylim(-1,1)
#     plt.xlabel("PC{}".format(1))
#     plt.ylabel("PC{}".format(2))
#     plt.grid()

In [264]:
# biplot(X_reduced[:,0:2], np.transpose(pca.components_[0:2, :]), labels=actions.columns)
# plt.show()

In [265]:
# n_pcs = pca.components_.shape[0]

In [266]:
# For each retained component, record the position of the feature with the
# largest absolute loading.
most_important = []
for pc_index in range(n_pcs):
    abs_loadings = np.abs(pca.components_[pc_index])
    most_important.append(abs_loadings.argmax())

In [267]:
# Take the labels from the feature matrix itself rather than assuming a fixed
# count of 47, so the lookup below yields real column names and stays in step
# with X if the number of columns changes.
initial_feature_names = list(X.columns)

In [268]:
# Translate each dominant-feature position into its entry in `initial_feature_names`.
most_important_names = [initial_feature_names[position] for position in most_important]

In [269]:
# Label the components PC0, PC1, ... and pair each label with its dominant feature.
component_labels = ['PC{}'.format(rank) for rank in range(n_pcs)]
dic = dict(zip(component_labels, most_important_names))

In [270]:
# Two-column frame of (component label, dominant feature); column names stay at
# the pandas defaults 0 and 1.
component_rows = list(dic.items())
pca_df = pd.DataFrame(component_rows)

In [271]:
# Show the component-to-feature table built above.
pca_df

In [272]:
# 70/30 hold-out split of the PCA-reduced features, seeded for repeatability.
reduced_split = train_test_split(X_reduced, y, test_size=0.3, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = reduced_split

In [273]:
# 70/30 seeded hold-out split of the full feature matrix X.
raw_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = raw_split

## KNN GridSearch

In [274]:
# Search space: odd neighbour counts from 1 to 199, with both weighting schemes.
param_grid = dict(
    n_neighbors=np.arange(1, 200, 2),
    weights=["uniform", "distance"],
)

In [275]:
# 5-fold cross-validated grid search over the KNN settings in `param_grid`.
knn_estimator = KNeighborsClassifier()
knn_gscv = GridSearchCV(estimator=knn_estimator, param_grid=param_grid, cv=5)

In [276]:
# Run the grid search on the training split of X.
knn_gscv.fit(X_train, y_train)

In [277]:
# Parameter combination selected by the grid search.
knn_gscv.best_params_

In [278]:
# Cross-validated score of the selected combination.
knn_gscv.best_score_

## Prediction

In [279]:
# Predict the user for every action in the held-out test split.
pred_gscv = knn_gscv.predict(X_test)

In [280]:
# Per-class precision/recall/F1 first, then the raw confusion counts.
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test, pred_gscv))

In [281]:
# Derive the summary figures from the actual predictions instead of hard-coding
# them, so the text cannot drift from the report when the data or model changes.
# The support-weighted averages are used because this is a multiclass problem.
knn_report = classification_report(y_test, pred_gscv, output_dict=True)
weighted_avg = knn_report["weighted avg"]

print("Accuracy: {:.0%} of all the actions were classified correctly.".format(knn_report["accuracy"]))
# Precision looks at the predictions: of the actions assigned to a user, how many were right.
print("Precision (weighted): {:.0%} of the actions predicted as a given user truly belonged to that user.".format(weighted_avg["precision"]))
# Recall looks at the ground truth: of a user's real actions, how many were found.
print("Recall (weighted): {:.0%} of each user's actions were correctly attributed to that user.".format(weighted_avg["recall"]))
# F1 is the harmonic mean of precision and recall, not a weighted average of them.
print("F1-score (weighted): {:.2f}, the harmonic mean of precision and recall.".format(weighted_avg["f1-score"]))

In [282]:
# Keep the text classification report for the test predictions under its own name.
all_user_performance = classification_report(y_test, pred_gscv)

In [283]:
# print() is needed here: the report is a multi-line string and a bare expression
# would show its escaped repr instead of the formatted table.
print(all_user_performance)