# Autofit Programme

#### Can I build a programme which can take an arbitrary number of users and fit models of my choosing to each user's data?
#### Goal: be able to quickly assess which models are good at detecting one user's actions vs. all the others.

In [33]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the processed Balabit data; the path is relative to the notebook's working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column — presumably the row index written out with the CSV.
# Nothing in the visible cells drops it, so it is passed to the models as a feature; confirm that is intended.
actions = pd.read_csv("../../data/processed_balabit_data.csv")
actions.head()

Unnamed: 0,traveled_distance_pixel,elapsed_time,straightness,num_points,sum_of_angles,mean_curv,sd_curv,max_curv,min_curv,mean_omega,...,action_3,action_4,direction_1,direction_2,direction_3,direction_4,direction_5,direction_6,direction_7,user
0,2596.537181,9.329,0.198032,82,56.525861,-0.085272,0.527422,1.091459,-3.141593,15.669106,...,0,0,0,0,1,0,0,0,0,12
1,179.260212,1.919,0.959645,14,11.255258,-0.235679,0.916836,0.314159,-3.141593,-10.058717,...,0,1,0,1,0,0,0,0,0,12
2,887.903498,2.137,0.97954,21,-14.268377,-0.078308,0.112436,0.261799,-1.570796,-5.221251,...,0,0,0,0,0,0,0,0,1,12
3,75.035669,1.358,0.999613,7,-0.071307,-0.00065,0.00258,0.001828,-0.00508,0.05271,...,0,1,0,0,0,0,0,0,1,12
4,1078.67394,3.541,0.949591,27,-24.025555,-0.040986,0.174415,0.249828,-0.62839,4.743248,...,1,0,0,0,0,1,0,0,0,12


In [3]:
# Shift every user ID by 100 to ensure uniqueness of the user ID strings.
# Presumably this is because index_maker matches column names by substring, and a
# short ID such as "user1" would also match "user12"; with this data every shifted
# ID has three digits — verify if new users are added.
# NOTE: not idempotent — every re-run of this cell adds another 100.
actions["user"] = actions["user"].add(100)

In [4]:
# one name per distinct user ID ("user" + ID), in order of first appearance in `actions`;
# these are also the names of the indicator columns that user_maker creates
user_names = ["user" + str(user_id) for user_id in actions["user"].unique()]

In [5]:
# function to create user columns
def user_maker(data, column="user"):
    """Add one 0/1 indicator column per distinct value of ``data[column]``.

    For every unique value ``v`` a new column named ``column + str(v)``
    (e.g. ``"user112"``) is written into ``data``. It holds 1 on the rows
    where ``data[column] == v`` and 0 on all other rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    column : str, default "user"
        Name of the column that holds the identifiers.

    Returns
    -------
    None
        The new columns are added to ``data`` itself.
    """
    for value in data[column].unique():
        # Vectorised comparison instead of a Python-level loop over every row.
        # `.values` assigns positionally, exactly like the list it replaces.
        data[column + str(value)] = (data[column] == value).astype("int64").values

In [6]:
# adds one 0/1 indicator column per user ID to `actions`, in place
user_maker(actions)

In [7]:
# function to create user specific column indexes

col = []

def index_maker(data, column="user"):
    """Fill the module-level list ``col`` with one column Index per user.

    For each name in the global ``user_names`` the resulting Index holds every
    column of ``data`` except those that contain ``column`` in their name
    without also containing that user's name. In other words: the columns that
    do not mention ``column`` at all, plus the user's own indicator column.

    Matching is by substring, so no user name may be contained in another
    user's column name, and ``user_names`` must follow the same
    ``column + str(id)`` naming that ``user_maker`` produces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are partitioned.
    column : str, default "user"
        Substring that marks a column as user-related.

    Returns
    -------
    None
        Results are stored in ``col``, in the order of ``user_names``.
    """
    # Start from an empty list so that calling this again does not leave
    # duplicate entries behind.
    col.clear()

    # Loop-invariant: the set of user-related columns is the same for everyone.
    user_related = [name for name in data.columns if column in name]

    for user in user_names:
        unwanted = [name for name in user_related if str(user) not in name]
        col.append(data.columns.difference(unwanted))

In [8]:
# fills the module-level list `col`: one column Index per entry of user_names, in the same order
index_maker(actions)

In [9]:
# user name -> Index of the columns kept for that user (pairs user_names with `col` by position)
col_indexer = dict(zip(user_names, col))

In [10]:
# function to select all user-specific actions and a random sample of other users' actions, joining the two in one user-specific dataframe for each user
dataframes = []

def row_selector(data, random_state=42):
    """Fill the module-level list ``dataframes`` with one frame per user.

    For each name in the global ``user_names`` the frame contains all rows of
    ``data`` where that user's indicator column is 1, followed by an equally
    sized random sample (without replacement) of the rows where it is 0. The
    index of the combined frame is reset.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that already holds the 0/1 indicator columns named in
        ``user_names``.
    random_state : int or None, default 42
        Seed for the sampling of other users' rows, so that a re-run selects
        the same rows. Pass ``None`` for a fresh random sample on every call.

    Returns
    -------
    None
        Results are stored in ``dataframes``, in the order of ``user_names``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a user has more rows than all
        other users together.
    """
    # Start from an empty list so that calling this again replaces the frames
    # instead of appending a second set after the first one.
    dataframes.clear()

    for user in user_names:
        own_rows = data[data[user] == 1]
        # Both masks are built from `data`, so the function also works for a
        # frame other than the global `actions`.
        other_rows = data[data[user] == 0]
        sampled_rows = other_rows.sample(n=len(own_rows), random_state=random_state)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat([own_rows, sampled_rows]).reset_index(drop=True)
        dataframes.append(df)

In [11]:
# fills the module-level list `dataframes`: one frame per entry of user_names, in the same order
row_selector(actions)

In [12]:
# user name -> that user's row-selected frame (pairs user_names with `dataframes` by position)
user_mapper = dict(zip(user_names, dataframes))

In [13]:
# function to select columns specific to an individual user
user_dataframes = []

def col_selector(dictionary, indexer=col_indexer):
    """Restrict every user's frame to the columns listed for that user.

    Parameters
    ----------
    dictionary : dict
        Maps each name in the global ``user_names`` to that user's DataFrame.
    indexer : dict, default ``col_indexer``
        Maps each name in ``user_names`` to the columns to keep for that user.

    Returns
    -------
    dict
        Maps each user name to ``dictionary[user][indexer[user]]``. The same
        frames are also stored in the module-level list ``user_dataframes``,
        in the order of ``user_names``.
    """
    # Replace the list contents instead of appending: with leftovers from an
    # earlier call, zip() below would pair the users with those old frames.
    # `indexer` (not the global col_indexer) is used so the argument is honoured.
    user_dataframes[:] = [dictionary[user][indexer[user]] for user in user_names]

    final_data = dict(zip(user_names, user_dataframes))

    return final_data

In [14]:
# user name -> frame reduced to the columns listed for that user in col_indexer
final_data = col_selector(user_mapper)

In [15]:
# print "<user> <shape>" for every frame in `final_data`; compare the row counts with the
# expected counts printed in the next cell
for i in user_names:
    print("{} {}".format(i, final_data[i].shape))
    # to check that each user got the right column index, also print final_data[i].columns

user112 (15310, 47)
user115 (14304, 47)
user116 (21534, 47)
user120 (7618, 47)
user121 (11534, 47)
user123 (10880, 47)
user129 (14534, 47)
user135 (8450, 47)
user107 (11148, 47)
user109 (6498, 47)


In [16]:
# expected row count per user: the user's own actions plus an equally sized sample of
# other users' actions, i.e. twice the number of rows whose indicator is 1
for i in user_names:
    print(2 * int((actions[i] == 1).sum()))

15310
14304
21534
7618
11534
10880
14534
8450
11148
6498


In [27]:
# function to fit any model to any user's data
def model_fitter(user, model, dict_of_data=None, test_size=0.3, random_state=42):
    """Fit ``model`` to one user's data and print its test-set performance.

    The user's frame is split into features (every column except the one named
    after the user) and labels (that column), then into a training and a test
    part. A fresh ``model()`` is fitted on the training part, and the
    classification report and confusion matrix for the test part are printed.

    Parameters
    ----------
    user : str
        Key into ``dict_of_data``; also the name of the label column.
    model : class
        Estimator class (not an instance) that offers ``fit`` and ``predict``.
    dict_of_data : dict, optional
        Maps user names to DataFrames. When omitted, the module-level
        ``final_data`` is looked up at call time.
    test_size : float, default 0.3
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed for the split, and for the estimator when it has a
        ``random_state`` parameter.

    Returns
    -------
    None
        Results are printed only.
    """
    if dict_of_data is None:
        # Resolve the default at call time. A default bound when the function
        # is defined keeps pointing at the old dict after `final_data` has
        # been rebuilt in an earlier cell.
        dict_of_data = final_data

    user = str(user)
    user_data = dict_of_data[user]
    features = user_data.drop(user, axis=1)
    labels = user_data[user]

    training_data, test_data, training_labels, test_labels = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    clf = model()
    # Seed estimators that take `random_state` (e.g. RandomForestClassifier) so
    # that repeated runs give the same scores; other estimators are untouched.
    if hasattr(clf, "get_params") and "random_state" in clf.get_params():
        clf.set_params(random_state=random_state)
    clf.fit(training_data, training_labels)

    pred = clf.predict(test_data)
    performance = classification_report(test_labels, pred)
    performance_2 = confusion_matrix(test_labels, pred)

    print("Model used:", model, "\n")
    print(performance)
    print(performance_2)

In [34]:
# estimator classes (not instances); model_fitter calls each one to create a fresh estimator
models = [
    KNeighborsClassifier,
    RandomForestClassifier,
    LogisticRegression,
    SVC,
]

In [35]:
# fit and report every model for every user: len(user_names) * len(models) fits,
# each printing its own header, classification report and confusion matrix
for user in user_names:
    for model in models:
        print("\nUser:", user, "\n")
        model_fitter(user=user, model=model)


User: user112 

Model used: <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 

              precision    recall  f1-score   support

           0       0.62      0.58      0.60      2311
           1       0.60      0.65      0.62      2282

    accuracy                           0.61      4593
   macro avg       0.61      0.61      0.61      4593
weighted avg       0.61      0.61      0.61      4593

[[1340  971]
 [ 805 1477]]

User: user112 

Model used: <class 'sklearn.ensemble.forest.RandomForestClassifier'> 

              precision    recall  f1-score   support

           0       0.69      0.75      0.71      2311
           1       0.72      0.66      0.69      2282

    accuracy                           0.70      4593
   macro avg       0.70      0.70      0.70      4593
weighted avg       0.70      0.70      0.70      4593

[[1722  589]
 [ 786 1496]]

User: user112 

Model used: <class 'sklearn.linear_model.logistic.LogisticRegression'> 

              preci

In [None]:
# TODO: decide what the summarised output should look like
# TODO: decide which graphs to draw from the results