# Distances

In [1]:
from math import sqrt

In [2]:
def euclidean_distance(list1, list2):
    distance = 0
    min_len = min(len(list1), len(list2))

    for i in range(0, min_len):
        distance += sqrt(abs(list1[i]**2 - list2[i]**2))

    return distance

In [3]:
def manhattan_distance(list1, list2):
    distance = 0
    min_len = min(len(list1), len(list2))

    for i in range(0, min_len):
        distance += abs(list1[i] - list2[i])
    return distance

In [4]:
list_a = [1,2,3,4,5,6,7,8,9,10]
list_b = [10,9,8,7,6,5,4,3,2,1]

In [5]:
print(euclidean_distance(list_a, list_b))
print(euclidean_distance(list_a, list_a))

70.40444936489483
0.0


In [6]:
print(manhattan_distance(list_a, list_b))
print(manhattan_distance(list_a, list_a))

50
0


# Database

In [7]:
import os
import sqlite3

In [8]:
class DatabaseManager:
    '''Manages database connections, executes scripts'''
    def __init__(self, path, create_db_script):
        '''Initializes sqlite3 database.

        Keyword arguments:
        path -- the path where database file is saved
        '''
        self.path = path

        if not os.path.exists(path):
            self.conn = sqlite3.connect(path)
            print('Creating database...')
            self.execute(create_db_script)
        else:
            self.conn = sqlite3.connect(path)

    def run_sql_from_file(self, path):
        '''Executes specified by path script'''
        with open(path) as f:
            script = f.read()
        self.conn.execute(script)

    def execute(self, command):
        '''Executes specified sql command'''
        return self.conn.execute(command)

    def open(self):
        '''Opens database connection'''
        self.conn = sqlite3.connect(self.path)

    def close(self):
        '''Closes database connection'''
        self.conn.close()

In [9]:
PATH = os.path.abspath('./database')
create_db_script = '''create table user_typing_data (
                          time timestamp not null default current_timestamp,
                          user_id integer not null,
                          input0 text not null,
                          ip text,
                          browser text,
                          primary key  (time, user_id)
                        );'''
db_path = os.path.join(PATH, 'database.db')
database = DatabaseManager(db_path, create_db_script)

In [10]:
insert_data_script = os.path.join('./data', 'insert_data.sql')
try:
    database.run_sql_from_file(insert_data_script)
except sqlite3.IntegrityError:
    pass

# User

In [11]:
from datetime import datetime
import random

In [12]:
class UserData:
    def __init__(self, tuple_object):
        self.time = datetime.strptime(tuple_object[0], "%Y-%m-%d %H:%M:%S")
        self.user_id = int(tuple_object[1])
        self.input0_time = self.parse_input(tuple_object[2])
        self.ip = tuple_object[3]
        self.browser = tuple_object[4]

    def parse_input(self, input0):
        splitted = input0.split(' ')
        inputs = list()
        last_time = 0

        for i in range(1, len(splitted)):
            key = splitted[i]
            if 'd_16' in key:
                continue
            if 'u_16' in key:
                continue
            if key == '':
                break
            data = key.split('_')
            diff = int(data[2]) - last_time
            last_time = int(data[2])
            inputs.append(diff)

        return inputs

    def __str__(self):
        return str(self.user_id)

In [13]:
num_ids = 10
entries_raw = database.execute(
    f'SELECT * from user_typing_data where user_id < {num_ids}'
)
for desc in entries_raw.description:
    print(desc[0], end=', ')
print()
print(entries_raw.fetchone())

time, user_id, input0, ip, browser, 
('2013-11-20 07:25:12', 7, 'd_190_0_0 u_190_64 d_84_1008_1 u_84_1063 d_73_1776_2 u_73_1832 d_69_1872_3 u_69_1952 d_53_3128_4 u_53_3176 d_16_5256_5 d_82_5424_5 u_82_5464 u_16_5672 d_79_7303_6 u_79_7359 d_65_10167_7 u_65_10223 d_78_11420_8 u_78_11439 d_76_11679_9 u_76_11719 d_13_12156_10 u_13_12183 ', '81.219.51.76', 'Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0')


In [14]:
entries_list = list()

for item_raw in entries_raw:
    user = UserData(item_raw)
    entries_list.append(user)
print(len(entries_list))
print(entries_list[0].input0_time)
print(entries_list[0].user_id)
print(entries_list[0].ip)
print(entries_list[0].browser)

546
[47, 1176, 58, 598, 48, 32, 80, 1136, 56, 1232, 48, 1319, 45, 1139, 45, 156, 52, 232, 48, 568, 24]
7
81.219.51.76
Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0


In [15]:
random.shuffle(entries_list)
percent = 0.73
train_len = int(len(entries_list) * percent)
test_len = len(entries_list) - train_len
train_set = entries_list[test_len:]
test_set = entries_list[:-train_len]

In [16]:
print(len(train_set), len(test_set), len(test_set) / len(entries_list))

398 148 0.27106227106227104


# Classifier

In [17]:
from operator import itemgetter
from datetime import timedelta


In [18]:
class kNNClassifier:
    def __init__(self, distance_function):
        self.distance_function = distance_function

    def get_k_neighbours(self, test_instance, training_set, k):
        distances = list()
        for x in range(len(training_set)):
            train_instance = training_set[x]
            dist = self.distance_function(
                test_instance.input0_time,
                train_instance.input0_time
            )
            if test_instance.ip != train_instance.ip:
                dist += 1000
            if test_instance.browser != train_instance.browser:
                dist += 600
            if test_instance.time - train_instance.time > timedelta(hours=1):
                dist += 200

            distances.append((training_set[x], dist))
        distances.sort(key=itemgetter(1))
        neighbors = list()
        for x in range(k):
            neighbors.append(distances[x][0])
        return neighbors

    def frequent_sort(self, neighbours):
        id_to_num = dict()

        for neighbour in neighbours:
            class_id = neighbour.user_id
            if class_id in id_to_num:
                id_to_num[class_id] += 1
            else:
                id_to_num[class_id] = 0

        sorted_id_to_num = sorted(
            id_to_num.items(),
            key=itemgetter(1),
            reverse=True

        )
        return sorted_id_to_num

    def classify(self, test_instance, training_set, k):
        neighbours = self.get_k_neighbours(
            test_instance,
            training_set,
            k
        )

        ret_class_id = self.frequent_sort(neighbours)[0][0]

        return ret_class_id

In [19]:
distance_funcs = {
    "Manhattan Distance": manhattan_distance,
    "Euclidean Distance": euclidean_distance
}

In [20]:
for k_num in range(1, 2):
    print(f"k = {k_num}")
    for distance_func_name in distance_funcs:
        successes = 0
        knn_classifier = kNNClassifier(distance_funcs[distance_func_name])

        for test_inst in test_set:
            result_id = knn_classifier.classify(
                test_inst,
                train_set,
                k_num
            )
            if test_inst.user_id == result_id:
                successes += 1

        accuracy = successes / len(test_set)
        print(f'Accuracy: {accuracy} using {distance_func_name}')


k = 1
Accuracy: 0.9797297297297297 using Manhattan Distance
Accuracy: 0.972972972972973 using Euclidean Distance


In [21]:
database.close()