# CSC490H5 Model.

---

This is the "backend" of our Random Forest Classifier Model. 
Here the watch and keyboard data is processed, and our model is then trained on that data.

In [0]:
%tensorflow_version 1.14

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import re
import glob

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Connect to google drive

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [133]:
path_to_keys = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/v2/keys/"
path_to_watch = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/v2/watch/"

watch_data = path_to_watch + '*.csv'
keyboard_log = path_to_keys + '*.log'

watch_acceleration = {}
keyboard_logs = {}

for file in glob.glob(watch_data):
  filename = file.split("/")[-1].strip(".csv").strip("watch_")
  wa = [line.strip().split(",") for line in open(file) if line.strip().split(",") != '']
  watch_acc = wa

for file in glob.glob(keyboard_log):
  filename = int(file.split("/")[-1].strip(".log").strip("keys_"))-1
  kd = [line.strip() for line in open(file)]
  keyboard_logs[filename] = kd

print(len(watch_acc))

127914


Seperate the keyboard log into "key pressed" and "key up" events.

In [134]:
locations = {"left": 0, "center": 1, "right": 2}
special_keys = {"shift": 1, "control": 2, "alt": 3, "None": 0}

# Big Boi List
keys_mapped = {'1': 1, '2': 1, '3': 1, '!': 1, '@': 1, '#': 1, '4': 2, '5': 2, 
               '6': 2, '$': 2, '%': 2, '^': 2, '7': 3, '8': 3, '&': 3, '*': 3, 
               '9': 4, '0': 4, '(': 4, ')': 4, '-': 5, '_': 5, '=': 5, '+': 5, 
               'backspace': 5, 'q': 6, 'tab': 6, 'w': 7, 'e': 7, 'r': 7, 't': 8, 
               'y': 8, 'u': 9, 'i': 9, 'o': 9, 'p': 9, '[': 10, ']': 10, 
               '\\': 10, '{': 10, '}': 10, '|': 10, 'a': 11, 's': 11, 'd': 11, 
               'f': 12, 'g': 12, 'h': 12, 'j': 13, 'k': 13, 'l': 13, ';': 14, 
               "'": 14, '"': 14, ':': 14, 'enter': 14, 'z': 15, 'x': 15, 'c': 15, 
               'space': 16, 'b': 16, 'n': 16, 'm': 16, ',': 17, '.': 17, '/': 17, 
               '<': 17, '>': 17, '?': 17, 'arrowleft': 18, 'arrowright': 18, 
               'arrowup': 18, 'arrowdown': 18}

all_keys_pressed = {}
all_keys_released = {}
last_event = None

p = re.compile(r"index.html:[0-9]* (?P<timestamp>[0-9]*), (?P<event>[a-z]*), (?P<key>([a-zA-Z0-9]*|[^a-zA-Z0-9_])), (?P<location>(left|center|right))")

for index in keyboard_logs.keys():
  key_log = keyboard_logs[index]
  keys_pressed = []
  keys_released = []
  for line in key_log:
    s = p.search(line)

    timestamp = int(s.group("timestamp"))
    event     = s.group("event")
    key       = s.group("key").lower()
    location  = locations[s.group("location")]

    # If the special keys were pressed with the others, then just add as a tag
    if key in ["shift", "control", "alt", "meta"] and last_event == "keypress":
      keys_pressed[-1][-1] = special_keys[key]
      continue 

    last_event = event

    if event == "keypress":
      # Timestamp, key side, key      
      keys_pressed.append([timestamp, keys_mapped.get(key, 19), location, 0]) # Last is for special keys
    elif event == "keyup" and key not in ["shift", "meta", "control", "alt", "backspace", "arrowleft", "arrowright"]:
      # For now ignore these, figuring out what to do with them is a later problem
      keys_released.append([timestamp, keys_mapped.get(key, 19), location, 0])
  all_keys_pressed[index] = keys_pressed
  all_keys_released[index] = keys_released

# We would like to start measuring from when our first key is let go 
# Because that is how we measure sequences

keys_pressed = []
keys_released = []

for index in all_keys_pressed.keys():
  kp = all_keys_pressed[index]
  keys_pressed.extend(kp[1:])


for index in all_keys_released.keys():
  kr = all_keys_released[index]
  keys_released.extend(kr[0:-1])

print(len(keys_pressed))
print(len(keys_released))

5680
5680


Go through all our data and split up the watch data by the timestamp from the keyboard log. 

Each sequence begins when the first key is lifted, and ends when the next key is pressed. 

Map each sequence to a key.

In [135]:
sequences = []
predictions = []

copy_acc = watch_acc

for i in range(len(keys_pressed)):

  start = int(keys_released[i][0])
  end = int(keys_pressed[i][0])

  key = keys_pressed[i][1]
  loc = keys_pressed[i][2] #Ignore for now 

  sequence = []

  while len(copy_acc) != 0:
    # We want to remove the line so we dont have to iterate trough everything again
    line = copy_acc.pop(0)
    if line == ['']:
      continue 

    time, acc_x, acc_y, acc_z = line[0], line[1], line[2], line[3]

    current_time = int(time)

    if current_time < start:
      continue 
    if current_time >= end:
      break 

    sequence.append([float(acc_x), float(acc_y), float(acc_z)])
  predictions.append(key)
  sequences.append(sequence)

# We want to know how much data we have
# Should match up with how many keys_pressed we have
print(len(sequences))
print(len(predictions)) 
print(sequences[:5])

5680
5680
[[[-0.10409682989120483, -0.3792048990726471, -0.142575204372406], [0.06669677793979645, -0.3815690279006958, 0.061362385749816895], [0.15640202164649963, -0.3372584879398346, 0.1829262375831604], [0.12054543197154999, -0.24565374851226807, 0.1976635456085205], [0.1397428810596466, -0.1329779028892517, 0.2774924039840698], [0.08984654396772385, 0.034388840198516846, 0.2922077178955078], [0.05156753212213516, 0.18801024556159973, 0.2595299482345581], [-0.007214866578578949, 0.157533198595047, 0.20696258544921875], [-0.06059535592794418, 0.21180042624473572, 0.08828258514404297], [-0.10858726501464844, 0.262290894985199, -0.028947114944458008], [-0.053401052951812744, 0.23143446445465088, -0.022578299045562744], [0.020038381218910217, 0.18292322754859924, 0.04327189922332764], [0.024208277463912964, 0.06968152523040771, 0.03918260335922241], [-0.012914493680000305, 0.01488754153251648, -0.01999586820602417], [0.03397625684738159, 0.030261099338531494, 0.008123457431793213], [0.

Now we pad the shorter sequences with [0, 0, 0] to match the length of the longest sequence.

In [136]:
max_seq_3 = max(sequences,key=len)
print(max_seq_3)
max_len = len(max_seq_3)
print(max_len)

padded_sequences = []
for sequence in sequences:
  while len(sequence) < max_len:
     sequence.append([0, 0, 0])
  np.stack(sequence)
  padded_sequences.append(sequence)
np_sequences = np.stack(padded_sequences)

print(np_sequences.shape)

[[-0.0051771849393844604, -0.013598904013633728, -0.014017820358276367], [-0.006830289959907532, 0.011901572346687317, -0.0205308198928833], [-0.019906193017959595, 0.00543588399887085, -0.013185441493988037], [0.000808030366897583, -0.0221090167760849, -0.008004605770111084], [-0.015628039836883545, 0.007476001977920532, -0.008981108665466309], [-0.013946115970611572, 0.0027497410774230957, 0.0018992424011230469], [0.024932637810707092, 0.0785786509513855, 0.018825650215148926], [0.03461426496505737, 0.07330025732517242, 0.017297744750976562], [0.0284845232963562, 0.02799740433692932, 0.005921423435211182], [0.01885378360748291, 0.03290256857872009, 0.013258099555969238], [-0.0024113506078720093, 0.06434117257595062, 0.0100325345993042], [-0.014567315578460693, 0.025759294629096985, 0.0014353394508361816], [-0.0005162805318832397, -0.03930597007274628, -0.007545888423919678], [-0.006729334592819214, -0.041155338287353516, -0.002635478973388672], [-0.01593148708343506, -0.0009172111749

Now we make our classifier

It should take in a vector thats **N * M * 3**

Where N = Number of squences and M = Sequence length

In [137]:
# Our shape is (5680, 270, 3)

# So sklearn expects 2d arrays... gotta reshape
N, nx, ny = np_sequences.shape
new_sequences = np_sequences.reshape((N,nx*ny))

# Split into train, validation, and test sets
# I will do about 70 train and 30 test
train_data, test_data = new_sequences[:3976,:], new_sequences[3976:,:]
print (train_data.shape)
print (test_data.shape)
train_ts, test_ts = predictions[:3976], predictions[3976:]

# Create Classifier
rfc=RandomForestClassifier(n_estimators=150, max_features=0.15, min_samples_leaf=60, oob_score=True)

# Train the model using the training set
rfc.fit(train_data,train_ts)

# Test our accuracy
test_ys=rfc.predict(test_data)


# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_ts, test_ys))
# Key Prediction =~ 23% accuracy

# Store Model
import joblib

filename = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/weights.joblib"
weights = joblib.dump(rfc,filename)

(3976, 810)
(1704, 810)
Accuracy: 0.24354460093896713


This is to just to show that our model is bad at predicting... because of all the space predictions.

In [0]:
for y, t in zip(test_ys, test_ts):
  if y!= t:
    print(y, t)

Ok let's try a different model - rnn sequential, 

Just to compare accuracy

First we will have to convert the predictions into their numerical form

In [138]:
uniq_pred = np.unique(predictions)
print(uniq_pred)
num_uniq = uniq_pred.shape[0]

pred_numbered = []
for key in predictions:
  pred_numbered.append(list(uniq_pred).index(key))

uniq_pred2 = np.unique(pred_numbered)
num_uniq2 = uniq_pred2.shape[0]

print(uniq_pred, num_uniq)
print(uniq_pred2, num_uniq2)

def make_onehot(indicies, total=250):
    """
    Convert indicies into one-hot vectors by
        1. Creating an identity matrix of shape [total, total]
        2. Indexing the appropriate columns of that identity matrix
    """
    I = np.eye(total)
    return I[indicies]

hot_predictions = make_onehot(pred_numbered, num_uniq2)
print (hot_predictions[:5])

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19] 18
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] 18
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]


This function broke at some point, and I cant be bothered to fix it :(

In [0]:
from keras.models import Sequential
from keras.layers import Dense

# Split into train, validation, and test sets
# I will do about 60 train, 20 valid and 20 test

N, nx, ny = np_sequences.shape
print(N, nx, ny)
new_sequences = np_sequences.reshape((N,nx*ny))

train_data, valid_data, test_data = new_sequences[:3408‬,:],  new_sequences[3408‬:4544,:], new_sequences[4544:,:]
print (train_data.shape)
print (valid_data.shape)
print (test_data.shape)
train_ts, valid_ts, test_ts = hot_predictions[:3408‬],  hot_predictions[3408‬:4544], hot_predictions[4544:]

dim = nx*ny

model = Sequential()
model.add(Dense(N, activation='relu', input_dim=dim))
#model.add(Dense(N, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_data, train_ts,
          batch_size=64, epochs=10,
          validation_data=(valid_data, valid_ts))


score = model.evaluate(test_data, test_ts, batch_size=64)
print(score)

Finally this is what the prediction function should look like (in flask).

In [126]:
def predict(batch_of_10, ts):
  path = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/weights.joblib"
  model = joblib.load(path)
  acc = model.score(batch_of_10, ts)
  if acc >= 0.2:
    return True
  return False  

batch_of_10 = new_sequences[:10]
ts = predictions[:10]
predict(batch_of_10, ts)

True