# CSC490H5 Model.

---

I am trying to make the model that the researchers used in this paper: https://arxiv.org/pdf/1505.05779.pdf

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import re
import glob

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Connect to google drive

In [87]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [95]:
path_to_data = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/"

watch_data = path_to_data + '*.csv'
keyboard_log = path_to_data + '*.log'

watch_acceleration = {}
keyboard_logs = {}

for file in glob.glob(watch_data):
  # So using glob messes with order, which is why we are doing this
  filename = int(file.split("/")[-1].strip(".csv").strip("watch_"))-1
  wa = [line.strip().split(",") for line in open(file) if line.strip().split(",") != '']
  watch_acceleration[filename] = wa

for file in glob.glob(keyboard_log):
  filename = int(file.split("/")[-1].strip(".log").strip("keys_"))-1
  kd = [line.strip() for line in open(file)]
  keyboard_logs[filename] = kd


# This is to fix the mess caused by glob
watch_acc = []
for i in range(len(watch_acceleration.keys())):
  watch_acc.extend(watch_acceleration[i])

print(len(watch_acc))

190241


Seperate the keyboard log into "key pressed" and "key up" events.

In [96]:
locations = {"left": 0, "center": 1, "right": 2}
special_keys = {"Shift": 1, "Control": 2, "Alt": 3, "None": 0}

all_keys_pressed = {}
all_keys_released = {}
all_keys_used = [] # Used later in one_hotting the keys
last_event = None

p = re.compile(r"index.html:[0-9]* (?P<timestamp>[0-9]*), (?P<event>[a-z]*), (?P<key>([a-zA-Z0-9]*|[^a-zA-Z0-9_])), (?P<location>(left|center|right))")

for index in keyboard_logs.keys():
  key_log = keyboard_logs[index]
  keys_pressed = []
  keys_released = []
  for line in key_log:
    s = p.search(line)

    timestamp = int(s.group("timestamp"))
    event     = s.group("event")
    key       = s.group("key")
    location  = s.group("location")

    # If the special keys were pressed with the others, then just add as a tag
    if key in ["Shift", "Control", "Alt"] and last_event == "keypress":
      keys_pressed[-1][-1] = special_keys[key]
      continue 

    last_event = event

    if event == "keypress":
      # Timestamp, key side, key
      keys_pressed.append([timestamp, key, locations[location], 0]) # Last is for special keys
    elif event == "keyup" and key not in ["Shift", "Control", "Alt", "Backspace", "ArrowLeft", "ArrowRight"]:
      # For now ignore these, figuring out what to do with them is a later problem
      keys_released.append([timestamp, key, locations[location], 0])
  all_keys_pressed[index] = keys_pressed
  all_keys_released[index] = keys_released

# We would like to start measuring from when our first key is let go 
# Because that is how we measure sequences

keys_pressed = []
keys_released = []

for index in all_keys_pressed.keys():
  kp = all_keys_pressed[index]
  keys_pressed.extend(kp[1:])

for index in all_keys_released.keys():
  kr = all_keys_released[index]
  keys_released.extend(kr[0:-1])

print(len(keys_pressed))    # 10473
print(len(keys_released))   # 10473

10473
10473


Go through all our data and split up the watch data by the timestamp from the keyboard log. 

Each sequence begins when the first key is lifted, and ends when the next key is pressed. 

Map each sequence to a key.

In [0]:
sequences = []
predictions = []
predictions_loc = []
count = 0

copy_acc = watch_acc

for i in range(len(keys_pressed)):

  if count > 10:
    break 
  start = int(keys_released[i][0])
  end = int(keys_pressed[i][0])

  key = keys_pressed[i][1]
  loc = keys_pressed[i][2] #Ignore for now

  sequence = []

  while len(copy_acc) != 0:
    # We want to remove the line so we dont have to iterate trough everything again
    line = copy_acc.pop(0)
    if line == ['']:
      continue 

    time, acc_x, acc_y, acc_z = line[0], line[1], line[2], line[3]

    current_time = int(time)

    if current_time < start:
      continue 
    if current_time >= end:
      break 

    sequence.append([float(acc_x), float(acc_y), float(acc_z)])
  predictions.append(key)
  predictions_loc.append(loc)
  sequences.append(sequence)

# We want to know how much data we have
# Should match up with how many keys_pressed we have
print(len(sequences))
print(len(predictions)) 
print(sequences[:5])
print()

Now we pad the shorter sequences with [0, 0, 0] to match the length of the longest sequence.

In [0]:
print(max(sequences,key=len))
max_len = len(max(sequences,key=len))
print(max_len)

padded_sequences = []
for sequence in sequences:
  while len(sequence) < max_len:
     sequence.append([0, 0, 0])
  np.stack(sequence)
  padded_sequences.append(sequence)
np_sequences = np.stack(padded_sequences)

print(np_sequences.shape)

Now we make our classifier

It should take in a vector thats **N * M * 3**

Where N = Number of squences and M = Sequence length

In [66]:
# Our shape is (10473, 256, 3)

# Ok weird the shape changed to (10473, 220, 3)

# So sklearn expects 2d arrays... gotta reshape
N, nx, ny = np_sequences.shape
new_sequences = np_sequences.reshape((N,nx*ny))

# Split into train, validation, and test sets
# I will do about 70 train and 30 test
train_data, test_data = new_sequences[:7730,:], new_sequences[7730:,:]
print (train_data.shape)
print (test_data.shape)
train_ts, test_ts = predictions[:7730], predictions[7730:]

#Create a Gaussian Classifier
rfc=RandomForestClassifier(n_estimators=150, max_features=0.15, min_samples_leaf=60, oob_score=True)

#Train the model using the training sets y_pred=clf.predict(X_test)
rfc.fit(train_data,train_ts)

test_ys=rfc.predict(test_data)


# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_ts, test_ys))
# Key Prediction = 19% accuracy
# Location Pred = 54.5 - 54.8%

(7730, 660)
(2743, 660)
Accuracy: 0.19248997448049582


Ok let's try a different model - rnn sequential

First we will have to convert the predictions into their numerical form

In [99]:
uniq_pred = np.unique(predictions_loc)
num_uniq = uniq_pred.shape[0]

pred_numbered = []
for key in predictions_loc:
  pred_numbered.append(list(uniq_pred).index(key))

uniq_pred2 = np.unique(pred_numbered)
num_uniq2 = uniq_pred2.shape[0]

print(uniq_pred, num_uniq)
print(uniq_pred2, num_uniq2)

def make_onehot(indicies, total=250):
    """
    Convert indicies into one-hot vectors by
        1. Creating an identity matrix of shape [total, total]
        2. Indexing the appropriate columns of that identity matrix
    """
    I = np.eye(total)
    return I[indicies]

hot_predictions = make_onehot(pred_numbered, num_uniq2)
print (hot_predictions[:5])

[0 1 2] 3
[0 1 2] 3
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [100]:
from keras.models import Sequential
from keras.layers import Dense

# Split into train, validation, and test sets
# I will do about 76 train, 20 valid and 20 test

N, nx, ny = np_sequences.shape
new_sequences = np_sequences.reshape((N,nx*ny))

train_data, valid_data, test_data = new_sequences[:6283,:],  new_sequences[6283:8378,:], new_sequences[8378:,:]
print (train_data.shape)
print (valid_data.shape)
print (test_data.shape)
train_ts, valid_ts, test_ts = hot_predictions[:6283],  hot_predictions[6283:8378], hot_predictions[8378:]

dim = nx*ny

model = Sequential()
model.add(Dense(N, activation='relu', input_dim=dim))  # returns a sequence of vectors of dimension 32
#model.add(Dense(N, activation='relu'))  # returns a sequence of vectors of dimension 32
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_data, train_ts,
          batch_size=64, epochs=10,
          validation_data=(valid_data, valid_ts))


score = model.evaluate(test_data, test_ts, batch_size=64)
print(score)

(6283, 660)
(2095, 660)
(2095, 660)
Train on 6283 samples, validate on 2095 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[1.0567520599945632, 0.5632458229053561]
