# CSC490H5 Model.

---

I am trying to make the model that the researchers used in this paper: https://arxiv.org/pdf/1505.05779.pdf

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import re
import glob

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Connect to google drive

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [231]:
path_to_data = "/content/gdrive/My Drive/School Winter 2020/Csc490/data/"

watch_data = path_to_data + '*.csv'
keyboard_log = path_to_data + '*.log'

watch_acceleration = {}
keyboard_logs = {}

for file in glob.glob(watch_data):
  # So using glob messes with order, which is why we are doing this
  filename = int(file.split("/")[-1].strip(".csv").strip("watch_"))-1
  wa = [line.strip().split(",") for line in open(file) if line.strip().split(",") != '']
  watch_acceleration[filename] = wa

for file in glob.glob(keyboard_log):
  filename = int(file.split("/")[-1].strip(".log").strip("keys_"))-1
  kd = [line.strip() for line in open(file)]
  keyboard_logs[filename] = kd


# This is to fix the mess caused by glob
watch_acc = []
for i in range(len(watch_acceleration.keys())):
  watch_acc.extend(watch_acceleration[i])

print(len(watch_acc))

190241


Seperate the keyboard log into "key pressed" and "key up" events.

In [232]:
locations = {"left": 0, "center": 1, "right": 2}
special_keys = {"Shift": 1, "Control": 2, "Alt": 3, "None": 0}

all_keys_pressed = {}
all_keys_released = {}
all_keys_used = [] # Used later in one_hotting the keys
last_event = None

p = re.compile(r"index.html:[0-9]* (?P<timestamp>[0-9]*), (?P<event>[a-z]*), (?P<key>([a-zA-Z0-9]*|[^a-zA-Z0-9_])), (?P<location>(left|center|right))")

for index in keyboard_logs.keys():
  key_log = keyboard_logs[index]
  keys_pressed = []
  keys_released = []
  for line in key_log:
    s = p.search(line)

    timestamp = int(s.group("timestamp"))
    event     = s.group("event")
    key       = s.group("key")
    location  = s.group("location")

    # If the special keys were pressed with the others, then just add as a tag
    if key in ["Shift", "Control", "Alt"] and last_event == "keypress":
      keys_pressed[-1][-1] = special_keys[key]
      continue 

    last_event = event

    if event == "keypress":
      # Timestamp, key side, key
      keys_pressed.append([timestamp, key, locations[location], 0]) # Last is for special keys
    elif event == "keyup" and key not in ["Shift", "Control", "Alt", "Backspace", "ArrowLeft", "ArrowRight"]:
      # For now ignore these, figuring out what to do with them is a later problem
      keys_released.append([timestamp, key, locations[location], 0])
  all_keys_pressed[index] = keys_pressed
  all_keys_released[index] = keys_released

# We would like to start measuring from when our first key is let go 
# Because that is how we measure sequences

keys_pressed = []
keys_released = []

for index in all_keys_pressed.keys():
  kp = all_keys_pressed[index]
  keys_pressed.extend(kp[1:])

for index in all_keys_released.keys():
  kr = all_keys_released[index]
  keys_released.extend(kr[0:-1])

print(len(keys_pressed))    # 10473
print(len(keys_released))   # 10473

10473
10473


Go through all our data and split up the watch data by the timestamp from the keyboard log. 

Each sequence begins when the first key is lifted, and ends when the next key is pressed. 

Map each sequence to a key.

In [233]:
sequences = []
predictions = []
count = 0

copy_acc = watch_acc

for i in range(len(keys_pressed)):

  if count > 10:
    break 
  start = int(keys_released[i][0])
  end = int(keys_pressed[i][0])

  key = keys_pressed[i][1]
  loc = keys_pressed[i][2] #Ignore for now

  sequence = []

  while len(copy_acc) != 0:
    # We want to remove the line so we dont have to iterate trough everything again
    line = copy_acc.pop(0)
    if line == ['']:
      continue 

    time, acc_x, acc_y, acc_z = line[0], line[1], line[2], line[3]

    current_time = int(time)

    if current_time < start:
      continue 
    if current_time >= end:
      break 

    sequence.append([float(acc_x), float(acc_y), float(acc_z)])
  predictions.append(key)
  sequences.append(sequence)

# We want to know how much data we have
# Should match up with how many keys_pressed we have
print(len(sequences))
print(len(predictions)) 
print(sequences[:5])
print()

10473
10473
[[[0.07492512464523315, 0.010624483227729797, 0.06074047088623047], [0.023304224014282227, -0.0027136504650115967, -0.0067768096923828125], [-0.025712013244628906, -0.01874162256717682, 0.02060467004776001], [0.04109373688697815, -0.024065181612968445, 0.022174596786499023], [-0.012615591287612915, -0.04327927529811859, -0.03681755065917969], [-0.02747499942779541, -0.03363946080207825, -0.026925265789031982], [0.01081821322441101, -0.035822898149490356, -0.0064498186111450195]], [[0.01199999451637268, 0.042885422706604004, -0.002495288848876953], [0.008670210838317871, 0.043125420808792114, -0.006142079830169678], [0.004594326019287109, 0.012444019317626953, -0.007005810737609863], [-0.003905355930328369, 0.003743290901184082, -0.002028822898864746], [-0.003909558057785034, 0.01520344614982605, -0.009246468544006348], [-0.003146350383758545, 0.02301722764968872, -0.009371042251586914], [-0.00883626937866211, 0.003609389066696167, -0.007367610931396484], [-0.008556425571441

Now we pad the shorter sequences with [0, 0, 0] to match the length of the longest sequence.

In [259]:
print(max(sequences,key=len))
max_len = len(max(sequences,key=len))
print(max_len)

padded_sequences = []
for sequence in sequences:
  while len(sequence) < max_len:
     sequence.append([0, 0, 0])
  np.stack(sequence)
  padded_sequences.append(sequence)
np_sequences = np.stack(padded_sequences)

print(np_sequences.shape)

[[0.07492512464523315, 0.010624483227729797, 0.06074047088623047], [0.023304224014282227, -0.0027136504650115967, -0.0067768096923828125], [-0.025712013244628906, -0.01874162256717682, 0.02060467004776001], [0.04109373688697815, -0.024065181612968445, 0.022174596786499023], [-0.012615591287612915, -0.04327927529811859, -0.03681755065917969], [-0.02747499942779541, -0.03363946080207825, -0.026925265789031982], [0.01081821322441101, -0.035822898149490356, -0.0064498186111450195], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], 

Make onehot our preditions 

In [0]:
# TODO - Maybe implement this later
def make_onehot(indicies, total=250):
    """
    Convert indicies into one-hot vectors by
        1. Creating an identity matrix of shape [total, total]
        2. Indexing the appropriate columns of that identity matrix
    """
    I = np.eye(total)
    return I[indicies]

unique_pred = np.unique(predictions)
num_unique = np.unique(predictions).shape[0]
ascii_predicts = [ord(a) for a in predictions]
predicted_keys = make_onehot(predictions, num_unique)
print(predicted_keys)

Now we make our classifier

It should take in a vector thats **N * M * 3**

Where N = Number of squences and M = Sequence length

In [261]:
# Our shape is (10473, 256, 3)

# So sklearn expects 2d arrays... gotta reshape
N, nx, ny = np_sequences.shape
new_sequences = np_sequences.reshape((N,nx*ny))

# Split into train, validation, and test sets
# I will do about 70 train and 30 test
train_data, test_data = new_sequences[:7730,:], new_sequences[7730:,:]
print (train_data.shape)
print (test_data.shape)
train_ts, test_ts = predictions[:7730], predictions[7730:]

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=200)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(train_data,train_ts)

test_ys=clf.predict(test_data)

#for prediction in test_ys:
#  print (prediction, test_ts)


# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(test_ts, test_ys))

(7730, 768)
(2743, 768)
Accuracy: 0.1866569449507838


In [0]:
# Ignore this for not, I might have to come back to it later if the rfc doesnt work

from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

data_dim = 16
timesteps = 8
num_classes = 10

# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32))  # return a single vector of dimension 32
model.add(Dense(10, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Generate dummy training data
x_train = np.random.random((1000, timesteps, data_dim))
y_train = np.random.random((1000, num_classes))

# Generate dummy validation data
x_val = np.random.random((100, timesteps, data_dim))
y_val = np.random.random((100, num_classes))

model.fit(x_train, y_train,
          batch_size=64, epochs=5,
          validation_data=(x_val, y_val))


