In [85]:
import json

import numpy as np


class Feature:
    def __init__(
            self,
            vocab_path="../configs/vocab.json",
            feature_type_path="../configs/feature_type.json"
    ):
        with open(vocab_path) as f:
            self.vocab = json.load(f)

        with open(feature_type_path) as f:
            self.feature_vocab = json.load(f)

    def input_from_raw(self, raw_seq):
        features, duration = self.extract(raw_seq)

        return self.input_from_feature(features)

    def input_from_feature(self, features):
        raise NotImplementedError

    def extract_key(self, sub_seq):
        raise NotImplementedError

    def agg_feature(self, feature, features=None):
        raise NotImplementedError

    def extract(self, raw_seq):
        raw_seq = Feature.clean_raw_seq(raw_seq)
        duration = (raw_seq[-1]['time'] - raw_seq[0]['time'])

        features = None
        for i in range(len(raw_seq)):
            features = self.agg_feature(self.extract_key(raw_seq[i:]), features)

        return features, duration

    @staticmethod
    def clean_raw_seq(data):
        data = sorted(data, key=lambda x: x["time"])

        i = 0
        while i < len(data):
            if "keycode" not in data[i]:
                data.pop(i)
                continue
            i += 1

        return data


class AnonymousSeqFeature(Feature):
    def input_from_feature(self, features, duration=None, norm=None):
        steps = []

        for feature in features:
            step = [None for _ in feature]
            step[self.feature_vocab["DD"]["index"]] = feature["DD"]
            step[self.feature_vocab["DU"]["index"]] = feature["DU"]
            step[self.feature_vocab["UD"]["index"]] = feature["UD"]
            step[self.feature_vocab["UU"]["index"]] = feature["UU"]
            step[self.feature_vocab["Hold"]["index"]] = feature["Hold"]

            steps.append(step)

        steps = np.array(steps)
        if np.max(steps) > 1500:
            return

        # normalize
        # steps = steps * len(steps) / duration

        if norm == 'max':
            res = steps / np.max(steps)
        elif norm == 'min_max':
            res = (steps - np.min(steps)) / (np.max(steps) - np.min(steps))
        elif norm == 'none':
            res = steps / 1000.
        else:
            res = steps
#             raise ValueError("Must norm")

        if np.isnan(res).any():
            print(steps)
        return res

    def extract_key(self, sub_seq):
        features = dict()
        source_down = {}
        source_up = {}
        target_down = {}
        target_up = {}

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if not source_down:
                    source_down = step
                    continue

                if not target_down:
                    target_down = step
                    continue

            if step["type"] == "up":
                if step_idx == 0:
                    return {}

                if (not source_up) and source_down and (step["keycode"] == source_down["keycode"]):
                    source_up = step
                    continue

                if (not target_up) and target_down and (step["keycode"] == target_down["keycode"]):
                    target_up = step
                    continue

            if source_down and source_up and target_down and target_up:
                break

        if (not source_down) or (not source_up) or (not target_down) or (not target_up):
            return {}

        features["DD"] = target_down["time"] - source_down["time"]
        features["DU"] = target_up["time"] - source_down["time"]
        features["UD"] = target_down["time"] - source_up["time"]
        features["UU"] = target_up["time"] - source_up["time"]
        features["Hold"] = source_up["time"] - source_down["time"]

        return features

    def agg_feature(self, feature, features=None):
        if not features:
            features = list()

        if feature:
            features.append(feature)

        return features


In [86]:
data_dir = "/home/hoang/Downloads/data/PolitehnicaUniversityTimisoaraKeystrokeDataSet/"

In [87]:
import os

In [88]:
for user_file in sorted(os.listdir(data_dir)):
    user_file = os.path.join(data_dir, user_file)
    with open(user_file) as f:
        data = [list(map(int, line.strip().split())) for line in f]
    new_data = []
    for ls in data:
        if ls[1] == 0:
            typing = 'down'
        elif ls[1] == 1:
            typing = 'up'
        else:
            raise
        d = {'keycode': ls[0], 'type': typing, 'time': ls[2]}
        new_data.append(d)
    data = new_data
    
    # split data
    new_data = []
    action = []
    prev = None
    
    for d in data:
        if prev is not None:
            if d['time'] - prev['time'] <= 1500:
                action.append(d)
            else:
                new_data.append(action)
                action = []
        else:
            action.append(d)
        prev = d

    break

In [100]:
action

[{'keycode': 32, 'type': 'up', 'time': 1526360},
 {'keycode': 80, 'type': 'down', 'time': 1526490},
 {'keycode': 80, 'type': 'up', 'time': 1526597},
 {'keycode': 82, 'type': 'down', 'time': 1526600},
 {'keycode': 82, 'type': 'up', 'time': 1526687},
 {'keycode': 73, 'type': 'down', 'time': 1526737},
 {'keycode': 69, 'type': 'down', 'time': 1526818},
 {'keycode': 73, 'type': 'up', 'time': 1526869},
 {'keycode': 69, 'type': 'up', 'time': 1526939},
 {'keycode': 84, 'type': 'down', 'time': 1527019},
 {'keycode': 69, 'type': 'down', 'time': 1527127},
 {'keycode': 84, 'type': 'up', 'time': 1527128},
 {'keycode': 69, 'type': 'up', 'time': 1527215},
 {'keycode': 78, 'type': 'down', 'time': 1527276},
 {'keycode': 78, 'type': 'up', 'time': 1527398},
 {'keycode': 73, 'type': 'down', 'time': 1527401},
 {'keycode': 73, 'type': 'up', 'time': 1527478},
 {'keycode': 32, 'type': 'down', 'time': 1527539},
 {'keycode': 83, 'type': 'down', 'time': 1527634},
 {'keycode': 32, 'type': 'up', 'time': 1527636},


In [96]:
feature_extractor = AnonymousSeqFeature()

In [97]:
features, duration = feature_extractor.extract(action)

In [98]:
features, duration

([{'DD': 110, 'DU': 197, 'UD': 3, 'UU': 90, 'Hold': 107},
  {'DD': 137, 'DU': 269, 'UD': 50, 'UU': 182, 'Hold': 87},
  {'DD': 81, 'DU': 202, 'UD': -51, 'UU': 70, 'Hold': 132},
  {'DD': 201, 'DU': 310, 'UD': 80, 'UU': 189, 'Hold': 121},
  {'DD': 108, 'DU': 196, 'UD': -1, 'UU': 87, 'Hold': 109},
  {'DD': 149, 'DU': 271, 'UD': 61, 'UU': 183, 'Hold': 88},
  {'DD': 125, 'DU': 202, 'UD': 3, 'UU': 80, 'Hold': 122},
  {'DD': 138, 'DU': 235, 'UD': 61, 'UU': 158, 'Hold': 77},
  {'DD': 95, 'DU': 216, 'UD': -2, 'UU': 119, 'Hold': 97},
  {'DD': 100, 'DU': 171, 'UD': -21, 'UU': 50, 'Hold': 121},
  {'DD': 131, 'DU': 263, 'UD': 60, 'UU': 192, 'Hold': 71},
  {'DD': 134, 'DU': 186, 'UD': 2, 'UU': 54, 'Hold': 132},
  {'DD': 121, 'DU': 222, 'UD': 69, 'UU': 170, 'Hold': 52},
  {'DD': 131, 'DU': 252, 'UD': 30, 'UU': 151, 'Hold': 101},
  {'DD': 151, 'DU': 262, 'UD': 30, 'UU': 141, 'Hold': 121},
  {'DD': 184, 'DU': 298, 'UD': 73, 'UU': 187, 'Hold': 111},
  {'DD': 83, 'DU': 173, 'UD': -31, 'UU': 59, 'Hold': 11

In [94]:
inputs = feature_extractor.input_from_feature(features)
inputs, inputs.shape

(array([[107, 197, 110,  90,   3],
        [ 87, 269, 137, 182,  50],
        [132, 202,  81,  70, -51],
        [121, 310, 201, 189,  80],
        [109, 196, 108,  87,  -1],
        [ 88, 271, 149, 183,  61],
        [122, 202, 125,  80,   3],
        [ 77, 235, 138, 158,  61],
        [ 97, 216,  95, 119,  -2],
        [121, 171, 100,  50, -21],
        [ 71, 263, 131, 192,  60],
        [132, 186, 134,  54,   2],
        [ 52, 222, 121, 170,  69],
        [101, 252, 131, 151,  30],
        [121, 262, 151, 141,  30],
        [111, 298, 184, 187,  73],
        [114, 173,  83,  59, -31],
        [ 90, 582, 517, 492, 427],
        [ 65, 124,  62,  59,  -3],
        [ 62, 876, 791, 814, 729],
        [ 85, 286, 195, 201, 110],
        [ 91, 201, 115, 110,  24]]),
 (22, 5))