In [5]:
import json
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import tensorflow as tf
import os

In [6]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # List logical devices only AFTER every physical GPU is configured:
        # listing them initialises the runtime, and memory growth can no
        # longer be changed afterwards (it would raise on the second GPU).
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [7]:
# Scratch check of column-wise minimum and broadcasting on a small 2x5 array.
x = np.array([
    [1, 2, 3, 8, 5],
    [4, 5, 6, 7, -1],
])
# Intermediate expressions (not displayed; only the last expression of a cell is shown).
x, x.min(axis=0)
x - x.min(axis=0)
x.shape

(2, 5)

In [2]:
import json

import numpy as np


class Feature:
    """Base class for keystroke-timing feature extractors.

    Subclasses provide three hooks:

    * ``extract_key(sub_seq)`` - features for the events at the head of
      ``sub_seq``;
    * ``agg_feature(feature, features)`` - fold one ``extract_key`` result into
      the running aggregate;
    * ``input_from_feature(features)`` - convert the aggregate into a model
      input.
    """

    def __init__(
            self,
            vocab_path="../configs/vocab.json",
            feature_type_path="../configs/feature_type.json"
    ):
        """Load the JSON configuration used by the subclasses.

        Args:
            vocab_path: JSON file loaded into ``self.vocab``.
            feature_type_path: JSON file loaded into ``self.feature_vocab``.
        """
        with open(vocab_path) as f:
            self.vocab = json.load(f)

        with open(feature_type_path) as f:
            self.feature_vocab = json.load(f)

    def input_from_raw(self, raw_seq):
        """Run ``extract`` on ``raw_seq`` and convert the result to a model input.

        Args:
            raw_seq: iterable of event dicts (see ``extract``).

        Returns:
            Whatever the subclass's ``input_from_feature`` returns.
        """
        # extract() returns a (features, duration) pair; only the aggregated
        # features are forwarded, not the whole tuple.
        features, _duration = self.extract(raw_seq)

        return self.input_from_feature(features)

    def input_from_feature(self, features):
        """Convert aggregated features into a model input (subclass hook)."""
        raise NotImplementedError

    def extract_key(self, sub_seq):
        """Extract the features for the head of ``sub_seq`` (subclass hook)."""
        raise NotImplementedError

    def agg_feature(self, feature, features=None):
        """Fold ``feature`` into the aggregate ``features`` (subclass hook)."""
        raise NotImplementedError

    def extract(self, raw_seq):
        """Aggregate ``extract_key`` over every suffix of the cleaned sequence.

        Args:
            raw_seq: iterable of event dicts, each with a ``"time"`` entry;
                events without a ``"keycode"`` entry are discarded.

        Returns:
            Tuple ``(features, duration)`` where ``features`` is the result of
            repeatedly applying ``agg_feature`` and ``duration`` is the time
            difference between the last and first cleaned event.

        Raises:
            IndexError: if no event remains after cleaning.
        """
        raw_seq = Feature.clean_raw_seq(raw_seq)
        duration = (raw_seq[-1]['time'] - raw_seq[0]['time'])

        features = None
        for i in range(len(raw_seq)):
            features = self.agg_feature(self.extract_key(raw_seq[i:]), features)

        return features, duration

    @staticmethod
    def clean_raw_seq(data):
        """Return the events sorted by ``"time"``, dropping those without ``"keycode"``.

        The input is not modified; a new list is returned.
        """
        # Single pass instead of repeated list.pop(i), which is quadratic.
        return [event for event in sorted(data, key=lambda x: x["time"]) if "keycode" in event]


class MatrixFeature(Feature):
    """Key-pair timing features laid out as a (key, key, feature-type) tensor."""

    def input_from_feature(self, features):
        """Average the aggregated timings into a dense float32 tensor.

        Args:
            features: dict mapping names such as ``"<src>_<dst>_DD"`` or
                ``"<src>_Hold"`` to lists of timing values (the output of
                ``agg_feature``).

        Returns:
            Array of shape ``(len(vocab), len(vocab), 5)`` holding, per cell,
            the mean of the retained samples divided by 1000. Cells without
            data stay at 0.
        """
        vocab_size = len(self.vocab)
        matrix = np.zeros((vocab_size, vocab_size, 5), dtype=np.float32)

        for name, samples in features.items():
            parts = name.split('_')
            feature_type = parts[-1]
            source_key = parts[0]
            # Hold timings involve one key only, so they land on the diagonal.
            target_key = parts[1] if feature_type != "Hold" else source_key

            # Keep strictly positive samples below the configured maximum.
            kept = [
                sample for sample in samples
                if (sample < self.feature_vocab[feature_type]["max"]) and (sample > 0)
            ]

            if not kept or source_key not in self.vocab or target_key not in self.vocab:
                continue

            row = self.vocab[source_key]
            col = self.vocab[target_key]
            depth = self.feature_vocab[feature_type]["index"]
            matrix[row, col, depth] = np.mean(np.array(kept))

        return matrix / 1000.

    def extract_key(self, sub_seq):
        """Compute the timings between the first two key presses of ``sub_seq``.

        The first "down" event is the source key and the second "down" event is
        the target key; each is paired with the first later "up" event carrying
        the same keycode.

        Returns:
            Dict with the entries ``<src>_<dst>_DD``, ``_DU``, ``_UD``, ``_UU``
            and ``<src>_Hold`` (time differences), or ``{}`` when any of the
            four events is missing.
        """
        source_down = source_up = target_down = target_up = None

        for step in sub_seq:
            kind = step["type"]
            if kind == "down":
                if not source_down:
                    source_down = step
                    continue

                if not target_down:
                    target_down = step
                    continue

            elif kind == "up":
                if source_down and (not source_up) and (step["keycode"] == source_down["keycode"]):
                    source_up = step
                    continue

                if target_down and (not target_up) and (step["keycode"] == target_down["keycode"]):
                    target_up = step
                    continue

            # Everything needed has been found; stop scanning.
            if source_down and source_up and target_down and target_up:
                break

        if not (source_down and source_up and target_down and target_up):
            return {}

        return {
            "{}_{}_DD".format(source_down["keycode"], target_down["keycode"]):
                target_down["time"] - source_down["time"],
            "{}_{}_DU".format(source_down["keycode"], target_up["keycode"]):
                target_up["time"] - source_down["time"],
            "{}_{}_UD".format(source_up["keycode"], target_down["keycode"]):
                target_down["time"] - source_up["time"],
            "{}_{}_UU".format(source_up["keycode"], target_up["keycode"]):
                target_up["time"] - source_up["time"],
            "{}_Hold".format(source_down["keycode"]):
                source_up["time"] - source_down["time"],
        }

    def agg_feature(self, feature, features=None):
        """Append every value of ``feature`` to the per-name lists in ``features``.

        A falsy ``features`` (``None`` or empty) starts a fresh dict. The
        aggregate is returned.
        """
        merged = features if features else dict()

        for name, sample in feature.items():
            merged.setdefault(name, []).append(sample)

        return merged


class StatsFeature(Feature):
    """Per-source-key mean and standard deviation of the timing features."""

    def input_from_feature(self, features):
        """Summarise the aggregated timings as per-key mean/std tables.

        Args:
            features: dict mapping names such as ``"<src>_<dst>_DD"`` or
                ``"<src>_Hold"`` to lists of timing values (the output of
                ``agg_feature``).

        Returns:
            Float32 array of shape ``(len(vocab), 10)``: five mean columns
            followed by five std columns, both divided by 1000. Cells without
            data stay at 0.
        """
        table_shape = (len(self.vocab), 5)
        mean_table = np.zeros(table_shape, dtype=np.float32)
        std_table = np.zeros(table_shape, dtype=np.float32)

        for name, samples in features.items():
            parts = name.split('_')
            source_key, feature_type = parts[0], parts[-1]

            # Keep strictly positive samples below the configured maximum.
            kept = [
                sample for sample in samples
                if (sample < self.feature_vocab[feature_type]["max"]) and (sample > 0)
            ]

            if not kept:
                continue

            if source_key not in self.vocab:
                continue

            kept = np.array(kept)
            # NOTE(review): the cell is addressed by source key and feature
            # type only, so entries for different target keys overwrite each
            # other and the last one iterated wins - confirm this is intended.
            cell = (self.vocab[source_key], self.feature_vocab[feature_type]["index"])
            mean_table[cell] = np.mean(kept)
            std_table[cell] = np.std(kept)

        return np.concatenate([mean_table / 1000., std_table / 1000.], axis=-1)

    def extract_key(self, sub_seq):
        """Compute the timings between the first two key presses of ``sub_seq``.

        The first "down" event is the source key and the second "down" event is
        the target key; each is paired with the first later "up" event carrying
        the same keycode.

        Returns:
            Dict with the entries ``<src>_<dst>_DD``, ``_DU``, ``_UD``, ``_UU``
            and ``<src>_Hold`` (time differences), or ``{}`` when any of the
            four events is missing.
        """
        source_down = None
        source_up = None
        target_down = None
        target_up = None

        for event in sub_seq:
            event_type = event["type"]

            if event_type == "down":
                if not source_down:
                    source_down = event
                    continue

                if not target_down:
                    target_down = event
                    continue

            elif event_type == "up":
                if source_down and (not source_up) and (event["keycode"] == source_down["keycode"]):
                    source_up = event
                    continue

                if target_down and (not target_up) and (event["keycode"] == target_down["keycode"]):
                    target_up = event
                    continue

            # All four events located; no need to look further.
            if source_down and source_up and target_down and target_up:
                break

        found_all = source_down and source_up and target_down and target_up
        if not found_all:
            return {}

        timings = {}
        timings["{}_{}_DD".format(source_down["keycode"], target_down["keycode"])] = (
            target_down["time"] - source_down["time"]
        )
        timings["{}_{}_DU".format(source_down["keycode"], target_up["keycode"])] = (
            target_up["time"] - source_down["time"]
        )
        timings["{}_{}_UD".format(source_up["keycode"], target_down["keycode"])] = (
            target_down["time"] - source_up["time"]
        )
        timings["{}_{}_UU".format(source_up["keycode"], target_up["keycode"])] = (
            target_up["time"] - source_up["time"]
        )
        timings["{}_Hold".format(source_down["keycode"])] = (
            source_up["time"] - source_down["time"]
        )

        return timings

    def agg_feature(self, feature, features=None):
        """Append every value of ``feature`` to the per-name lists in ``features``.

        A falsy ``features`` (``None`` or empty) starts a fresh dict. The
        aggregate is returned.
        """
        collected = features or dict()

        for name, sample in feature.items():
            collected.setdefault(name, []).append(sample)

        return collected


class AnonymousSeqFeature(Feature):
    """Sequence of per-key-pair timing vectors that carries no keycodes."""

    def input_from_feature(self, features, duration=None, norm='none'):
        """Stack the per-step timing dicts into an array and normalise it.

        ``duration`` and ``norm`` have defaults so that the one-argument call
        made by ``Feature.input_from_raw`` and two-argument calls keep working.

        Args:
            features: list of dicts with the keys ``DD``, ``DU``, ``UD``,
                ``UU`` and ``Hold`` (the output of ``agg_feature``).
            duration: accepted for interface compatibility; currently unused.
            norm: ``'max'`` divides by the global maximum, ``'min_max'``
                rescales with the global min/max, ``'none'`` divides by 1000.

        Returns:
            NumPy array with one row per step; column order is given by
            ``self.feature_vocab[<name>]["index"]``.

        Raises:
            ValueError: if ``norm`` is not one of the supported modes.
        """
        steps = []

        for feature in features:
            step = [None] * len(feature)
            for name in ("DD", "DU", "UD", "UU", "Hold"):
                step[self.feature_vocab[name]["index"]] = feature[name]

            steps.append(step)

        steps = np.array(steps)

        if norm == 'max':
            res = steps / np.max(steps)
        elif norm == 'min_max':
            res = (steps - np.min(steps)) / (np.max(steps) - np.min(steps))
        elif norm == 'none':
            res = steps / 1000.
        else:
            raise ValueError("Must norm")

        # A zero denominator above yields NaN; dump the raw steps to help debugging.
        if np.isnan(res).any():
            print(steps)
        return res

    def extract_key(self, sub_seq):
        """Compute the timings between the first two key presses of ``sub_seq``.

        The first "down" event is the source key and the second "down" event is
        the target key; each is paired with the first later "up" event carrying
        the same keycode.

        Returns:
            Dict with the keys ``DD``, ``DU``, ``UD``, ``UU`` and ``Hold``
            (time differences), or ``{}`` when ``sub_seq`` starts with an "up"
            event or any of the four events is missing.
        """
        source_down = {}
        source_up = {}
        target_down = {}
        target_up = {}

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if not source_down:
                    source_down = step
                    continue

                if not target_down:
                    target_down = step
                    continue

            if step["type"] == "up":
                # A suffix that begins with a release does not start a key pair.
                if step_idx == 0:
                    return {}

                if (not source_up) and source_down and (step["keycode"] == source_down["keycode"]):
                    source_up = step
                    continue

                if (not target_up) and target_down and (step["keycode"] == target_down["keycode"]):
                    target_up = step
                    continue

            if source_down and source_up and target_down and target_up:
                break

        if (not source_down) or (not source_up) or (not target_down) or (not target_up):
            return {}

        return {
            "DD": target_down["time"] - source_down["time"],
            "DU": target_up["time"] - source_down["time"],
            "UD": target_down["time"] - source_up["time"],
            "UU": target_up["time"] - source_up["time"],
            "Hold": source_up["time"] - source_down["time"],
        }

    def agg_feature(self, feature, features=None):
        """Append a non-empty ``feature`` dict to the list ``features``.

        A falsy ``features`` (``None`` or empty) starts a fresh list. The list
        is returned.
        """
        if not features:
            features = list()

        if feature:
            features.append(feature)

        return features


In [10]:
sequence_feature = AnonymousSeqFeature()
# NOTE(review): `count` is never incremented in this cell, so the final print
# always shows 0 - confirm what it was meant to count.
count = 0

from tqdm import tqdm

# Scan each user's first sequence and stop at the first one whose scaled
# timings (raw value / 1000 with norm='none') exceed 10, printing it for inspection.
with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/all_by_user.json") as f:
    for idx, line in tqdm(enumerate(f)):
        line = json.loads(line)
        try:
            data = line['sequences'][0]
        except (KeyError, IndexError):
            # No first sequence for this user: skip it explicitly instead of
            # silently reusing the previous user's `data`.
            continue

        # Flatten each key record into a separate "down" and "up" event.
        raw_data = []
        for d in data:
            raw_data.append({'time': d['press_time'], 'keycode': d['keycode'], 'type': 'down'})
            raw_data.append({'time': d['release_time'], 'keycode': d['keycode'], 'type': 'up'})
        if not raw_data:
            # extract() indexes the first/last event and would fail on an empty sequence.
            continue
        raw_data = sorted(raw_data, key=lambda x: x['time'])

        anchor, duration = sequence_feature.extract(raw_data)
        anchor_np = sequence_feature.input_from_feature(anchor, duration, norm='none')
        # np.max raises on an empty array (no complete key pair found).
        if anchor_np.size and np.max(anchor_np) > 10:
            print(raw_data, anchor)
            break
print(count)

264it [00:00, 1576.24it/s]

[{'time': 1476530255679.0, 'keycode': 16, 'type': 'down'}, {'time': 1476530256194.0, 'keycode': 73, 'type': 'down'}, {'time': 1476530256423.0, 'keycode': 73, 'type': 'up'}, {'time': 1476530256428.0, 'keycode': 16, 'type': 'up'}, {'time': 1476530256760.0, 'keycode': 70, 'type': 'down'}, {'time': 1476530256856.0, 'keycode': 70, 'type': 'up'}, {'time': 1476530256998.0, 'keycode': 32, 'type': 'down'}, {'time': 1476530257313.0, 'keycode': 32, 'type': 'up'}, {'time': 1476530258413.0, 'keycode': 72, 'type': 'down'}, {'time': 1476530258532.0, 'keycode': 72, 'type': 'up'}, {'time': 1476530258688.0, 'keycode': 69, 'type': 'down'}, {'time': 1476530258808.0, 'keycode': 69, 'type': 'up'}, {'time': 1476530258887.0, 'keycode': 32, 'type': 'down'}, {'time': 1476530259458.0, 'keycode': 32, 'type': 'up'}, {'time': 1476530259552.0, 'keycode': 16, 'type': 'down'}, {'time': 1476530260090.0, 'keycode': 80, 'type': 'down'}, {'time': 1476530260190.0, 'keycode': 80, 'type': 'up'}, {'time': 1476530260209.0, 'ke




In [7]:
# Display the current value of `count`.
# NOTE(review): the scan cell above sets count = 0 and never changes it, so the
# recorded output presumably comes from an earlier version of that cell - confirm.
count

141

In [12]:
def _flatten_sequence(data):
    """Turn one recorded sequence into time-sorted down/up events plus its text."""
    raw_data = []
    raw_text = None
    for d in data:
        raw_data.append({'time': d['press_time'], 'keycode': d['keycode'], 'type': 'down'})
        raw_data.append({'time': d['release_time'], 'keycode': d['keycode'], 'type': 'up'})
        if raw_text is None:
            raw_text = d['text']

    raw_data.sort(key=lambda x: x['time'])
    return raw_data, raw_text


def _save_split(scenario, X_parts, Y_parts):
    """Concatenate the per-user arrays and write ``X_<scenario>.npy`` / ``Y_<scenario>.npy``."""
    X_all = np.concatenate(X_parts)
    Y_all = np.concatenate(Y_parts)

    np.save("X_{}.npy".format(scenario), X_all)
    np.save("Y_{}.npy".format(scenario), Y_all)

    print("Save data", X_all.shape, Y_all.shape)


def gen(label_idx=0, n_classes=50000, scenario="train",
        data_path="/media/hoang/Data/keystroke_dataset/Keystrokes/features/all_by_user.json",
        norm="none"):
    """Build a padded feature/label dataset from the per-user JSON-lines file.

    Relies on the notebook-level ``sequence_feature`` extractor. Lines whose
    index is below the current label are skipped; each accepted user gets the
    current ``label_idx`` as label, after which ``label_idx`` is incremented.

    Args:
        label_idx: first file line to consider and label of the first accepted user.
        n_classes: stop (and save) once ``label_idx`` reaches this value.
        scenario: suffix of the output files ``X_<scenario>.npy`` / ``Y_<scenario>.npy``.
        data_path: JSON-lines file with one user per line.
        norm: normalisation mode forwarded to ``sequence_feature.input_from_feature``.

    Raises:
        ValueError: if ``n_classes`` is not greater than ``label_idx``.
    """
    if n_classes <= label_idx:
        # The stop condition below is an equality test and could never fire.
        raise ValueError("n_classes must be greater than label_idx")

    min_sample = 10    # minimum sequences per user, before and after filtering
    max_len = 70       # padded / truncated sequence length
    pad_value = 0.
    error_rate = 0.2   # tolerated mismatch between event count and 2 * len(text)

    X_train = []
    Y_train = []

    with open(data_path) as f:
        for idx, line in tqdm(enumerate(f)):
            if idx < label_idx:
                continue

            # each user
            line = json.loads(line)
            if len(line['sequences']) < min_sample:
                continue

            X = []
            # each sequence of user
            for data in line['sequences']:
                raw_data, raw_text = _flatten_sequence(data)
                if raw_text is None:
                    # Empty sequence: nothing to compare or extract.
                    continue

                # compare raw data/text: expect about two events per typed character
                expected = len(raw_text) * 2
                if expected * (1 - error_rate) <= len(raw_data) <= expected * (1 + error_rate):
                    feature, duration = sequence_feature.extract(raw_data)
                    X.append(sequence_feature.input_from_feature(feature, duration, norm))

            # Padding keeps one row per sequence, so the size check can run first.
            if len(X) < min_sample:
                continue

            X = tf.keras.preprocessing.sequence.pad_sequences(X,
                                                              padding="pre",
                                                              value=pad_value,
                                                              maxlen=max_len,
                                                              dtype="float")
            X = np.array(X)
            Y = np.full(len(X), label_idx)

            X_train.append(X)
            Y_train.append(Y)

            label_idx += 1
            if label_idx == n_classes:
                _save_split(scenario, X_train, Y_train)
                return

    # The file ended before n_classes was reached: keep what was collected
    # instead of silently discarding it.
    if X_train:
        print("End of file reached at label {} (requested up to {}); saving partial data".format(
            label_idx, n_classes))
        _save_split(scenario, X_train, Y_train)
    else:
        print("No user satisfied the filters; nothing saved")

In [13]:
# Build the training split: start at file line 0, stop when the label counter
# reaches 50000; writes X_train.npy / Y_train.npy.
gen(label_idx=0, n_classes=50000, scenario="train")

75082it [02:49, 443.72it/s]

Save data (616022, 70, 5) (616022,)





In [15]:
# Build the dev split: skip file lines before 128000 and label accepted users
# from 128000 up to 132999; writes X_dev.npy / Y_dev.npy.
gen(label_idx=128000, n_classes=5000+128000, scenario="dev")

135596it [00:20, 6735.89it/s] 

Save data (61513, 70, 5) (61513,)





In [16]:
# Build the test split: skip file lines before 148000 and label accepted users
# from 148000 up to 152999; writes X_test.npy / Y_test.npy.
gen(label_idx=148000, n_classes=5000+148000, scenario="test")

155540it [00:20, 7495.73it/s] 

Save data (61613, 70, 5) (61613,)



