In [None]:
import json
import random
import numpy as np


class Feature:
    def __init__(
            self,
            vocab_path="model/vocab.json",
            feature_type_path="model/feature_type.json"
    ):
        with open(vocab_path) as f:
            self.vocab = json.load(f)

        with open(feature_type_path) as f:
            self.feature_vocab = json.load(f)

    def input_from_raw(self, raw_seq):
        features = self.extract(raw_seq)

        return self.input_from_feature(features)

    def input_from_feature(self, features):
        raise NotImplementedError

    def extract_key(self, sub_seq):
        raise NotImplementedError

    def agg_feature(self, feature, features=None):
        raise NotImplementedError

    def extract(self, raw_seq):
        raw_seq = Feature.clean_raw_seq(raw_seq)

        features = None
        for i in range(len(raw_seq)):
            features = self.agg_feature(self.extract_key(raw_seq[i:]), features)

        return features

    @staticmethod
    def clean_raw_seq(data):
        data = sorted(data, key=lambda x: x["time"])

        i = 0
        while i < len(data):
            if "keycode" not in data[i]:
                data.pop(i)
                continue
            i += 1

        return data


class MatrixFeature(Feature):
    def input_from_feature(self, features):
        n_features = 5
        n_keycodes = len(self.vocab)
        feature_matrix = np.full((n_keycodes, n_keycodes, n_features), 0, dtype=np.float32)

        for key, value in features.items():
            key_items = key.split('_')
            source_key = key_items[0]
            feature_type = key_items[-1]
            target_key = source_key if feature_type == "Hold" else key_items[1]

            value = [item for item in value if (item < self.feature_vocab[feature_type]["max"]) and (item > 0)]

            if not value:
                continue

            if source_key not in self.vocab:
                continue

            if target_key not in self.vocab:
                continue

            value = np.array(value)
            feature_matrix[
                self.vocab[source_key], self.vocab[target_key], self.feature_vocab[feature_type]["index"]
            ] = np.mean(value)

        feature_matrix = feature_matrix / 1000.

        return feature_matrix

    def extract_key(self, sub_seq):
        features = dict()
        source_down = {}
        source_up = {}
        target_down = {}
        target_up = {}

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if not source_down:
                    source_down = step
                    continue

                if not target_down:
                    target_down = step
                    continue

            if step["type"] == "up":
                if step_idx == 0:
                    return {}
                
                if (not source_up) and source_down and (step["keycode"] == source_down["keycode"]):
                    source_up = step
                    continue

                if (not target_up) and target_down and (step["keycode"] == target_down["keycode"]):
                    target_up = step
                    continue

            if source_down and source_up and target_down and target_up:
                break

        if (not source_down) or (not source_up) or (not target_down) or (not target_up):
            return {}

        features["{}_{}_DD".format(
            source_down["keycode"],
            target_down["keycode"]
        )] = target_down["time"] - source_down["time"]

        features["{}_{}_DU".format(
            source_down["keycode"],
            target_up["keycode"]
        )] = target_up["time"] - source_down["time"]

        features["{}_{}_UD".format(
            source_up["keycode"],
            target_down["keycode"]
        )] = target_down["time"] - source_up["time"]

        features["{}_{}_UU".format(
            source_up["keycode"],
            target_up["keycode"]
        )] = target_up["time"] - source_up["time"]

        features["{}_Hold".format(
            source_down["keycode"]
        )] = source_up["time"] - source_down["time"]

        return features

    def agg_feature(self, feature, features=None):
        if not features:
            features = dict()

        for key, value in feature.items():
            features[key] = features.get(key, [])
            features[key].append(value)

        return features


class AnonymousSeqFeature(Feature):
    def input_from_feature(self, features):
        steps = []

        for feature in features:
            step = [None for _ in feature]
            step[self.feature_vocab["DD"]["index"]] = [feature[k] for k in feature if "DD" in k][0] 
            step[self.feature_vocab["DU"]["index"]] = [feature[k] for k in feature if "DU" in k][0] 
            step[self.feature_vocab["UD"]["index"]] = [feature[k] for k in feature if "UD" in k][0] 
            step[self.feature_vocab["UU"]["index"]] = [feature[k] for k in feature if "UU" in k][0] 
            step[self.feature_vocab["Hold"]["index"]] = [feature[k] for k in feature if "Hold" in k][0] 

            steps.append(step)

        return np.array(steps)/1000.

    def extract_key(self, sub_seq):
        features = dict()
        source_down = {}
        source_up = {}
        target_down = {}
        target_up = {}

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if not source_down:
                    source_down = step
                    continue

                if not target_down:
                    target_down = step
                    continue

            if step["type"] == "up":
                if step_idx == 0:
                    return {}

                if (not source_up) and source_down and (step["keycode"] == source_down["keycode"]):
                    source_up = step
                    continue

                if (not target_up) and target_down and (step["keycode"] == target_down["keycode"]):
                    target_up = step
                    continue

            if source_down and source_up and target_down and target_up:
                break

        if (not source_down) or (not source_up) or (not target_down) or (not target_up):
            return {}

        features["{}_{}_DD".format(
            source_down["keycode"],
            target_down["keycode"]
        )] = target_down["time"] - source_down["time"]

        features["{}_{}_DU".format(
            source_down["keycode"],
            target_up["keycode"]
        )] = target_up["time"] - source_down["time"]

        features["{}_{}_UD".format(
            source_up["keycode"],
            target_down["keycode"]
        )] = target_down["time"] - source_up["time"]

        features["{}_{}_UU".format(
            source_up["keycode"],
            target_up["keycode"]
        )] = target_up["time"] - source_up["time"]

        features["{}_Hold".format(
            source_down["keycode"]
        )] = source_up["time"] - source_down["time"]

        return features


    def agg_feature(self, feature, features=None):
        if not features:
            features = list()

        if feature:
            features.append(feature)

        return features

In [None]:
def read_data(file_path, start_idx, end_idx):
    data = []
    with open(file_path) as f:
        for idx, line in enumerate(f):
            if start_idx <= idx < end_idx:
                line = json.loads(line)
                data.append(line)
            if idx >= end_idx:
                break
    return data

In [None]:
# from collections import defaultdict
# from tqdm import tqdm


# matrix_feature = MatrixFeature()


# with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/all_by_user.json") as f:
#     for idx, line in tqdm(enumerate(f)):
#         line = json.loads(line)
#         feature_map = defaultdict(list)
#         for data in line['sequences']:
#             raw_data = []
#             for d in data:
#                 new_d = {'time': d['press_time'], 'keycode': d['keycode'], 'type': 'down'}
#                 raw_data.append(new_d)
#                 new_d = {'time': d['release_time'], 'keycode': d['keycode'], 'type': 'up'}
#                 raw_data.append(new_d)
#             raw_data = sorted(raw_data, key=lambda x: x['time'])
            
#             feat = matrix_feature.extract(raw_data)
#             for k, v in feat.items():
#                 feature_map[k].extend(v)
        
#         with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/matrix_by_user.json", "a") as fout:
#             fout.write("{}\n".format(json.dumps({"label": idx, "matrix": feature_map})))
# #         break

In [None]:
sequence_feature = AnonymousSeqFeature()

seed = 1000

with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/all_by_user.json") as f:
    for idx, line in enumerate(f):
        if idx == seed:
            line = json.loads(line)

            # anchor
            data = line['sequences'][random.randint(0,10)]
            raw_data = []
            for d in data:
                new_d = {'time': d['press_time'], 'keycode': d['keycode'], 'type': 'down'}
                raw_data.append(new_d)
                new_d = {'time': d['release_time'], 'keycode': d['keycode'], 'type': 'up'}
                raw_data.append(new_d)
            raw_data = sorted(raw_data, key=lambda x: x['time'])
            anchor = sequence_feature.extract(raw_data)

            break


with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/all_by_user.json") as f:
    for idx, line in enumerate(f):
        if idx == seed + 2:
            line = json.loads(line)

            # negative
            data = line['sequences'][random.randint(0,10)]
            raw_data = []
            for d in data:
                new_d = {'time': d['press_time'], 'keycode': d['keycode'], 'type': 'down'}
                raw_data.append(new_d)
                new_d = {'time': d['release_time'], 'keycode': d['keycode'], 'type': 'up'}
                raw_data.append(new_d)
            raw_data = sorted(raw_data, key=lambda x: x['time'])
            negative = sequence_feature.extract(raw_data)

            break

            
matrix_feature = MatrixFeature()
import random

with open("/media/hoang/Data/keystroke_dataset/Keystrokes/features/matrix_by_user.json") as f:
    for idx, line in enumerate(f):
        if idx == seed:
            line = json.loads(line)
            data = line['matrix']

            # positive
            positive = [{} for _ in range(len(anchor))]
            for i in range(len(anchor)):
                for k in anchor[i]:
                    positive[i][k] = random.choice(data[k])
            break

In [None]:
anchor

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

# numpy
anchor_np = sequence_feature.input_from_feature(anchor)
positive_np = sequence_feature.input_from_feature(positive)
negative_np = sequence_feature.input_from_feature(negative)

# embed
# cdist(model(padding(anchor_np)).numpy(), model(padding(positive_np)).numpy(), metric='cosine'), \
# cdist(model(padding(anchor_np)).numpy(), model(padding(negative_np)).numpy(), metric='cosine')

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from pathlib import Path
import numpy as np
import os

In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
model = tf.keras.models.load_model("ckpt/1664974974/serving/")
model.summary()

In [None]:
def padding(x):
    if x.shape[0] >= 70:
        x = x[:70,:]
    else:
        pad = np.zeros((70,5))
        pad[:x.shape[0],:] = x
        x = pad
    return np.expand_dims(x, 0)

In [1]:
import numpy as np

In [4]:
features = np.random.rand(10, 16)
features

array([[0.04400676, 0.4980518 , 0.336619  , 0.74284246, 0.32142784,
        0.656133  , 0.5189083 , 0.71966972, 0.54544263, 0.51767995,
        0.03481792, 0.87794271, 0.47146673, 0.9382151 , 0.44326373,
        0.54187053],
       [0.54019633, 0.55392904, 0.54982311, 0.84319384, 0.79291673,
        0.6303543 , 0.60272158, 0.29576317, 0.35308013, 0.32921348,
        0.7115316 , 0.22646623, 0.41510773, 0.27470998, 0.85707284,
        0.46064789],
       [0.53129733, 0.8961459 , 0.51208858, 0.01837355, 0.62275407,
        0.64267983, 0.20646536, 0.64995602, 0.34888213, 0.38876181,
        0.0454567 , 0.67550253, 0.47426781, 0.03552144, 0.35247588,
        0.46628131],
       [0.70492968, 0.61101062, 0.84625606, 0.7052156 , 0.11790961,
        0.23426823, 0.65426764, 0.81134102, 0.59902489, 0.6027204 ,
        0.94284562, 0.73486196, 0.42813356, 0.37660165, 0.12340125,
        0.37893834],
       [0.2740366 , 0.84965522, 0.81626292, 0.9443809 , 0.44195122,
        0.75781647, 0.42953373, 

In [5]:
from scipy.spatial.distance import cdist

In [6]:
dist = cdist(features, features, metric='cosine')

In [9]:
labels = np.array([1,1,1,2,2,2,3,3,3,3])
labels.shape

(10,)

In [10]:
dist.shape

(10, 10)

In [11]:
for label in np.unique(labels):
    positive = vectors[label == labels]
    negative = vectors[label != labels]
    
    d_positive = cdist(positive, positive, metric=metric)
    d_negative = cdist(positive, negative, metric=metric)
    
#     print(np.mean(d_positive), np.mean(d_negative))
    
    d_positive = np.reshape(d_positive, (-1,))
    d_positive = d_positive[d_positive >= 1e-9]
    d_negative = np.reshape(d_negative, (-1,))
    
    if len(d_positive) == 0:
        print("FAIL ON GET POSITIVE, IGNORE")
        continue
    
#     plt.boxplot([d_positive, d_negative], whis=5)
#     plt.legend(['d_positive', 'd_negative'])
#     plt.show()

    fn = np.sum(d_negative < threshold) / len(d_negative) 
    fp = np.sum(d_positive > threshold) / len(d_positive)

array([[0.00000000e+00, 2.23663582e-01, 2.20553953e-01, 2.02893781e-01,
        2.04850990e-01, 2.42568363e-01, 2.38324650e-01, 1.90989351e-01,
        1.53123018e-01, 2.07280049e-01],
       [2.23663582e-01, 0.00000000e+00, 2.24364437e-01, 1.82424112e-01,
        1.25775570e-01, 1.56188321e-01, 1.70716316e-01, 1.02763213e-01,
        1.41210778e-01, 2.07788351e-01],
       [2.20553953e-01, 2.24364437e-01, 0.00000000e+00, 2.33657251e-01,
        1.82594904e-01, 2.35101321e-01, 2.13012264e-01, 2.49808388e-01,
        2.58889917e-01, 2.50552177e-01],
       [2.02893781e-01, 1.82424112e-01, 2.33657251e-01, 1.11022302e-16,
        1.00363893e-01, 1.21171410e-01, 1.43994653e-01, 2.38471148e-01,
        2.49096087e-01, 2.06861008e-01],
       [2.04850990e-01, 1.25775570e-01, 1.82594904e-01, 1.00363893e-01,
        0.00000000e+00, 1.49424752e-01, 1.51760460e-01, 2.65993964e-01,
        2.60236064e-01, 1.63866012e-01],
       [2.42568363e-01, 1.56188321e-01, 2.35101321e-01, 1.21171410e-01,
   