In [1]:
%config Completer.use_jedi = False

In [2]:
import numpy as np
import os
import tensorflow as tf
import json
from tqdm import tqdm
import random
from scipy.spatial.distance import cdist
from multiprocessing import Pool

In [3]:
def index_2d(myList, v):
    for i, x in enumerate(myList):
        if v in x:
            return (i, x.index(v))
    return

In [4]:
def parser(data):
        
    KEYS = [["", "", "", "", "", "", "", "", "", "", "", "", "", "8", "8"],
            ["", "81", "87", "69", "82", "84", "89", "85", "73", "79", "80"],
            ["", "65", "83", "68", "70", "71", "72", "74", "75", "76"],
            ["16", "90", "88", "67", "86", "66", "78", "77", "", "", "", ""],
            ["", "", "", "", "32", "32", "32", "32", "32"]]

    KEYS_FLAT = ["81", "87", "69", "82", "84", "89", "85", "73", "79", "80", "65",
                 "83", "68", "70", "71", "72", "74", "75", "76", "90", "88", "67",
                 "86", "66", "78", "77", "32", "16", "8"]
    
    pressedKeys = []
    for d in data:
        pressedKeys.append((str(d['keycode']), d['press_time'], d['release_time']))

    feature = []
    min_time = 300
    max_time = 1500
    
    for i in range(len(pressedKeys)):
                
        if i == len(pressedKeys) - 1:
            break
        
        ht1 = int(pressedKeys[i][2]) - int(pressedKeys[i][1])
        if ht1 > min_time:
            ht1 = min_time

        ht2 = int(pressedKeys[i + 1][2]) - int(pressedKeys[i + 1][1])
        if ht2 > min_time:
            ht2 = min_time

        ptp = int(pressedKeys[i + 1][1]) - int(pressedKeys[i][1])
        if ptp > max_time:
            ptp = max_time

        rtp = int(pressedKeys[i + 1][1]) - int(pressedKeys[i][2])
        if rtp > max_time:
            rtp = max_time

        key1 = pressedKeys[i][0]
        key2 = pressedKeys[i + 1][0]

        d_key1 = index_2d(KEYS, key1)
        d_key2 = index_2d(KEYS, key2)
        
        if not d_key1 or not d_key2:
            continue
            
        keyDistance = np.sum(np.absolute(np.array(d_key1) - np.array(d_key2)))
        # keyDistance = np.array(index_2d(KEYS, key1)) - np.array(index_2d(KEYS, key2))
        feature.append((keyDistance / 15,
                        ht1 / max_time,
                        ht2 / max_time,
                        ptp / max_time,
                        rtp / max_time))
                
    # preprocessing
    n_features = 5
    maxlen = 100

    feature = np.array(feature)[ : maxlen]
    padding = np.full((maxlen, n_features), 0., dtype=np.float32)
    padding[ : len(feature), :] = feature

    return np.expand_dims(padding, 0)

In [5]:
def read_data(file_path, start_idx, end_idx):
    data = []
    with open(file_path) as f:
        for idx, line in enumerate(f):
            if start_idx <= idx < end_idx:
                line = json.loads(line)
                data.append(line)
            if idx >= end_idx:
                break
    return data

In [6]:
def split_subsequence(sequences, maxlen=100, overlap=0):
    
    flatten = []
    for seq in sequences:
        flatten.extend(seq)
    sequences = flatten
    sequences = sorted(sequences, key=lambda x: x['press_time'])
    
    subsequence = []
    for i in range(0, len(sequences), maxlen - overlap):
        sub = sequences[i: i + maxlen]
        if len(sub) >= 0.8 * maxlen:
            subsequence.append(sub)
    
    return subsequence

In [7]:
num_users = 10000
batch_users = 500

args = [(i, i + batch_users, "/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving") for i in range(0, num_users, batch_users)]
print(len(args), args)

20 [(0, 500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (500, 1000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (1000, 1500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (1500, 2000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (2000, 2500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (2500, 3000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (3000, 3500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (3500, 4000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (4000, 4500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (4500, 5000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (5000, 5500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (5500, 6000, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/base/serving'), (6000, 6500, '/home/anhtt_vcs/Public/hoangp46/typing-net/ckpt/bas

In [8]:
def evaluate(params):
    
    import tensorflow as tf
    import os
    os.environ["CUDA_VISIBLE_DEVICES"]="-1"
    
    start_idx, end_idx, model_path = params
        
    metric = 'cosine'
    threshold = 0.5
    negative_sample = 100
    maxlen = 100
    overlap = 0

    with tf.device('/cpu:0'):
        model = tf.keras.models.load_model(model_path)

    data = read_data(file_path="/home/anhtt_vcs/Public/keystrokes_feature/all_by_user.json",
                     start_idx=start_idx,
                     end_idx=end_idx)

    for i, user_data in tqdm(enumerate(data)):

        user_id = user_data['label']
        sequences = user_data['sequences']
        sequences = split_subsequence(sequences, maxlen)

        positive = []
        for sequence in sequences:
            x = parser(sequence)
            positive.append(x)
        positive = np.concatenate(positive)

        negative = []
        negative_user = data[:i] + data[i + 1:] # remove user i-th
        negative_user = random.sample(negative_user, negative_sample)
        for negative_data in negative_user:
            negative_id = negative_data['label']
            negative_sequences = negative_data['sequences']
            negative_sequences = split_subsequence(negative_sequences)
            for sequence in negative_sequences:
                x = parser(sequence)
                negative.append(x)
        negative = np.concatenate(negative)

        # get embedding
        positive = model(positive)
        negative = model(negative)

        positive_dist = cdist(positive, positive, metric).round(4)
        negative_dist = cdist(positive, negative, metric).round(4)

        positive_dist = np.reshape(positive_dist, (-1,))
        positive_dist = positive_dist[positive_dist >= 1e-6]
        negative_dist = np.reshape(negative_dist, (-1,))

        fn = np.sum(negative_dist < threshold) / len(negative_dist) 
        fp = np.sum(positive_dist > threshold) / len(positive_dist)

        log = {"id": user_id,
               "fp": float(np.mean(fp) * 100),
               "fn": float(np.mean(fn) * 100)}
        
        dist_log = {"id": user_id,
                    "d_positive": positive_dist.tolist(),
                    "d_negative": negative_dist.tolist()}

        with open("eval_{}_threshold_{}_negatives_{}_maxlen_{}.json".format(metric, threshold, negative_sample, maxlen), "a") as f:
            f.write(json.dumps(log)+"\n")
            
        with open("eval_{}_distance_negatives_{}_maxlen_{}.json".format(metric, negative_sample, maxlen), "a") as f:
            f.write(json.dumps(dist_log)+"\n")

    return

In [9]:
pool = Pool(len(args))
pool.map(evaluate, args)










0it [00:00, ?it/s]




21it [00:43,  2.07s/it]
44it [01:30,  2.05s/it]
139it [04:44,  2.05s/it]
277it [09:05,  1.97s/it]
377it [12:13,  1.82s/it]
500it [15:40,  1.88s/it]
500it [15:43,  1.89s/it]
500it [15:44,  1.89s/it]
500it [15:45,  1.89s/it]
500it [15:45,  1.89s/it]
500it [15:45,  1.89s/it]
500it [15:47,  1.89s/it]
500it [15:48,  1.90s/it]
500it [15:48,  1.90s/it]
500it [15:48,  1.90s/it]
500it [15:48,  1.90s/it]
500it [15:50,  1.90s/it]
500it [15:50,  1.90s/it]
500it [15:50,  1.90s/it]
500it [15:51,  1.90s/it]


ValueError: need at least one array to concatenate

In [11]:
import pandas as pd

with open("eval_cosine_threshold_0.5_negatives_100_maxlen_100.json") as f:
    df = [json.loads(line) for line in f]
    
df = pd.DataFrame(df)

In [16]:
df['fp'].mean(), df['fn'].mean()

(18.51297121370114, 20.088107744210422)

In [None]:
import pandas as pd

with open("eval_cosine_threshold_0.5_negatives_100_maxlen_100.json") as f:
    df = [json.loads(line) for line in f]
    
df = pd.DataFrame(df)