In [1]:
%config Completer.use_jedi = False

In [2]:
import json
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np
from collections import defaultdict

In [4]:
def index_2d(myList, v):
    """Find the position of ``v`` inside a list of lists.

    Rows are examined in order. The first row that contains ``v`` is used,
    together with the index of the first occurrence of ``v`` in that row.

    Args:
        myList: sequence of rows, each supporting ``in`` and ``.index``.
        v: value to look for.

    Returns:
        ``(row_index, column_index)``, or ``None`` when no row contains ``v``.
    """
    matches = (
        (row_idx, row.index(v))
        for row_idx, row in enumerate(myList)
        if v in row
    )
    # Only the first match is ever consumed; None signals "not found".
    return next(matches, None)

In [4]:
# Key grid used by ``parser``: a key's position is its (row, column) in this
# grid. Empty strings are placeholders that keep the columns aligned.
# NOTE(review): the first row contains only "8"; the codes "48"-"57" are absent
# from this grid, so every key pair involving them is skipped by ``parser``.
# Confirm this is intentional.
_PARSER_KEYS = [["", "", "", "", "", "", "", "", "", "", "", "", "", "8"],
                ["", "81", "87", "69", "82", "84", "89", "85", "73", "79", "80"],
                ["", "65", "83", "68", "70", "71", "72", "74", "75", "76"],
                ["16", "90", "88", "67", "86", "66", "78", "77"],
                ["", "", "", "", "32", "32", "32", "32", "32"]]


def _build_key_positions(layout):
    """Map every grid entry to the (row, column) of its first occurrence (row-major)."""
    positions = {}
    for row_idx, row in enumerate(layout):
        for col_idx, code in enumerate(row):
            # setdefault keeps the first occurrence, e.g. "32" -> (4, 4).
            positions.setdefault(code, (row_idx, col_idx))
    return positions


# Built once instead of scanning the grid for both keys of every key pair.
_PARSER_KEY_POSITIONS = _build_key_positions(_PARSER_KEYS)


def parser(data):
    """Aggregate the timings of consecutive key pairs into a fixed-size array.

    For every pair of consecutive events whose keys are both present in the
    key grid, five timing values are computed:

        0: release - press of the first key
        1: press of the second key   - press of the first key
        2: release of the second key - press of the first key
        3: release of the second key - release of the first key
        4: press of the second key   - release of the first key

    A pair is kept only if value 1 is below 1500 and the absolute value of
    value 4 is below 1500. Kept pairs are grouped by the absolute row and
    column difference between the two keys' grid positions.

    Args:
        data: iterable of dicts with the keys ``'keycode'``, ``'press_time'``
            and ``'release_time'``; the time values must be accepted by
            ``int()``. Presumably ordered by press time and expressed in
            milliseconds (verify against the caller).

    Returns:
        ``np.ndarray`` of dtype float32 and shape ``(17, 17, 10)``, indexed as
        ``[row_difference, column_difference, channel]``. Channels 0-4 hold the
        standard deviation and channels 5-9 the mean of the five timing values
        above, each divided by 1000. Cells without samples stay 0.
    """
    events = [(str(event['keycode']), event['press_time'], event['release_time'])
              for event in data]

    max_time = 1500

    # (row difference, column difference, channel) -> list of raw timing samples
    samples = defaultdict(list)
    mean_matrix = np.zeros((17, 17, 5), dtype=np.float32)
    std_matrix = np.zeros((17, 17, 5), dtype=np.float32)

    for (key1, press1, release1), (key2, press2, release2) in zip(events, events[1:]):
        press1, release1 = int(press1), int(release1)
        press2, release2 = int(press2), int(release2)

        ht1 = release1 - press1
        ptp = press2 - press1
        ptr = release2 - press1
        rtr = release2 - release1
        rtp = press2 - release1

        pos1 = _PARSER_KEY_POSITIONS.get(key1)
        pos2 = _PARSER_KEY_POSITIONS.get(key2)
        if pos1 is None or pos2 is None:
            # At least one key is not part of the grid: ignore this pair.
            continue

        if ptp < max_time and abs(rtp) < max_time:
            row = abs(pos1[0] - pos2[0])
            col = abs(pos1[1] - pos2[1])
            for channel, value in enumerate((ht1, ptp, ptr, rtr, rtp)):
                samples[(row, col, channel)].append(value)

    for (row, col, channel), values in samples.items():
        mean_matrix[row, col, channel] = np.mean(values) / 1000.
        std_matrix[row, col, channel] = np.std(values) / 1000.

    # Standard deviations first, means second, stacked along the channel axis.
    return np.concatenate([std_matrix, mean_matrix], axis=-1)

In [5]:
# Key grid: a key's position is its (row, column) in this list of rows.
# Empty strings are placeholders that keep the columns aligned.
KEYS = [
    ["", "49", "50", "51", "52", "53", "54", "55", "56", "57", "48", "", "", "8"],
    ["", "81", "87", "69", "82", "84", "89", "85", "73", "79", "80"],
    ["", "65", "83", "68", "70", "71", "72", "74", "75", "76"],
    ["16", "90", "88", "67", "86", "66", "78", "77"],
    ["", "", "", "", "32", "32", "32", "32", "32"],
]

# Every distinct keycode string of the grid, listed once.
KEYS_FLAT = [
    "49", "50", "51", "52", "53", "54", "55", "56", "57", "48", "8",
    "81", "87", "69", "82", "84", "89", "85", "73", "79", "80",
    "65", "83", "68", "70", "71", "72", "74", "75", "76",
    "16", "90", "88", "67", "86", "66", "78", "77", "32",
]

# Sum of absolute row and column differences for every ordered pair of keys
# in KEYS_FLAT (a key paired with itself included).
DIST = []

for s_key in KEYS_FLAT:
    ds = index_2d(KEYS, s_key)
    for t_key in KEYS_FLAT:
        dt = index_2d(KEYS, t_key)
        if ds and dt:
            d = np.abs(np.subtract(ds, dt)).sum()
            DIST.append(d)

In [6]:
from collections import Counter
# Count how many times each distance value occurs in DIST.
c = Counter(DIST)
# Display the number of distinct distance values together with the counts.
len(c), c

(17,
 Counter({0: 39,
          1: 120,
          2: 194,
          3: 226,
          4: 220,
          5: 192,
          6: 160,
          7: 126,
          8: 90,
          9: 64,
          12: 14,
          10: 38,
          11: 20,
          13: 10,
          14: 4,
          16: 2,
          15: 2}))

In [7]:
def read_data(file_path, start_idx, end_idx):
    """Load a slice of a JSON-lines file.

    Args:
        file_path: path of a text file holding one JSON document per line.
        start_idx: zero-based index of the first line to load (inclusive).
        end_idx: zero-based index at which loading stops (exclusive).

    Returns:
        List with the decoded JSON value of every line whose index lies in
        ``[start_idx, end_idx)``, in file order.
    """
    records = []
    with open(file_path) as handle:
        for line_no, raw_line in enumerate(handle):
            # Stop reading as soon as the requested window is exhausted.
            if line_no >= end_idx:
                break
            if line_no >= start_idx:
                records.append(json.loads(raw_line))
    return records

In [8]:
def split_subsequence(sequences, maxlen=100, overlap=0):
    """Merge several event sequences and cut the result into fixed-size windows.

    All events are pooled, ordered by their ``'press_time'`` value, and sliced
    into windows of at most ``maxlen`` events whose start positions are
    ``maxlen - overlap`` apart. Windows shorter than 80% of ``maxlen`` are
    dropped.

    Args:
        sequences: iterable of iterables of dicts carrying a ``'press_time'`` key.
        maxlen: maximum number of events per window.
        overlap: number of events shared by two consecutive windows.

    Returns:
        List of windows, each a list of event dicts.
    """
    events = [event for sequence in sequences for event in sequence]
    events.sort(key=lambda event: event['press_time'])

    stride = maxlen - overlap
    min_length = 0.8 * maxlen

    windows = (events[start:start + maxlen]
               for start in range(0, len(events), stride))
    return [window for window in windows if len(window) >= min_length]

In [9]:
import tensorflow as tf

def save_tfrecord(data, label, filepath):
    """Write samples and their labels to a TFRecord file.

    Each record is a ``tf.train.Example`` with two features:
      * ``"data"``  - the raw bytes of the sample converted to float32,
      * ``"label"`` - the matching entry of ``label`` as an int64 value.

    Args:
        data: indexable collection of NumPy arrays (anything with ``.astype``).
        label: indexable collection of integer labels, aligned with ``data``.
        filepath: destination path of the TFRecord file.
    """
    with tf.io.TFRecordWriter(filepath) as writer:
        for i in range(len(data)):
            # tobytes() yields the same bytes as the former tostring(), which
            # is deprecated and no longer available in NumPy 2.0.
            payload = data[i].astype(np.float32).tobytes()
            features = tf.train.Features(
                feature = {
                    "data":tf.train.Feature(bytes_list = tf.train.BytesList(value = [payload])),
                    "label":tf.train.Feature(int64_list = tf.train.Int64List(value = [label[i]]))
                }
            )
            example = tf.train.Example(features = features)
            writer.write(example.SerializeToString())
    return

In [10]:
# Number of file lines handled by one worker call.
batch_user = 5000

# Half-open range [start_user, end_user) of line indices to process.
start_user = 50000
end_user = 100000

# One (start_idx, end_idx, scenario) tuple per batch; each tuple is one argument for gen().
args = [(i, i + batch_user, "train") for i in range(start_user, end_user, batch_user)]
print(len(args), args)

10 [(50000, 55000, 'train'), (55000, 60000, 'train'), (60000, 65000, 'train'), (65000, 70000, 'train'), (70000, 75000, 'train'), (75000, 80000, 'train'), (80000, 85000, 'train'), (85000, 90000, 'train'), (90000, 95000, 'train'), (95000, 100000, 'train')]


In [11]:
def gen(params):
    """Build the feature arrays for one slice of users and save them as a TFRecord.

    Lines ``[start_idx, end_idx)`` of the input file are loaded. Each line's
    ``'sequences'`` entry is split into windows of up to 100 events, every
    window is turned into a feature array by ``parser``, and all arrays are
    written with ``save_tfrecord`` to
    ``data/matrix/<scenario>/batch_<start_idx>_<end_idx>.tfrecord``.

    The label of a sample is the index of the file line it came from
    (``start_idx`` for the first loaded line, then +1 per line).

    Args:
        params: tuple ``(start_idx, end_idx, scenario)``.

    Returns:
        None. The result is the file written to disk.
    """
    import os
    # Set before the TensorFlow import below; "-1" presumably keeps the worker
    # processes off the GPUs (TODO confirm).
    os.environ["CUDA_VISIBLE_DEVICES"]="-1"
    import tensorflow as tf

    start_idx, end_idx, scenario = params

    maxlen = 100
    overlap = 0

    data = read_data(file_path="/home/anhtt_vcs/Public/keystrokes_feature/all_by_user.json",
                     start_idx=start_idx,
                     end_idx=end_idx)

    X = []
    Y = []

    # The label advances for every loaded line, even if it yields no sample.
    for label, user_data in enumerate(tqdm(data), start=start_idx):

        sequences = split_subsequence(user_data['sequences'], maxlen, overlap)

        for sequence in sequences:
            try:
                x = parser(sequence)
                if not len(x):
                    continue
                X.append(x)
                Y.append(label)
            except Exception as e:
                # Best effort: report the failing window and carry on.
                print(e)

    X = np.array(X)
    Y = np.array(Y)

    print(X.shape, Y.shape)

    out_path = "data/matrix/{}/batch_{}_{}.tfrecord".format(scenario, start_idx, end_idx)
    # Create the directory the file is actually written to. exist_ok avoids a
    # failure when several workers try to create it at the same time.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    save_tfrecord(X, Y, out_path)

    return

In [12]:
# One worker process per batch. The context manager shuts the workers down once
# all batches are done, so no idle processes are left behind.
with Pool(len(args)) as pool:
    results = pool.map(gen, args)
results

100%|██████████| 5000/5000 [04:31<00:00, 18.41it/s]
100%|██████████| 5000/5000 [04:34<00:00, 18.20it/s]
100%|██████████| 5000/5000 [04:35<00:00, 18.18it/s]
100%|██████████| 5000/5000 [04:34<00:00, 18.23it/s]
100%|██████████| 5000/5000 [04:36<00:00, 18.08it/s]
100%|██████████| 5000/5000 [04:34<00:00, 18.21it/s]
100%|█████████▉| 4999/5000 [04:37<00:00, 17.89it/s]

(32519, 17, 17, 10) (32519,)


100%|██████████| 5000/5000 [04:37<00:00, 18.01it/s]
 99%|█████████▉| 4958/5000 [04:37<00:02, 19.20it/s]

(32494, 17, 17, 10) (32494,)


100%|█████████▉| 4976/5000 [04:38<00:01, 22.17it/s]

(32612, 17, 17, 10) (32612,)


100%|██████████| 5000/5000 [04:37<00:00, 18.03it/s]
 99%|█████████▊| 4931/5000 [04:38<00:04, 16.93it/s]

(32624, 17, 17, 10) (32624,)


100%|█████████▉| 4995/5000 [04:39<00:00, 17.27it/s]

(32492, 17, 17, 10) (32492,)


100%|█████████▉| 4998/5000 [04:39<00:00, 18.52it/s]

(32631, 17, 17, 10) (32631,)


100%|██████████| 5000/5000 [04:39<00:00, 17.86it/s]
 99%|█████████▉| 4958/5000 [04:39<00:01, 24.10it/s]

(32561, 17, 17, 10) (32561,)


100%|█████████▉| 4985/5000 [04:41<00:00, 23.25it/s]

(32506, 17, 17, 10) (32506,)


100%|██████████| 5000/5000 [04:41<00:00, 17.75it/s]


(32497, 17, 17, 10) (32497,)
(32424, 17, 17, 10) (32424,)


[None, None, None, None, None, None, None, None, None, None]

In [16]:
def parse_fn(example_proto):
    """Decode one serialized example into a feature tensor and its label.

    Args:
        example_proto: serialized example holding a string feature ``"data"``
            and an int64 feature ``"label"``.

    Returns:
        Tuple ``(data, label)`` where ``data`` is the ``"data"`` bytes decoded
        as float32 and reshaped to ``(17, 17, 10)``.
    """
    schema = {
        "data": tf.io.FixedLenFeature((), tf.string),
        "label": tf.io.FixedLenFeature((), tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    flat_values = tf.io.decode_raw(parsed["data"], tf.float32)
    return tf.reshape(flat_values, shape=(17, 17, 10)), parsed["label"]

def load_tfrecord(filepath, batch_size=128, shuffle=True):
    """Build a batched ``tf.data`` pipeline over one or more TFRecord files.

    Args:
        filepath: TFRecord path(s), passed to ``tf.data.TFRecordDataset``.
        batch_size: number of examples per batch.
        shuffle: whether to add a shuffle step (buffer size 100).

    Returns:
        Dataset of ``(data, label)`` batches decoded by ``parse_fn``, with
        prefetching enabled.
    """
    records = tf.data.TFRecordDataset(filepath, num_parallel_reads=4)
    batches = records.map(parse_fn, num_parallel_calls=4).batch(batch_size)
    # NOTE(review): shuffle runs after batch, so it reorders whole batches
    # rather than individual examples - confirm this is intended.
    pipeline = batches.shuffle(buffer_size=100) if shuffle else batches
    return pipeline.prefetch(tf.data.experimental.AUTOTUNE)

In [19]:
# Sanity check: load one TFRecord file without shuffling and print the feature
# tensor of the first batch only.
dataset = load_tfrecord(filepath="data/matrix/train/batch_0_5000.tfrecord", batch_size=512, shuffle=False)
for batch in dataset:
    x, y = batch
    print(x)
    # One batch is enough for inspection.
    break

tf.Tensor(
[[[[ 0.00377492  0.01233896  0.01569236 ...  0.2015      0.148
     0.086     ]
   [ 0.03394112  0.01155662  0.04902607 ...  0.26133335  0.12533334
     0.01633333]
   [ 0.03370697  0.30011436  0.30884066 ...  0.4958      0.4016
     0.3364    ]
   ...
   [ 0.          0.          0.         ...  0.          0.
     0.        ]
   [ 0.          0.          0.         ...  0.          0.
     0.        ]
   [ 0.          0.          0.         ...  0.          0.
     0.        ]]

  [[ 0.004       0.1045      0.124      ...  0.388       0.328
     0.2605    ]
   [ 0.0477445   0.08903226  0.0559594  ...  0.309       0.20509091
     0.10781818]
   [ 0.          0.          0.         ...  0.271       0.2
     0.169     ]
   ...
   [ 0.          0.          0.         ...  0.          0.
     0.        ]
   [ 0.          0.          0.         ...  0.          0.
     0.        ]
   [ 0.          0.          0.         ...  0.          0.
     0.        ]]

  [[ 0.          0. 