In [None]:
import json
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import tensorflow as tf
import os

In [None]:
# Enable on-demand GPU memory allocation instead of letting TensorFlow
# reserve all device memory up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Query the logical devices only AFTER every physical GPU has been
        # configured: listing logical devices initializes the runtime, and
        # any later set_memory_growth call would raise RuntimeError.
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
def index_2d(myList, v):
    """Find the first occurrence of ``v`` inside a list of rows.

    Rows are scanned in order; the first row for which ``v in row`` holds
    wins, and the column is ``row.index(v)``.

    Returns:
        A ``(row_index, column_index)`` tuple, or ``None`` when no row
        contains ``v``.
    """
    matches = (
        (row_no, row.index(v))
        for row_no, row in enumerate(myList)
        if v in row
    )
    # Only the first match is consumed; None signals "not found".
    return next(matches, None)

In [None]:
def parser(data, maxlen=100, min_len=50, max_time=1500, max_dist=16):
    """Convert one keystroke sequence into a fixed-size digraph feature matrix.

    Every pair of consecutive events (a "digraph") yields five values:

    1. Manhattan distance between the two keys on the ``KEYS`` grid,
       divided by ``max_dist``
    2. hold time of the first key  (release - press), divided by ``max_time``
    3. hold time of the second key (release - press), divided by ``max_time``
    4. press-to-press latency, divided by ``max_time``
    5. release-to-press latency, divided by ``max_time``

    A digraph is dropped when either key is not on the grid, or when the
    press-to-press latency is not below ``max_time`` or the absolute
    release-to-press latency is not below ``max_time``.

    Args:
        data: iterable of dicts with ``'keycode'``, ``'press_time'`` and
            ``'release_time'`` entries. Times are passed through ``int()``;
            presumably milliseconds (confirm against the data source).
        maxlen: number of rows in the returned matrix; extra digraphs are
            truncated, missing rows are zero-padded.
        min_len: minimum number of kept digraphs for the sequence to be usable.
        max_time: latency cut-off and normalisation constant for time features.
        max_dist: normalisation constant for the key distance.

    Returns:
        A ``float32`` array of shape ``(maxlen, 5)``, or an empty list ``[]``
        when fewer than ``min_len`` digraphs survive the filtering.

    Raises:
        KeyError / ValueError / TypeError: if an event lacks a field or a
            time value cannot be converted with ``int()``.
    """
    # Keyboard grid of key codes as strings; a key's position is its
    # (row, column) in this table. The "" cells appear to be layout
    # placeholders that shift the rows (assumption - confirm).
    KEYS = [["", "49", "50", "51", "52", "53", "54", "55", "56", "57", "48", "", "", "8"],
            ["", "81", "87", "69", "82", "84", "89", "85", "73", "79", "80"],
            ["", "65", "83", "68", "70", "71", "72", "74", "75", "76"],
            ["16", "90", "88", "67", "86", "66", "78", "77"],
            ["", "", "", "", "32", "32", "32", "32", "32"]]

    # One-off lookup table: key code -> first (row, col) where it occurs.
    # setdefault keeps the first occurrence for repeated entries
    # (e.g. "32" and the "" placeholders), matching a top-to-bottom,
    # left-to-right scan of the grid.
    key_position = {}
    for row_idx, row in enumerate(KEYS):
        for col_idx, code in enumerate(row):
            key_position.setdefault(code, (row_idx, col_idx))

    pressedKeys = [(str(d['keycode']), d['press_time'], d['release_time'])
                   for d in data]

    feature = []
    # Walk over consecutive event pairs (digraphs).
    for first, second in zip(pressedKeys, pressedKeys[1:]):
        key1, press1, release1 = first
        key2, press2, release2 = second

        ht1 = int(release1) - int(press1)
        ht2 = int(release2) - int(press2)

        ptp = int(press2) - int(press1)
        rtp = int(press2) - int(release1)

        pos1 = key_position.get(key1)
        pos2 = key_position.get(key2)
        if pos1 is None or pos2 is None:
            # At least one key is not on the grid: no distance available.
            continue

        if ptp < max_time and abs(rtp) < max_time:
            keyDistance = abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
            feature.append((keyDistance / max_dist,
                            ht1 / max_time,
                            ht2 / max_time,
                            ptp / max_time,
                            rtp / max_time))

    if len(feature) < min_len:
        return []

    # preprocessing: truncate to maxlen rows and zero-pad the remainder
    n_features = 5
    padding = np.full((maxlen, n_features), 0., dtype=np.float32)
    if feature:
        feature = np.array(feature)[: maxlen]
        padding[: len(feature), :] = feature

    return padding

In [None]:
def read_data(file_path, start_idx, end_idx):
    """Load a slice of a line-delimited JSON file.

    Each line of ``file_path`` is decoded with ``json.loads``. Only lines
    whose zero-based index lies in ``[start_idx, end_idx)`` are decoded and
    returned; reading stops as soon as ``end_idx`` is reached.

    Returns:
        List of decoded objects, in file order.
    """
    records = []
    with open(file_path) as handle:
        for line_no, raw_line in enumerate(handle):
            # Past the requested window: nothing more to collect.
            if line_no >= end_idx:
                break
            if line_no >= start_idx:
                records.append(json.loads(raw_line))
    return records

In [None]:
def split_subsequence(sequences, maxlen=100, overlap=0):
    """Merge several event sequences and cut the result into windows.

    All events from ``sequences`` are pooled, ordered by ``'press_time'``
    and sliced into windows of at most ``maxlen`` events. Consecutive
    windows start ``maxlen - overlap`` events apart. Windows holding fewer
    than 80% of ``maxlen`` events are discarded.

    Returns:
        List of windows, each a list of event dicts.
    """
    events = [event for seq in sequences for event in seq]
    events.sort(key=lambda event: event['press_time'])

    stride = maxlen - overlap
    min_events = 0.8 * maxlen

    windows = (events[start: start + maxlen]
               for start in range(0, len(events), stride))
    return [window for window in windows if len(window) >= min_events]

In [None]:
import tensorflow as tf

def save_tfrecord(data, label, filepath):
    """Write samples and their labels to a TFRecord file.

    Each record is a ``tf.train.Example`` with two features:

    * ``"data"``  - the raw bytes of the sample cast to ``float32``
    * ``"label"`` - the matching entry of ``label`` as an int64

    Args:
        data: sequence of NumPy arrays (one per sample).
        label: sequence of integer labels, indexable in parallel with ``data``.
        filepath: destination path of the TFRecord file.
    """
    with tf.io.TFRecordWriter(filepath) as writer:
        for i, sample in enumerate(data):
            # tobytes() replaces the deprecated ndarray.tostring(), which was
            # removed in NumPy 2.0; both return the same raw bytes.
            payload = sample.astype(np.float32).tobytes()
            features = tf.train.Features(
                feature={
                    "data": tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label[i]])),
                }
            )
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())

In [None]:
# Number of users handled by a single job.
batch_user = 1000

# Range of user indices to process: [start_user, end_user).
start_user = 0
end_user = 10000

# One (first_user, last_user_exclusive, scenario) tuple per job.
args = [
    (first_user, first_user + batch_user, "train")
    for first_user in range(start_user, end_user, batch_user)
]
print(len(args), args)

In [None]:
def gen(params, file_path="/home/anhtt_vcs/Public/keystrokes_feature/all_by_user.json"):
    """Build one TFRecord file of keystroke features for a range of users.

    Reads the users stored on lines ``[start_idx, end_idx)`` of ``file_path``,
    splits each user's keystrokes into windows, converts every window with
    ``parser`` and writes all usable windows to
    ``data/<scenario>/batch_<start_idx>_<end_idx>.tfrecord``. The label of a
    sample is the line index of its user in the input file.

    Args:
        params: ``(start_idx, end_idx, scenario)`` tuple.
        file_path: line-delimited JSON file holding one user per line, each
            with a ``'sequences'`` entry.
    """
    import os
    # Set before importing TensorFlow so the setting is in place when the
    # library loads; "-1" is meant to keep this worker off the GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    import tensorflow as tf  # noqa: F401  (kept so TensorFlow is loaded in the worker)

    start_idx, end_idx, scenario = params

    maxlen = 100
    overlap = 0

    data = read_data(file_path=file_path,
                     start_idx=start_idx,
                     end_idx=end_idx)

    X = []
    Y = []

    # Users are labelled by their position in the file, starting at start_idx.
    for label, user_data in enumerate(tqdm(data), start=start_idx):
        sequences = split_subsequence(user_data['sequences'], maxlen, overlap)

        for sequence in sequences:
            try:
                x = parser(sequence)
            except Exception as e:
                # Best effort: a malformed window is reported and skipped.
                print(e)
                continue
            # parser returns an empty result for windows that are too short.
            if not len(x):
                continue
            X.append(x)
            Y.append(label)

    X = np.array(X)
    Y = np.array(Y)

    print(X.shape, Y.shape)

    # exist_ok avoids a FileExistsError when several workers create the
    # same output directory at the same time.
    os.makedirs(os.path.join("data", scenario), exist_ok=True)

    save_tfrecord(X, Y, "data/{}/batch_{}_{}.tfrecord".format(scenario, start_idx, end_idx))

    return

In [None]:
# pool = Pool(len(args))
# pool.map(gen, args)

In [12]:
def parse_fn(example_proto):
    """Decode one serialized ``tf.train.Example`` into ``(data, label)``.

    The record must carry a scalar string feature ``"data"`` and a scalar
    int64 feature ``"label"``. The ``"data"`` bytes are decoded as float32
    values and reshaped to ``(70, 5)``.

    Returns:
        Tuple of the ``(70, 5)`` float32 tensor and the int64 label tensor.
    """
    feature_spec = {
        "data": tf.io.FixedLenFeature((), tf.string),
        "label": tf.io.FixedLenFeature((), tf.int64),
    }
    parsed = tf.io.parse_single_example(example_proto, feature_spec)

    flat_values = tf.io.decode_raw(parsed["data"], tf.float32)
    # NOTE(review): 70 rows are hard-coded here, while parser() in this
    # notebook pads to 100 rows - confirm the records being read were
    # written with 70-row samples.
    sample = tf.reshape(flat_values, shape=(70, 5))
    return sample, parsed["label"]

def load_tfrecord(filepath, batch_size=128, shuffle=True):
    """Open TFRecord file(s) as a dataset of decoded ``(data, label)`` pairs.

    Args:
        filepath: path (or paths) handed to ``tf.data.TFRecordDataset``.
        batch_size: currently NOT applied - elements are returned unbatched.
        shuffle: currently NOT applied - elements keep their file order.

    Returns:
        A ``tf.data.Dataset`` whose elements are the output of ``parse_fn``.

    NOTE(review): ``batch_size`` and ``shuffle`` are accepted for interface
    compatibility but ignored; wiring them in would change the element shape
    seen by existing callers, so that decision is left to the maintainer.
    """
    records = tf.data.TFRecordDataset(filepath, num_parallel_reads=4)
    return records.map(parse_fn, num_parallel_calls=4)

In [14]:
import tensorflow as tf
# Smoke test: open one TFRecord file and print its first decoded element.
dataset = load_tfrecord(filepath="/home/hoang/workspace/github/keystroke-dynamic-model/data/train/1.tfrecord")
for batch in dataset.take(1):
    x, y = batch
    print(x, y)

tf.Tensor(
[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0