In [1]:
import os

# Root folder of the raw keystroke dump. The default is the original author's
# local path; set the KEYSTROKE_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
data_dir = os.environ.get(
    "KEYSTROKE_DATA_DIR",
    "/home/hoang/Downloads/data/output_numpy/passwords/",
)

In [2]:
# Path of the JSON dump used by the rest of the notebook.
json_file = "all_user_passwords.json"
# Opening in write mode truncates the file, so it starts out empty.
with open(json_file, mode='w') as f:
    pass

In [4]:
import os
from tqdm import tqdm
import json
import tensorflow as tf

In [5]:
def _read_key_events(path, event_type):
    """Parse one raw key log into a list of event dicts.

    Every non-blank line is split on whitespace; the first field is read as the
    integer keycode and the second as the integer timestamp. ``event_type`` is
    stored verbatim under the ``'type'`` key ('down' for presses, 'up' for
    releases).
    """
    events = []
    with open(path) as fevents:
        for row in fevents:
            fields = row.split()
            if not fields:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            events.append({'type': event_type,
                           'keycode': int(fields[0]),
                           'time': int(fields[1])})
    return events


def _load_sessions(sessions_dir):
    """Load every typing session stored below ``sessions_dir``.

    Returns ``(sessions, password)`` where ``sessions`` is a list with one
    time-ordered event list per timestamp sub-directory and ``password`` is the
    content of the last ``password.txt`` that was read (``None`` when the
    directory is missing or holds no session).
    """
    sessions = []
    password = None
    if not os.path.isdir(sessions_dir):
        return sessions, password

    # os.listdir() order is arbitrary; sorting makes the dump reproducible.
    for name in sorted(os.listdir(sessions_dir)):
        timestamp_dir = os.path.join(sessions_dir, name)
        if not os.path.isdir(timestamp_dir):
            continue

        with open(os.path.join(timestamp_dir, 'password.txt')) as fpassword:
            password = fpassword.read()

        events = (
            _read_key_events(os.path.join(timestamp_dir, 'p_raw_press.txt'), 'down')
            + _read_key_events(os.path.join(timestamp_dir, 'p_raw_release.txt'), 'up')
        )
        # sorted() is stable, so a press stays ahead of a release with the same time.
        sessions.append(sorted(events, key=lambda event: event['time']))

    return sessions, password


# The output is opened once in write mode so that re-running this cell rebuilds
# the dump from scratch instead of appending duplicate records to it.
with open(json_file, 'w') as fout:
    for user_name in tqdm(sorted(os.listdir(data_dir))):
        user_dir = os.path.join(data_dir, user_name)
        if not os.path.isdir(user_dir):
            continue

        genuine, _ = _load_sessions(os.path.join(user_dir, 'genuine'))
        impostor, password = _load_sessions(os.path.join(user_dir, 'impostor'))

        # Only users with both genuine and impostor sessions are exported.
        if not genuine or not impostor:
            continue

        # The password is taken from this user's own sessions, so it can no
        # longer leak over from the previously processed user.
        data = {'genuine': genuine,
                'impostor': impostor,
                'text': password,
                'user_id': os.path.basename(user_dir)}
        fout.write("{}\n".format(json.dumps(data)))

100%|█████████████████████████████████████████| 118/118 [00:50<00:00,  2.36it/s]


In [21]:
import json

import numpy as np


class Feature:
    """Base class turning a raw key-event sequence into model input.

    Subclasses implement ``extract_key`` (features for one window of events),
    ``agg_feature`` (accumulate the per-window features) and
    ``input_from_feature`` (convert the accumulated features to an array).
    """

    def __init__(
            self,
            vocab_path="../configs/vocab.json",
            feature_type_path="../configs/feature_type.json"
    ):
        """Load the two JSON configuration files.

        ``self.vocab`` receives the content of ``vocab_path`` and
        ``self.feature_vocab`` the content of ``feature_type_path``.
        """
        with open(vocab_path) as f:
            self.vocab = json.load(f)

        with open(feature_type_path) as f:
            self.feature_vocab = json.load(f)

    def input_from_raw(self, raw_seq):
        """Run ``extract`` followed by ``input_from_feature`` on ``raw_seq``."""
        # extract() returns (features, duration); only the features are
        # forwarded, not the whole tuple.
        features, _duration = self.extract(raw_seq)

        return self.input_from_feature(features)

    def input_from_feature(self, features):
        """Convert aggregated features to the model input (subclass hook)."""
        raise NotImplementedError

    def extract_key(self, sub_seq):
        """Extract the features of the window ``sub_seq`` (subclass hook)."""
        raise NotImplementedError

    def agg_feature(self, feature, features=None):
        """Merge ``feature`` into the accumulator ``features`` (subclass hook)."""
        raise NotImplementedError

    def extract(self, raw_seq):
        """Extract and aggregate features over every suffix of ``raw_seq``.

        Returns ``(features, duration)`` where ``duration`` is the time
        difference between the last and the first cleaned event. When no event
        survives cleaning, the subclass' empty accumulator and a duration of 0
        are returned instead of raising IndexError.
        """
        raw_seq = Feature.clean_raw_seq(raw_seq)
        if not raw_seq:
            # Aggregating an empty feature yields the empty container type the
            # subclass works with (dict or list).
            return self.agg_feature({}, None), 0

        duration = raw_seq[-1]['time'] - raw_seq[0]['time']

        features = None
        for i in range(len(raw_seq)):
            features = self.agg_feature(self.extract_key(raw_seq[i:]), features)

        return features, duration

    @staticmethod
    def clean_raw_seq(data):
        """Return the events sorted by ``"time"`` without those lacking ``"keycode"``.

        The input list is not modified.
        """
        ordered = sorted(data, key=lambda x: x["time"])
        return [event for event in ordered if "keycode" in event]


class MatrixFeature(Feature):
    """Per key-pair timing features laid out as a (source, target, type) tensor."""

    def input_from_feature(self, features):
        """Build the feature tensor from the aggregated timing lists.

        ``features`` maps keys of the form ``"<source>_<target>_<TYPE>"`` or
        ``"<source>_Hold"`` to lists of timing values. For each key, the values
        strictly between 0 and ``self.feature_vocab[TYPE]["max"]`` are averaged
        and stored at ``[vocab[source], vocab[target], feature_vocab[TYPE]["index"]]``
        (Hold uses the source key as target). Keys whose source or target is
        not in ``self.vocab`` are skipped. The result is divided by 1000.
        """
        n_features = 5  # DD, DU, UD, UU and Hold
        n_keycodes = len(self.vocab)
        feature_matrix = np.full((n_keycodes, n_keycodes, n_features), 0, dtype=np.float32)

        for key, value in features.items():
            key_items = key.split('_')
            source_key = key_items[0]
            feature_type = key_items[-1]
            target_key = source_key if feature_type == "Hold" else key_items[1]

            if source_key not in self.vocab or target_key not in self.vocab:
                continue

            spec = self.feature_vocab[feature_type]
            kept = [item for item in value if 0 < item < spec["max"]]
            if not kept:
                continue

            feature_matrix[
                self.vocab[source_key], self.vocab[target_key], spec["index"]
            ] = np.mean(kept)

        return feature_matrix / 1000.

    def extract_key(self, sub_seq):
        """Extract the timings of the first key pair found in ``sub_seq``.

        The first two "down" events are the source and target presses; each is
        matched with the first following "up" event carrying the same keycode.
        Returns an empty dict when the pair is incomplete, otherwise a dict with
        the DD, DU, UD, UU and Hold timings keyed by the keycodes involved.
        """
        source_down = source_up = target_down = target_up = None

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if source_down is None:
                    source_down = step
                elif target_down is None:
                    target_down = step
            elif step["type"] == "up":
                if step_idx == 0:
                    # A window opening on a release would be scanned exactly
                    # like the next window (the leading release is ignored),
                    # so the same pair would be aggregated more than once.
                    return {}

                if (source_up is None and source_down is not None
                        and step["keycode"] == source_down["keycode"]):
                    source_up = step
                elif (target_up is None and target_down is not None
                        and step["keycode"] == target_down["keycode"]):
                    target_up = step

            # Stop as soon as the pair is complete; later events cannot change it.
            if None not in (source_down, source_up, target_down, target_up):
                break

        if None in (source_down, source_up, target_down, target_up):
            return {}

        source_code = source_down["keycode"]
        target_code = target_down["keycode"]

        return {
            "{}_{}_DD".format(source_code, target_code): target_down["time"] - source_down["time"],
            "{}_{}_DU".format(source_code, target_up["keycode"]): target_up["time"] - source_down["time"],
            "{}_{}_UD".format(source_up["keycode"], target_code): target_down["time"] - source_up["time"],
            "{}_{}_UU".format(source_up["keycode"], target_up["keycode"]): target_up["time"] - source_up["time"],
            "{}_Hold".format(source_code): source_up["time"] - source_down["time"],
        }

    def agg_feature(self, feature, features=None):
        """Append every value of ``feature`` to the list stored under its key.

        A new dict is started when ``features`` is None or empty; the
        accumulator is returned.
        """
        if not features:
            features = dict()

        for key, value in feature.items():
            features.setdefault(key, []).append(value)

        return features


class StatsFeature(Feature):
    """Per source-key mean and standard deviation of each timing feature."""

    def input_from_feature(self, features):
        """Build the ``(n_keycodes, 2 * n_features)`` statistics matrix.

        ``features`` maps keys of the form ``"<source>_<target>_<TYPE>"`` or
        ``"<source>_Hold"`` to lists of timing values. Values strictly between 0
        and ``self.feature_vocab[TYPE]["max"]`` are pooled per
        ``(source, TYPE)`` across all targets before the mean and standard
        deviation are computed; previously each key overwrote the cell of its
        source, so only the last iterated target survived. Sources missing
        from ``self.vocab`` are skipped. Means and deviations are divided by
        1000 and concatenated along the last axis.
        """
        n_features = 5  # DD, DU, UD, UU and Hold
        n_keycodes = len(self.vocab)
        feature_mean = np.full((n_keycodes, n_features), 0, dtype=np.float32)
        feature_std = np.full((n_keycodes, n_features), 0, dtype=np.float32)

        pooled = {}
        for key, value in features.items():
            key_items = key.split('_')
            source_key = key_items[0]
            feature_type = key_items[-1]

            if source_key not in self.vocab:
                continue

            spec = self.feature_vocab[feature_type]
            kept = [item for item in value if 0 < item < spec["max"]]
            if kept:
                pooled.setdefault((self.vocab[source_key], spec["index"]), []).extend(kept)

        for (row, col), samples in pooled.items():
            feature_mean[row, col] = np.mean(samples)
            feature_std[row, col] = np.std(samples)

        return np.concatenate([feature_mean / 1000., feature_std / 1000.], axis=-1)

    def extract_key(self, sub_seq):
        """Extract the timings of the first key pair found in ``sub_seq``.

        The first two "down" events are the source and target presses; each is
        matched with the first following "up" event carrying the same keycode.
        Returns an empty dict when the pair is incomplete, otherwise a dict with
        the DD, DU, UD, UU and Hold timings keyed by the keycodes involved.
        """
        source_down = source_up = target_down = target_up = None

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if source_down is None:
                    source_down = step
                elif target_down is None:
                    target_down = step
            elif step["type"] == "up":
                if step_idx == 0:
                    # A window opening on a release would be scanned exactly
                    # like the next window (the leading release is ignored),
                    # so the same pair would be aggregated more than once.
                    return {}

                if (source_up is None and source_down is not None
                        and step["keycode"] == source_down["keycode"]):
                    source_up = step
                elif (target_up is None and target_down is not None
                        and step["keycode"] == target_down["keycode"]):
                    target_up = step

            # Stop as soon as the pair is complete; later events cannot change it.
            if None not in (source_down, source_up, target_down, target_up):
                break

        if None in (source_down, source_up, target_down, target_up):
            return {}

        source_code = source_down["keycode"]
        target_code = target_down["keycode"]

        return {
            "{}_{}_DD".format(source_code, target_code): target_down["time"] - source_down["time"],
            "{}_{}_DU".format(source_code, target_up["keycode"]): target_up["time"] - source_down["time"],
            "{}_{}_UD".format(source_up["keycode"], target_code): target_down["time"] - source_up["time"],
            "{}_{}_UU".format(source_up["keycode"], target_up["keycode"]): target_up["time"] - source_up["time"],
            "{}_Hold".format(source_code): source_up["time"] - source_down["time"],
        }

    def agg_feature(self, feature, features=None):
        """Append every value of ``feature`` to the list stored under its key.

        A new dict is started when ``features`` is None or empty; the
        accumulator is returned.
        """
        if not features:
            features = dict()

        for key, value in feature.items():
            features.setdefault(key, []).append(value)

        return features


class AnonymousSeqFeature(Feature):
    """Keycode-agnostic sequence features: one timing vector per key pair."""

    def input_from_feature(self, features, duration=None, norm='none'):
        """Turn the list of per-pair timing dicts into a 2-D array.

        Parameters
        ----------
        features : list of dict
            One dict per key pair with the keys "DD", "DU", "UD", "UU", "Hold".
        duration : optional
            Accepted for call compatibility; not used by the computation.
        norm : {'none', 'max', 'min_max'}
            'none' divides by 1000, 'max' divides by the largest value and
            'min_max' rescales with the smallest and largest value. The
            defaults let the inherited ``input_from_raw`` call this method
            with the features alone.

        Returns
        -------
        numpy.ndarray
            One row per pair, columns ordered by ``feature_vocab[...]["index"]``,
            values clipped to [-1500, 1500] before normalisation. An empty
            array is returned when ``features`` is empty.

        Raises
        ------
        ValueError
            If ``norm`` is not one of the supported modes.

        Notes
        -----
        'max' and 'min_max' can still produce NaN/inf when their denominator is
        zero; the clipped values are printed in that case and the caller is
        expected to discard the sample.
        """
        if norm not in ('max', 'min_max', 'none'):
            raise ValueError("Must norm")

        steps = []
        for feature in features:
            step = [None for _ in feature]
            for name in ("DD", "DU", "UD", "UU", "Hold"):
                step[self.feature_vocab[name]["index"]] = feature[name]
            steps.append(step)

        steps = np.array(steps)
        steps = np.clip(steps, -1500, 1500)

        if steps.size == 0:
            # np.max/np.min raise on empty input; there is nothing to scale.
            return steps / 1000.

        if norm == 'max':
            res = steps / np.max(steps)
        elif norm == 'min_max':
            lowest = np.min(steps)
            res = (steps - lowest) / (np.max(steps) - lowest)
        else:
            res = steps / 1000.

        if np.isnan(res).any():
            print(steps)
        return res

    def extract_key(self, sub_seq):
        """Extract the timings of the key pair starting at ``sub_seq[0]``.

        Returns an empty dict when the window opens on an "up" event or when
        the pair is incomplete. Otherwise the first two "down" events are the
        source and target presses, each matched with the first following "up"
        event of the same keycode, and the DD, DU, UD, UU and Hold time
        differences are returned.
        """
        source_down = source_up = target_down = target_up = None

        for step_idx, step in enumerate(sub_seq):
            if step["type"] == "down":
                if source_down is None:
                    source_down = step
                elif target_down is None:
                    target_down = step
            elif step["type"] == "up":
                if step_idx == 0:
                    return {}

                if (source_up is None and source_down is not None
                        and step["keycode"] == source_down["keycode"]):
                    source_up = step
                elif (target_up is None and target_down is not None
                        and step["keycode"] == target_down["keycode"]):
                    target_up = step

            # Stop as soon as the pair is complete; later events cannot change it.
            if None not in (source_down, source_up, target_down, target_up):
                break

        if None in (source_down, source_up, target_down, target_up):
            return {}

        return {
            "DD": target_down["time"] - source_down["time"],
            "DU": target_up["time"] - source_down["time"],
            "UD": target_down["time"] - source_up["time"],
            "UU": target_up["time"] - source_up["time"],
            "Hold": source_up["time"] - source_down["time"],
        }

    def agg_feature(self, feature, features=None):
        """Append a non-empty ``feature`` dict to the list ``features``.

        A new list is started when ``features`` is None or empty; the
        accumulator is returned.
        """
        if not features:
            features = list()

        if feature:
            features.append(feature)

        return features

In [22]:
# Shared extractor instance used by the dataset-building cells below; it is
# constructed with the default config paths declared on Feature.__init__.
feature_extractor = AnonymousSeqFeature()

In [23]:
import os


def save_tfrecord(data, label, filepath):
    """Write ``(data[i], label[i])`` pairs to a TFRecord file.

    Each example stores ``data[i]`` as the raw bytes of its float32
    representation under "data" and the integer label(s) under "label". The
    array shape is not stored, so the reader must know it.

    Parameters
    ----------
    data : sequence of array-like
        Samples to serialise; ``len(data)`` examples are written.
    label : sequence
        Labels aligned with ``data``. Each entry may be a scalar or an array
        (for example shape ``(1,)``); it is flattened to a list of int64.
    filepath : str
        Destination file.
    """
    with tf.io.TFRecordWriter(filepath) as writer:
        for i in range(len(data)):
            payload = np.asarray(data[i]).astype(np.float32).tobytes()
            # Flatten explicitly instead of relying on the implicit conversion
            # of a 1-element array to a Python int.
            label_values = np.asarray(label[i]).astype(np.int64).ravel().tolist()

            features = tf.train.Features(
                feature={
                    "data": tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label_values))
                }
            )
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())


def load_tfrecord(dirname):
    """Open every file in ``dirname`` as one ``tf.data.TFRecordDataset``.

    Entries are taken in sorted name order. Sub-directories (for example
    Jupyter's ``.ipynb_checkpoints``) are skipped because they cannot be read
    as record files. The number of files picked up is printed.
    """
    candidates = (os.path.join(dirname, name) for name in sorted(os.listdir(dirname)))
    filenames = [path for path in candidates if os.path.isfile(path)]
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=4)
    # The count is the number of files, not the number of serialized records.
    print("Load dataset contains {} files".format(len(filenames)))
    return dataset

In [29]:
# Training split: the first 101 records (idx 0..100) of the JSON dump.
out_dir = "train"
# Create the output directory up front so writing does not depend on it
# already existing.
os.makedirs(out_dir, exist_ok=True)

with open(json_file) as f:
    for idx, line in enumerate(f):
        if idx > 100:
            break
        line = json.loads(line)
        text = line['text']

        # Keep only attempts whose event count is within +/-20% of two events
        # (one press, one release) per character of the stored text.
        min_events = len(text) * 2 * (1 - 0.2)
        max_events = len(text) * 2 * (1 + 0.2)

        X = []
        Y = []
        # Genuine attempts are labelled 1, impostor attempts 0.
        for field, label in (('genuine', 1), ('impostor', 0)):
            for raw in line[field]:
                if not min_events <= len(raw) <= max_events:
                    continue
                features, duration = feature_extractor.extract(raw)
                x = feature_extractor.input_from_feature(features, duration, 'none')
                # Both conditions must hold: with `or`, NaN samples were kept
                # and a None result crashed inside np.isnan.
                if x is not None and not np.isnan(x).any():
                    X.append(x)
                    Y.append(label)

        # Pad or truncate every sample to 70 steps.
        X = tf.keras.preprocessing.sequence.pad_sequences(X,
                                                          padding="post",
                                                          truncating="post",
                                                          value=0,
                                                          maxlen=70,
                                                          dtype="float")
        X = np.array(X)
        Y = np.array(Y)

        # At most 100 samples per class to bound the number of pairs.
        X_pos = X[Y == 1][:100]
        X_neg = X[Y == 0][:100]
        if not len(X_pos) or not len(X_neg):
            continue

        X_pairs = []
        Y_pairs = []
        # Every unordered genuine/genuine pair is a positive pair.
        for i in range(len(X_pos)):
            anchor = X_pos[i]
            for positive in X_pos[i+1:]:
                X_pairs.append([anchor, positive])
                Y_pairs.append([1])

        # Every genuine/impostor combination is a negative pair.
        for anchor in X_pos:
            for negative in X_neg:
                X_pairs.append([anchor, negative])
                Y_pairs.append([0])

        X_pairs = np.array(X_pairs)
        Y_pairs = np.array(Y_pairs)
        print(line["user_id"], X_pairs.shape, Y_pairs.shape)

        save_tfrecord(X_pairs, Y_pairs, os.path.join(out_dir, line["user_id"]+'.tfrecord'))

user_001 (14950, 2, 70, 5) (14950, 1)
user_002 (14950, 2, 70, 5) (14950, 1)
user_004 (14950, 2, 70, 5) (14950, 1)
user_005 (13005, 2, 70, 5) (13005, 1)
user_006 (14950, 2, 70, 5) (14950, 1)
user_007 (4780, 2, 70, 5) (4780, 1)
user_010 (219, 2, 70, 5) (219, 1)
user_011 (6831, 2, 70, 5) (6831, 1)
user_012 (1045, 2, 70, 5) (1045, 1)
user_013 (14950, 2, 70, 5) (14950, 1)
user_014 (721, 2, 70, 5) (721, 1)
user_015 (1045, 2, 70, 5) (1045, 1)
user_016 (6225, 2, 70, 5) (6225, 1)
user_017 (3306, 2, 70, 5) (3306, 1)
user_018 (100, 2, 70, 5) (100, 1)
user_019 (14950, 2, 70, 5) (14950, 1)
user_020 (14950, 2, 70, 5) (14950, 1)
user_021 (49, 2, 70, 5) (49, 1)
user_022 (13771, 2, 70, 5) (13771, 1)
user_023 (14950, 2, 70, 5) (14950, 1)
user_024 (14356, 2, 70, 5) (14356, 1)
user_025 (3435, 2, 70, 5) (3435, 1)
user_026 (14751, 2, 70, 5) (14751, 1)
user_027 (6804, 2, 70, 5) (6804, 1)
user_029 (2190, 2, 70, 5) (2190, 1)
user_030 (14950, 2, 70, 5) (14950, 1)
user_031 (12006, 2, 70, 5) (12006, 1)
user_032 (

In [28]:
# Dev split: every record after the first 101 (idx > 100) of the JSON dump.
out_dir = "dev"
# Create the output directory up front so writing does not depend on it
# already existing.
os.makedirs(out_dir, exist_ok=True)

with open(json_file) as f:
    for idx, line in enumerate(f):
        if idx <= 100:
            continue
        line = json.loads(line)
        text = line['text']

        # Keep only attempts whose event count is within +/-20% of two events
        # (one press, one release) per character of the stored text.
        min_events = len(text) * 2 * (1 - 0.2)
        max_events = len(text) * 2 * (1 + 0.2)

        X = []
        Y = []
        # Genuine attempts are labelled 1, impostor attempts 0.
        for field, label in (('genuine', 1), ('impostor', 0)):
            for raw in line[field]:
                if not min_events <= len(raw) <= max_events:
                    continue
                features, duration = feature_extractor.extract(raw)
                x = feature_extractor.input_from_feature(features, duration, 'none')
                # Both conditions must hold: with `or`, NaN samples were kept
                # and a None result crashed inside np.isnan.
                if x is not None and not np.isnan(x).any():
                    X.append(x)
                    Y.append(label)

        # Pad or truncate every sample to 70 steps.
        X = tf.keras.preprocessing.sequence.pad_sequences(X,
                                                          padding="post",
                                                          truncating="post",
                                                          value=0,
                                                          maxlen=70,
                                                          dtype="float")
        X = np.array(X)
        Y = np.array(Y)

        # At most 100 samples per class to bound the number of pairs.
        X_pos = X[Y == 1][:100]
        X_neg = X[Y == 0][:100]
        if not len(X_pos) or not len(X_neg):
            continue

        X_pairs = []
        Y_pairs = []
        # Every unordered genuine/genuine pair is a positive pair.
        for i in range(len(X_pos)):
            anchor = X_pos[i]
            for positive in X_pos[i+1:]:
                X_pairs.append([anchor, positive])
                Y_pairs.append([1])

        # Every genuine/impostor combination is a negative pair.
        for anchor in X_pos:
            for negative in X_neg:
                X_pairs.append([anchor, negative])
                Y_pairs.append([0])

        X_pairs = np.array(X_pairs)
        Y_pairs = np.array(Y_pairs)
        print(line["user_id"], X_pairs.shape, Y_pairs.shape)

        save_tfrecord(X_pairs, Y_pairs, os.path.join(out_dir, line["user_id"]+'.tfrecord'))

user_112 (4697, 2, 70, 5) (4697, 1)
user_113 (3990, 2, 70, 5) (3990, 1)
user_114 (4680, 2, 70, 5) (4680, 1)
user_116 (5214, 2, 70, 5) (5214, 1)
user_118 (7605, 2, 70, 5) (7605, 1)
user_119 (395, 2, 70, 5) (395, 1)
user_120 (175, 2, 70, 5) (175, 1)
user_121 (46, 2, 70, 5) (46, 1)
user_122 (445, 2, 70, 5) (445, 1)
user_123 (1992, 2, 70, 5) (1992, 1)
user_124 (95, 2, 70, 5) (95, 1)
