In [9]:
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from tqdm.notebook import tqdm
import os
import numpy as np
import glob

In [10]:
decode = lambda file_id: {
    'user_id': int(file_id[:3]),
    'session_no': int(file_id[3]),
    'keyboard_code': int(file_id[4]),
    'task_no': int(file_id[5]),
}

class Digraph:
    x = 0
    y = 1
    h1 = 2
    h2 = 3
    pp = 4
    rp = 5

In [11]:
TRAIN, VAL, TEST = 0.75, 0.15, 0.10
try:
    split_user_digraphs = np.load('digraphs.npy', allow_pickle=True)[()]
    print('Loaded split digraphs from file')
except FileNotFoundError:
    user_digraphs = {}
    combined_files = glob.glob('digraphs/*/*/*.txt')
    users = max(decode(os.path.basename(filename))['user_id'] for filename in combined_files) + 1
    for user in tqdm(range(users), total=users, desc='Loading digraphs', unit='user'):
        files = [filename for filename in combined_files if decode(os.path.basename(filename))['user_id'] == user]
        user_digraphs[user] = {}
        for filename in files:
            with open(filename) as file:
                lines = file.readlines()
                for line in lines:
                    key1, key2, x, y, h1, h2, pp, rp = line.split()
                    digraph = [int(x), int(y), int(h1), int(h2), int(pp), int(rp)]
                    if (key1, key2) not in user_digraphs[user]:
                        user_digraphs[user][(key1, key2)] = []
                    user_digraphs[user][(key1, key2)].append(digraph)

    split_user_digraphs = {
        'train': {},
        'val': {},
        'test': {},
        'splits': {
            'train': TRAIN,
            'val': VAL,
            'test': TEST,
        }
    }
    for user in tqdm(range(users), total=users, desc='Splitting digraphs', unit='user'):
        for key_pair in user_digraphs[user]:
            user_digraphs[user][key_pair] = np.array(user_digraphs[user][key_pair])
            try:
                train, val_test = train_test_split(user_digraphs[user][key_pair], train_size=TRAIN, test_size=VAL+TEST, shuffle=True)
                val, test = train_test_split(val_test, train_size=VAL/(VAL+TEST), test_size=TEST/(VAL+TEST), shuffle=True)
                if user not in split_user_digraphs['train']:
                    split_user_digraphs['train'][user] = {}
                    split_user_digraphs['val'][user] = {}
                    split_user_digraphs['test'][user] = {}
                split_user_digraphs['train'][user][key_pair] = train
                split_user_digraphs['val'][user][key_pair] = val
                split_user_digraphs['test'][user][key_pair] = test
            except ValueError:
                # Not enough data to split
                pass
    np.save('digraphs.npy', split_user_digraphs)

Loading digraphs:   0%|          | 0/149 [00:00<?, ?user/s]

Splitting digraphs:   0%|          | 0/149 [00:00<?, ?user/s]

In [12]:
MIN_SAMPLES = 50
N_COMPONENTS = 2
try:
    user_models = np.load('user_models_pp.npy', allow_pickle=True)[()]
    print('Loaded models from file')
except FileNotFoundError:
    user_models = {}
    for user in tqdm(split_user_digraphs['train'], desc='Fitting models', unit='user'):
        user_models[user] = {}
        for (key_pair, digraphs) in split_user_digraphs['train'][user].items():
            if len(digraphs) < MIN_SAMPLES:
                continue
            model = GaussianMixture(n_components=N_COMPONENTS)
            # model.fit(user_digraphs[user][digraph][:, [Digraph.h1, Digraph.h2, Digraph.pp, Digraph.rp]])
            model.fit(digraphs[:, [Digraph.pp]])
            user_models[user][key_pair] = model
    np.save('user_models_pp.npy', user_models)

Fitting models:   0%|          | 0/148 [00:00<?, ?user/s]

In [13]:
def similarity_score(user_digraphs, profile_model, z_threshold=1):
    passed_digraphs = 0
    total_digraphs = 0
    for (key_pair, digraphs) in user_digraphs.items():
        if key_pair not in profile_model:
            continue
        model = profile_model[key_pair]
        for digraph in digraphs:
            total_digraphs += 1
            for i in range(model.n_components):
                weight, mean, covariance = model.weights_[i], model.means_[i], model.covariances_[i]
                z_score = (digraph[Digraph.pp] - mean) / np.sqrt(covariance)
                if abs(z_score) <= z_threshold:
                    passed_digraphs += weight
                    break
    return passed_digraphs / total_digraphs

In [14]:
def classify(user_digraphs, profile_model, pass_threshold=0.60):
    score = similarity_score(user_digraphs, profile_model)
    return score >= pass_threshold, score-pass_threshold

In [15]:
errors = 0
net_error_distance = 0
net_correct_distance = 0
total = 0
for user in tqdm(split_user_digraphs['test'], desc='Testing users against selves', unit='user'):
    profile_model = user_models[user]
    same_user, distance_from_threshold = classify(split_user_digraphs['test'][user], profile_model)
    if same_user == False:
        errors += 1
        net_error_distance += abs(distance_from_threshold)
    else:
        net_correct_distance += abs(distance_from_threshold)
    total += 1
print(f"FRR = {errors*100/total:2.3f}%")
print(f"Average distance from threshold for errors = {net_error_distance*100/errors:2.3f}%")
print(f"Average distance from threshold for correct identification = {net_correct_distance*100/(total-errors):2.3f}%")
print(f"Average distance from threshold = {(net_error_distance+net_correct_distance)*100/total:2.3f}%")

Testing users against selves:   0%|          | 0/148 [00:00<?, ?user/s]

FRR = 28.378%
Average distance from threshold for errors = 2.385%
Average distance from threshold for correct identification = 3.544%
Average distance from threshold = 3.215%


In [16]:
errors = 0
net_error_distance = 0
net_correct_distance = 0
total = 0
for user in tqdm(split_user_digraphs['test'], desc='Testing users against others', unit='user'):
    profile_model = user_models[user]
    for other_user in split_user_digraphs['test']:
        if other_user == user:
            continue
        same_user, distance_from_threshold = classify(split_user_digraphs['test'][other_user], profile_model)
        if same_user == True:
            errors += 1
            net_error_distance += abs(distance_from_threshold)
        else:
            net_correct_distance += abs(distance_from_threshold)
        total += 1
print(f"FAR = {errors*100/total:2.3f}%")
print(f"Average distance from threshold for errors = {net_error_distance*100/errors:2.3f}%")
print(f"Average distance from threshold for correct identification = {net_correct_distance*100/(total-errors):2.3f}%")
print(f"Average distance from threshold = {(net_error_distance+net_correct_distance)*100/total:2.3f}%")

Testing users against others:   0%|          | 0/148 [00:00<?, ?user/s]

FAR = 4.201%
Average distance from threshold for errors = 2.269%
Average distance from threshold for correct identification = 12.287%
Average distance from threshold = 11.867%
