In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import string
import re
import json

### Check DATA

The metadata_participants.txt contains demographics and derived metrics for each participant.
The column names are described as follows:
<table>
    <tr><td>PARTICIPANT_ID</td><td>Unique ID of participant</td></tr>
    <tr><td>AGE</td><td>Participant's age</td></tr>
    <tr><td>GENDER</td><td>Participant's gender</td></tr>
    <tr><td>HAS_TAKEN_TYPING_COURSE</td><td>Whether the participant has taken a typing course (1) or not (0)</td></tr>
    <tr><td>COUNTRY</td><td>Country from which the participant has taken the study</td></tr>
    <tr><td>KEYBOARD_LAYOUT</td><td>QWERTY, AZERTY, QWERTZ or other layout</td></tr>
    <tr><td>NATIVE_LANGUAGE</td><td>Native language of participant</td></tr>
    <tr><td>FINGERS</td><td>choice between 1-2, 3-4, 5-6, 7-8 and 9-10 fingers used</td></tr>
    <tr><td>TIME_SPENT_TYPING</td><td>Number of hours spent typing per day</td></tr>
    <tr><td>KEYBOARD_TYPE</td><td>full (desktop), laptop, small (physical) or (on-screen) touch keyboard</td></tr>
    <tr><td>ERROR_RATE(%)</td><td>Average error rate</td></tr>
    <tr><td>AVG_WPM</td><td>Average words per minute</td></tr>
    <tr><td>AVG_IKI</td><td>Average inter-key interval</td></tr>
    <tr><td>ROLLOVER</td><td>Average rollover ratio</td></tr>
    <tr><td>KSPC</td><td>Average Keystrokes per Character</td></tr>
    <tr><td>AVG_KEYPRESS</td><td>Average keypress duration</td></tr>
</table>

Files named number_keystrokes.txt are the keystroke-by-keystroke logs for all test sentences transcribed by the participant ID = number.<br>
The column names are described as follows: <br>
    
 <table>
    <tr><td>PARTICIPANT_ID</td><td>Unique ID of participant</td></tr>
    <tr><td>TEST_SECTION_ID</td><td>Unique ID of sentence within a participant's test</td></tr>
    <tr><td>SENTENCE</td><td>Presented sentence</td></tr>
    <tr><td>USER_INPUT</td><td>Transcribed sentence</td></tr>
    <tr><td>KEYSTROKE_ID</td><td>Unique keystroke id (across all participants)</td></tr>
    <tr><td>PRESS_TIME</td><td>Timestamp when the key was pressed</td></tr>
    <tr><td>RELEASE_TIME</td><td>Timestamp when the key was released</td></tr>
    <tr><td>LETTER</td><td>String representation of the pressed key</td></tr>
    <tr><td>KEYCODE</td><td>JavaScript keycode of the pressed key</td></tr>
</table>

For some users, the typed letter was not logged correctly. Instead, the corresponding javascript keycode can be used. <br>


**Initial thoughts**: <br>
- calculate flight time between neighbouring symbols<br>
- find correct letter types (using keycodes)<br>
- remove small (physical) or (on-screen) touch keyboard<br>

In [80]:
# Prepare KEY_CODES dictionary

KEY_CODES = dict()

with open("key-codes.json", "rb") as f:
    KEY_CODES = json.load(f)

# No keys will be removed. Function keys misclicks are possible and numpad buttons might be used for typing in numbers.
# Additionally: UNKNOWN key will be added with value of -1

KEY_CODES["UNKNOWN"] = -1
KEY_CODES = dict(sorted(KEY_CODES.items(), key=lambda x: x[1]))

In [83]:
KEY_CODES

{'UNKNOWN': -1,
 'backspace': 8,
 'tab': 9,
 'enter': 13,
 'shift': 16,
 'ctrl': 17,
 'alt': 18,
 'pausebreak': 19,
 'capslock': 20,
 'esc': 27,
 'space': 32,
 'pageup': 33,
 'pagedown': 34,
 'end': 35,
 'home': 36,
 'leftarrow': 37,
 'uparrow': 38,
 'rightarrow': 39,
 'downarrow': 40,
 'print_screen': 44,
 'insert': 45,
 'delete': 46,
 '0': 48,
 '1': 49,
 '2': 50,
 '3': 51,
 '4': 52,
 '5': 53,
 '6': 54,
 '7': 55,
 '8': 56,
 '9': 57,
 'a': 65,
 'b': 66,
 'c': 67,
 'd': 68,
 'e': 69,
 'f': 70,
 'g': 71,
 'h': 72,
 'i': 73,
 'j': 74,
 'k': 75,
 'l': 76,
 'm': 77,
 'n': 78,
 'o': 79,
 'p': 80,
 'q': 81,
 'r': 82,
 's': 83,
 't': 84,
 'u': 85,
 'v': 86,
 'w': 87,
 'x': 88,
 'y': 89,
 'z': 90,
 'leftwindowkey': 91,
 'rightwindowkey': 92,
 'selectkey': 93,
 'numpad0': 96,
 'numpad1': 97,
 'numpad2': 98,
 'numpad3': 99,
 'numpad4': 100,
 'numpad5': 101,
 'numpad6': 102,
 'numpad7': 103,
 'numpad8': 104,
 'numpad9': 105,
 'multiply': 106,
 'add': 107,
 'subtract': 109,
 'decimalpoint': 110,
 '

In [2]:
COUNTRY_MAP = {
    "US": "United States",
    "GB": "Great Britain",
    "MY": "Malaysia",
    "KR": "Korea",
    "CA": "Canada",
    "PH": "Philippines"
}

LANGUAGE_MAP = {
    "en": "English",
    "zh": "Chinese",
    "tl": "Tagalog", # Philippines
    "cy": "Welsh"
}

MAIN_DIR = "data/small_data"

In [3]:
participant_ids = [re.findall(r"[0-9]+", f)[0] for f in os.listdir(MAIN_DIR) if re.match("[0-9]+", f)]

### Features to calculate 
Link: https://towardsdatascience.com/keystroke-dynamics-analysis-and-prediction-part-1-eda-3fe2d25bac04 <br>
- **Hold Time (aka Dwell Time)**: Time the key is pressed
- **Press-Press Time**: Time between the presses of 2 consecutive keys
- **Release-Press Time (aka Flight Time)**: Time to 'fly' from one key to another (negative in case of rollover)
- **Release-Release Time**: Time between the releases of 2 consecutive keys  

**ROLLOVER** - RELEASE_PRESS_TIME IS NEGATIVE!!!

In [22]:
def read_data_for_participant(participant_id:int, calculate_features:bool=True, drop_timestamps:bool=True, print_info:bool=True) -> pd.DataFrame: 
    global MAIN_DIR
    df = pd.read_csv(os.path.join(MAIN_DIR, f"{participant_id}_keystrokes.txt"), sep="\t")
    df["LETTER"] = df["LETTER"].str.lower()
    if calculate_features:
        df["HOLD_TIME"] = df["RELEASE_TIME"] - df["PRESS_TIME"]
        df["PRESS_PRESS_TIME"] = (df["PRESS_TIME"] - df["PRESS_TIME"].shift(1)).fillna(0)
        df["RELEASE_PRESS_TIME"] = (df["PRESS_TIME"] - df["RELEASE_TIME"].shift(1)).fillna(0)
        df["RELEASE_RELEASE_TIME"] = (df["RELEASE_TIME"] - df["RELEASE_TIME"].shift(1)).fillna(0)
    if drop_timestamps:
        df.drop(["RELEASE_TIME", "PRESS_TIME"], axis=1, inplace=True)
    if print_info:
        print(f"{df[df['SENTENCE'] == df['USER_INPUT']]['TEST_SECTION_ID'].nunique()} / {df['TEST_SECTION_ID'].nunique()}" \
              f" sentences were written correctly by the participant {participant_id}.")
    return df

In [23]:
metadata = pd.read_csv(os.path.join(MAIN_DIR, "metadata_participants.txt"), sep="\t")
metadata["COUNTRY"] = metadata["COUNTRY"].map(COUNTRY_MAP)
metadata["NATIVE_LANGUAGE"] = metadata["NATIVE_LANGUAGE"].map(LANGUAGE_MAP)

In [24]:
df = read_data_for_participant(participant_ids[0])
df.head(5)

10 / 15 sentences were written correctly by the participant 145007.


Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,LETTER,KEYCODE,HOLD_TIME,PRESS_PRESS_TIME,RELEASE_PRESS_TIME,RELEASE_RELEASE_TIME
0,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993989,shift,16,247,0.0,0.0,0.0
1,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993987,i,73,96,144.0,-103.0,-7.0
2,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993991,,32,96,111.0,15.0,111.0
3,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993993,w,87,96,112.0,16.0,112.0
4,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993995,i,73,64,80.0,-16.0,48.0


In [7]:
 metadata.head(5)

Unnamed: 0,PARTICIPANT_ID,AGE,GENDER,HAS_TAKEN_TYPING_COURSE,COUNTRY,LAYOUT,NATIVE_LANGUAGE,FINGERS,TIME_SPENT_TYPING,KEYBOARD_TYPE,ERROR_RATE,AVG_WPM,AVG_IKI,KSPC,ROLLOVER,AVG_KEYPRESS
0,27252,17,male,1,United States,qwerty,English,9.loka,1,full,7.411908,56.4777,177.077497,1.205357,0.1247,104.899627
1,36718,13,female,1,Korea,qwertz,English,7.elo,5,laptop,0.461538,26.671,393.736794,1.151002,0.0396,137.38679
2,56281,31,male,0,United States,qwerty,English,9.loka,1,full,1.381215,65.1923,159.431572,1.152566,0.2831,102.910267
3,64663,16,male,0,United States,qwerty,English,5.kesä,1,full,0.0,37.2631,279.294231,1.156507,0.0699,106.803953
4,64816,27,none,0,Canada,qwerty,Chinese,3.huhti,3,laptop,2.614379,23.0143,304.720883,1.299663,0.1114,154.067021


In [72]:
for participant in participant_ids:
    df = read_data_for_participant(participant, print_info=False)
    calculated = df[df['RELEASE_PRESS_TIME'] <= 0.0].shape[0] / df.shape[0]
    meta = metadata.loc[metadata['PARTICIPANT_ID'] == int(participant)]['ROLLOVER'].values[0]
    print(f"Participant ID: {int(participant):9d}, Calculated rollover: {calculated:7.4f}, Metadata rollover: {meta:7.4f}")

Participant ID:    145007, Calculated rollover:  0.4462, Metadata rollover:  0.4556
Participant ID:    159915, Calculated rollover:  0.1997, Metadata rollover:  0.2032
Participant ID:    264420, Calculated rollover:  0.0607, Metadata rollover:  0.0608
Participant ID:    271802, Calculated rollover:  0.4891, Metadata rollover:  0.5009
Participant ID:     27252, Calculated rollover:  0.1238, Metadata rollover:  0.1247
Participant ID:     36718, Calculated rollover:  0.0401, Metadata rollover:  0.0396
Participant ID:    373245, Calculated rollover:  0.2890, Metadata rollover:  0.2942
Participant ID:    379387, Calculated rollover:  0.1126, Metadata rollover:  0.1135
Participant ID:    401129, Calculated rollover:  0.1983, Metadata rollover:  0.2014
Participant ID:    405680, Calculated rollover:  0.4809, Metadata rollover:  0.4915
Participant ID:    438530, Calculated rollover:  0.2271, Metadata rollover:  0.2297
Participant ID:    444311, Calculated rollover:  0.0462, Metadata rollover: 

### Visualization

1. Create a column containing 2 consecutive keys
2. Error rate vs typing course and time spent typing
3. native language vs fingers
4. fingers vs error rate
5. fingers vs rollover
6. wpm vs rollover
7. wpm vs fingers

In [70]:
KEY_CODES

{'backspace': 8,
 'tab': 9,
 'enter': 13,
 'shift': 16,
 'ctrl': 17,
 'alt': 18,
 'pausebreak': 19,
 'capslock': 20,
 'esc': 27,
 'space': 32,
 'pageup': 33,
 'pagedown': 34,
 'end': 35,
 'home': 36,
 'leftarrow': 37,
 'uparrow': 38,
 'rightarrow': 39,
 'downarrow': 40,
 'print_screen': 44,
 'insert': 45,
 'delete': 46,
 '0': 48,
 '1': 49,
 '2': 50,
 '3': 51,
 '4': 52,
 '5': 53,
 '6': 54,
 '7': 55,
 '8': 56,
 '9': 57,
 'a': 65,
 'b': 66,
 'c': 67,
 'd': 68,
 'e': 69,
 'f': 70,
 'g': 71,
 'h': 72,
 'i': 73,
 'j': 74,
 'k': 75,
 'l': 76,
 'm': 77,
 'n': 78,
 'o': 79,
 'p': 80,
 'q': 81,
 'r': 82,
 's': 83,
 't': 84,
 'u': 85,
 'v': 86,
 'w': 87,
 'x': 88,
 'y': 89,
 'z': 90,
 'leftwindowkey': 91,
 'rightwindowkey': 92,
 'selectkey': 93,
 'numpad0': 96,
 'numpad1': 97,
 'numpad2': 98,
 'numpad3': 99,
 'numpad4': 100,
 'numpad5': 101,
 'numpad6': 102,
 'numpad7': 103,
 'numpad8': 104,
 'numpad9': 105,
 'multiply': 106,
 'add': 107,
 'subtract': 109,
 'decimalpoint': 110,
 'divide': 111,
 '

In [73]:
len(KEY_CODES)**2

10000