In [1]:
import glob
import os
import re
import json
import itertools
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Check DATA

The metadata_participants.txt contains demographics and derived metrics for each participant.
The column names are described as follows:
<table>
    <tr><td>PARTICIPANT_ID</td><td>Unique ID of participant</td></tr>
    <tr><td>AGE</td><td>Participant's age</td></tr>
    <tr><td>GENDER</td><td>Participant's gender</td></tr>
    <tr><td>HAS_TAKEN_TYPING_COURSE</td><td>Whether the participant has taken a typing course (1) or not (0)</td></tr>
    <tr><td>COUNTRY</td><td>Country from which the participant has taken the study</td></tr>
    <tr><td>KEYBOARD_LAYOUT</td><td>QWERTY, AZERTY, QWERTZ or other layout</td></tr>
    <tr><td>NATIVE_LANGUAGE</td><td>Native language of participant</td></tr>
    <tr><td>FINGERS</td><td>choice between 1-2, 3-4, 5-6, 7-8 and 9-10 fingers used</td></tr>
    <tr><td>TIME_SPENT_TYPING</td><td>Number of hours spent typing per day</td></tr>
    <tr><td>KEYBOARD_TYPE</td><td>full (desktop), laptop, small (physical) or (on-screen) touch keyboard</td></tr>
    <tr><td>ERROR_RATE(%)</td><td>Average error rate</td></tr>
    <tr><td>AVG_WPM</td><td>Average words per minute</td></tr>
    <tr><td>AVG_IKI</td><td>Average inter-key interval</td></tr>
    <tr><td>ROLLOVER</td><td>Average rollover ratio</td></tr>
    <tr><td>KSPC</td><td>Average Keystrokes per Character</td></tr>
    <tr><td>AVG_KEYPRESS</td><td>Average keypress duration</td></tr>
</table>

Files named number_keystrokes.txt are the keystroke-by-keystroke logs for all test sentences transcribed by the participant ID = number.<br>
The column names are described as follows: <br>
    
 <table>
    <tr><td>PARTICIPANT_ID</td><td>Unique ID of participant</td></tr>
    <tr><td>TEST_SECTION_ID</td><td>Unique ID of sentence within a participant's test</td></tr>
    <tr><td>SENTENCE</td><td>Presented sentence</td></tr>
    <tr><td>USER_INPUT</td><td>Transcribed sentence</td></tr>
    <tr><td>KEYSTROKE_ID</td><td>Unique keystroke id (across all participants)</td></tr>
    <tr><td>PRESS_TIME</td><td>Timestamp when the key was pressed</td></tr>
    <tr><td>RELEASE_TIME</td><td>Timestamp when the key was released</td></tr>
    <tr><td>LETTER</td><td>String representation of the pressed key</td></tr>
    <tr><td>KEYCODE</td><td>JavaScript keycode of the pressed key</td></tr>
</table>

For some users, the typed letter was not logged correctly. Instead, the corresponding javascript keycode can be used. <br>


**Initial thoughts**: <br>
- calculate flight time between neighbouring symbols<br>
- find correct letter types (using keycodes)<br>
- remove small (physical) or (on-screen) touch keyboard<br>

In [14]:
class Mapper:
    
    def __init__(self):
        self.KEY_TO_CODE = dict()
        with open("key-codes.json", "rb") as f:
            self.KEY_TO_CODE = json.load(f)
        self.KEY_TO_CODE["<SoS>"] = 0 # Start of Sequence
        self.KEY_TO_CODE = dict(sorted(self.KEY_TO_CODE.items(), key=lambda x: x[1]))
        self.KEY_CODES = list(self.KEY_TO_CODE.values())
        self.dict_size = len(self.KEY_TO_CODE)
        self.KEY_TO_CODE["UNKNOWN"] = -1
        self.CODE_TO_KEY = { v: k for k, v in self.KEY_TO_CODE.items() }
        self.inner_mapping = { k: i for i, k in enumerate(self.KEY_CODES) }
        self.reversed_inner_mapping = {i: k for i, k in enumerate(self.KEY_CODES)}
        
    def get_key_from_code(self, code: int):
        try:
            return self.CODE_TO_KEY[code]
        except:
            return "Code not found"
    
    def get_code_from_key(self, key: str):
        try:
            return self.KEY_TO_CODE[key]
        except:
            return "Key not found"
        
    def get_key_from_mapped_code(self, code: int):
        try:
            code = self.reversed_inner_mapping[code]
            return self.get_key_from_code(code)
        except:
            return "Mapped code not found"
    
    def get_mapped_code_from_key(self, key: str):
        try:
            code = self.get_code_from_key(key)
            return self.inner_mapping[code]
        except:
            return "Key not found"
    
    def get_mapped_code_from_code(self, code: int):
        try:
            return self.inner_mapping[code]
        except:
            return "Code not found"
        
    def get_code_from_mapped_code(self, code: int):
        try:
            return self.reversed_inner_mapping[code]
        except:
            return "Mapped code not found"
            

In [6]:
COUNTRY_MAP = {
    "US": "United States",
    "GB": "Great Britain",
    "MY": "Malaysia",
    "KR": "Korea",
    "CA": "Canada",
    "PH": "Philippines"
}

LANGUAGE_MAP = {
    "en": "English",
    "zh": "Chinese",
    "tl": "Tagalog", # Philippines
    "cy": "Welsh"
}

MAIN_DIR = "data/small_data"

In [7]:
participant_ids = [re.findall(r"[0-9]+", f)[0] for f in os.listdir(MAIN_DIR) if re.match("[0-9]+", f)]

### Features to calculate 
Link: https://towardsdatascience.com/keystroke-dynamics-analysis-and-prediction-part-1-eda-3fe2d25bac04 <br>
- **Hold Time (aka Dwell Time)**: Time the key is pressed
- **Press-Press Time**: Time between the presses of 2 consecutive keys
- **Release-Press Time (aka Flight Time)**: Time to 'fly' from one key to another (negative in case of rollover)
- **Release-Release Time**: Time between the releases of 2 consecutive keys  

**ROLLOVER** - RELEASE_PRESS_TIME IS NEGATIVE!!!

In [166]:
def read_data_for_participant(participant_id:int, calculate_features:bool=True, drop_timestamps:bool=True, print_info:bool=True) -> pd.DataFrame: 
    global MAIN_DIR
    df = pd.read_csv(os.path.join(MAIN_DIR, f"{participant_id}_keystrokes.txt"), sep="\t")
    df["LETTER"] = df["LETTER"].str.lower()
    if calculate_features:
        df["PREV_KEYCODE"] = 0
        df["HOLD_TIME"] = df["RELEASE_TIME"] - df["PRESS_TIME"]
        df["PRESS_PRESS_TIME"] = 0
        df["RELEASE_PRESS_TIME"] = 0
        df["RELEASE_RELEASE_TIME"] = 0 

        shifted_df = df.shift(1).fillna(0)
        new_sentences = df["TEST_SECTION_ID"] == shifted_df["TEST_SECTION_ID"]
        new_sentences = df.index[new_sentences].tolist()
        for index in new_sentences:
            df.at[index, "PREV_KEYCODE"] = int(shifted_df.at[index, "KEYCODE"])
            df.at[index, "PRESS_PRESS_TIME"] = df.at[index, "PRESS_TIME"] - shifted_df.at[index, "PRESS_TIME"]
            df.at[index, "RELEASE_PRESS_TIME"] = df.at[index, "PRESS_TIME"] - shifted_df.at[index, "RELEASE_TIME"]
            df.at[index, "RELEASE_RELEASE_TIME"] = df.at[index, "RELEASE_TIME"] - shifted_df.at[index, "RELEASE_TIME"]
    if drop_timestamps:
        df.drop(["RELEASE_TIME", "PRESS_TIME"], axis=1, inplace=True)
    if print_info:
        print(f"{df[df['SENTENCE'] == df['USER_INPUT']]['TEST_SECTION_ID'].nunique()} / {df['TEST_SECTION_ID'].nunique()}" \
              f" sentences were written correctly by the participant {participant_id}.")
    del shifted_df
    return df

In [167]:
df_test = read_data_for_participant(145007)

df_test

10 / 15 sentences were written correctly by the participant 145007.


Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,LETTER,KEYCODE,PREV_KEYCODE,HOLD_TIME,PRESS_PRESS_TIME,RELEASE_PRESS_TIME,RELEASE_RELEASE_TIME
0,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993989,shift,16,0,247,0,0,0
1,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993987,i,73,16,96,144,-103,-7
2,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993991,,32,73,96,111,15,111
3,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993993,w,87,32,96,112,16,112
4,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993995,i,73,87,64,80,-16,48
...,...,...,...,...,...,...,...,...,...,...,...,...
618,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75000978,o,79,72,112,168,56,168
619,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75000985,t,84,79,96,136,24,120
620,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75001009,e,69,84,147,114,18,165
621,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75001013,l,76,69,80,110,-37,43


In [147]:
metadata = pd.read_csv(os.path.join(MAIN_DIR, "metadata_participants.txt"), sep="\t")
metadata["COUNTRY"] = metadata["COUNTRY"].map(COUNTRY_MAP)
metadata["NATIVE_LANGUAGE"] = metadata["NATIVE_LANGUAGE"].map(LANGUAGE_MAP)

In [145]:
df = read_data_for_participant(participant_ids[0])
df

10 / 15 sentences were written correctly by the participant 145007.


Unnamed: 0,PARTICIPANT_ID,TEST_SECTION_ID,SENTENCE,USER_INPUT,KEYSTROKE_ID,LETTER,KEYCODE,PREV_KEYCODE,HOLD_TIME,PRESS_PRESS_TIME,RELEASE_PRESS_TIME,RELEASE_RELEASE_TIME
0,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993989,shift,16,0,247,0,0,0
1,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993987,i,73,16,96,144,-103,-7
2,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993991,,32,73,96,111,15,111
3,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993993,w,87,32,96,112,16,112
4,145007,1577476,"I will be out on Friday, but any other day is ...","I will be out on Frida, but any other day is f...",74993995,i,73,87,64,80,-16,48
...,...,...,...,...,...,...,...,...,...,...,...,...
618,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75000978,o,79,72,112,168,56,168
619,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75000985,t,84,79,96,136,24,120
620,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75001009,e,69,84,147,114,18,165
621,145007,1577629,I am staying at a Hotel.,I am staying at a Hotel.,75001013,l,76,69,80,110,-37,43


In [146]:
 metadata.head(5)

Unnamed: 0,PARTICIPANT_ID,AGE,GENDER,HAS_TAKEN_TYPING_COURSE,COUNTRY,LAYOUT,NATIVE_LANGUAGE,FINGERS,TIME_SPENT_TYPING,KEYBOARD_TYPE,ERROR_RATE,AVG_WPM,AVG_IKI,KSPC,ROLLOVER,AVG_KEYPRESS
0,27252,17,male,1,United States,qwerty,English,9.loka,1,full,7.411908,56.4777,177.077497,1.205357,0.1247,104.899627
1,36718,13,female,1,Korea,qwertz,English,7.elo,5,laptop,0.461538,26.671,393.736794,1.151002,0.0396,137.38679
2,56281,31,male,0,United States,qwerty,English,9.loka,1,full,1.381215,65.1923,159.431572,1.152566,0.2831,102.910267
3,64663,16,male,0,United States,qwerty,English,5.kesä,1,full,0.0,37.2631,279.294231,1.156507,0.0699,106.803953
4,64816,27,none,0,Canada,qwerty,Chinese,3.huhti,3,laptop,2.614379,23.0143,304.720883,1.299663,0.1114,154.067021


In [148]:
for participant in participant_ids:
    df = read_data_for_participant(participant, print_info=False)
    calculated = df[df['RELEASE_PRESS_TIME'] <= 0.0].shape[0] / df.shape[0]
    meta = metadata.loc[metadata['PARTICIPANT_ID'] == int(participant)]['ROLLOVER'].values[0]
    print(f"Participant ID: {int(participant):9d}, Calculated rollover: {calculated:7.4f}, Metadata rollover: {meta:7.4f}")

Participant ID:    145007, Calculated rollover:  0.4687, Metadata rollover:  0.4556
Participant ID:    159915, Calculated rollover:  0.2235, Metadata rollover:  0.2032
Participant ID:    264420, Calculated rollover:  0.0777, Metadata rollover:  0.0608
Participant ID:    271802, Calculated rollover:  0.5145, Metadata rollover:  0.5009
Participant ID:     27252, Calculated rollover:  0.1386, Metadata rollover:  0.1247
Participant ID:     36718, Calculated rollover:  0.0588, Metadata rollover:  0.0396
Participant ID:    373245, Calculated rollover:  0.3104, Metadata rollover:  0.2942
Participant ID:    379387, Calculated rollover:  0.1309, Metadata rollover:  0.1135
Participant ID:    401129, Calculated rollover:  0.2180, Metadata rollover:  0.2014
Participant ID:    405680, Calculated rollover:  0.5042, Metadata rollover:  0.4915
Participant ID:    438530, Calculated rollover:  0.2500, Metadata rollover:  0.2297
Participant ID:    444311, Calculated rollover:  0.0633, Metadata rollover: 

In [149]:
mapper = Mapper()

In [156]:
two_symb_occur = torch.zeros((mapper.dict_size, mapper.dict_size))

general_df = pd.DataFrame()

for participant in participant_ids:
    t = read_data_for_participant(participant)
    general_df = pd.concat([general_df, t])
    for prev_code, curr_code in zip(t["PREV_KEYCODE"].values, t["KEYCODE"].values):
        two_symb_occur[mapper.get_mapped_code_from_code(prev_code), mapper.get_mapped_code_from_code(curr_code)] += 1

10 / 15 sentences were written correctly by the participant 145007.
9 / 15 sentences were written correctly by the participant 159915.
2 / 15 sentences were written correctly by the participant 264420.
12 / 15 sentences were written correctly by the participant 271802.
0 / 15 sentences were written correctly by the participant 27252.
12 / 15 sentences were written correctly by the participant 36718.
14 / 15 sentences were written correctly by the participant 373245.
11 / 15 sentences were written correctly by the participant 379387.
6 / 15 sentences were written correctly by the participant 401129.
14 / 15 sentences were written correctly by the participant 405680.
8 / 15 sentences were written correctly by the participant 438530.
8 / 15 sentences were written correctly by the participant 444311.
10 / 15 sentences were written correctly by the participant 455926.
0 / 15 sentences were written correctly by the participant 473281.
15 / 15 sentences were written correctly by the participa

In [157]:
two_symb_occur

tensor([[  0.,   0.,   0.,  ...,   0.,   0.,   0.],
        [  0., 293.,   0.,  ...,   0.,   0.,   5.],
        [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
        ...,
        [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
        [  0.,   0.,   0.,  ...,   0.,   0.,   0.],
        [  0.,   3.,   0.,  ...,   0.,   0.,   0.]])

In [158]:
sum(two_symb_occur[0])

tensor(300.)

In [188]:
# Most used bigraphs

k = 3

for i in range(two_symb_occur.shape[0]):
    print(f"Most used symbols with: {mapper.get_key_from_mapped_code(i)}")
    topk = torch.topk(two_symb_occur[i], k)
    for index in topk.indices:
        print(f"{mapper.get_key_from_mapped_code(index.item())} \t", end="")
    print()

# torch.argmax(two_symb_occur, axis=1)

Most used symbols with: <SoS>
shift 	capslock 	t 	
Most used symbols with: backspace
backspace 	e 	space 	
Most used symbols with: tab
<SoS> 	backspace 	tab 	
Most used symbols with: enter
<SoS> 	backspace 	tab 	
Most used symbols with: shift
i 	t 	forwardslash 	
Most used symbols with: ctrl
<SoS> 	backspace 	tab 	
Most used symbols with: alt
<SoS> 	backspace 	tab 	
Most used symbols with: pausebreak
<SoS> 	backspace 	tab 	
Most used symbols with: capslock
i 	h 	a 	
Most used symbols with: esc
<SoS> 	backspace 	tab 	
Most used symbols with: space
t 	a 	w 	
Most used symbols with: pageup
<SoS> 	backspace 	tab 	
Most used symbols with: pagedown
<SoS> 	backspace 	tab 	
Most used symbols with: end
<SoS> 	backspace 	tab 	
Most used symbols with: home
<SoS> 	backspace 	tab 	
Most used symbols with: leftarrow
leftarrow 	backspace 	t 	
Most used symbols with: uparrow
<SoS> 	backspace 	tab 	
Most used symbols with: rightarrow
rightarrow 	d 	space 	
Most used symbols with: downarrow
<SoS> 	backs

### Visualization 

1. Error rate vs typing course and time spent typing
2. native language vs fingers
3. fingers vs error rate
4. fingers vs rollover
5. wpm vs rollover
6. wpm vs fingers

101