In [124]:
#Unzipping all the files to ease preprocessing

import zipfile
import os

# Unzip user metadata
with zipfile.ZipFile("/users/imbahndu/Desktop/Columbia DBM/tappy-keystroke-data-1.0.0/Archived-users.zip", 'r') as zip_ref:
    zip_ref.extractall('user_info')

# Unzip keystroke data
with zipfile.ZipFile("/users/imbahndu/Desktop/Columbia DBM/tappy-keystroke-data-1.0.0/Archived-Data.zip", 'r') as zip_ref:
    zip_ref.extractall('keystroke_data')


In [125]:
import os
import pandas as pd

keystroke_records = []

# Folder containing all the .txt files
data_folder = 'keystroke_data/Tappy Data'

# Loop through each file in the folder and finding the ones ending with .txt
for file in os.listdir(data_folder):
    if file.endswith('.txt'):
        file_path = os.path.join(data_folder, file)
        #Create a dataframe for each file 
        try:
            df = pd.read_csv(
                file_path,
                sep='\t',
                header=None,
                on_bad_lines="skip",
                #the names are extracted from the dataset where I pulled this information
                names=[
                     "UserKey",'Date', 'Timestamp',
                    'Hand', 'HoldTime', 'Direction',
                    'LatencyTime', 'FlightTime'
                ], low_memory=False, usecols=range(8)
            )
            #Creating a new df column to trace the file, but will drop it for ml project
            df['Filename'] = file  # optional for traceability
            keystroke_records.append(df)
        except Exception as e:
            print(f"Failed to read {file}: {e}")

# Combine all parsed files
if keystroke_records:
    keystrokes_df = pd.concat(keystroke_records, ignore_index=True)
    print("Successfully loaded keystroke data:")
else:
    print("No valid keystroke files found.")


keystrokes_df


Successfully loaded keystroke data:


Unnamed: 0,UserKey,Date,Timestamp,Hand,HoldTime,Direction,LatencyTime,FlightTime,Filename
0,0EA27ICBLF,160722,18:41:04.336,L,0101.6,LL,0234.4,0156.3,0EA27ICBLF_1607.txt
1,0EA27ICBLF,160722,18:42:14.070,L,0085.9,LL,0437.5,0359.4,0EA27ICBLF_1607.txt
2,0EA27ICBLF,160722,18:42:14.273,L,0078.1,LL,0210.9,0125.0,0EA27ICBLF_1607.txt
3,0EA27ICBLF,160722,18:42:14.617,L,0062.5,LL,0359.4,0281.3,0EA27ICBLF_1607.txt
4,0EA27ICBLF,160722,18:42:15.586,S,0125.0,LS,0187.5,0093.8,0EA27ICBLF_1607.txt
...,...,...,...,...,...,...,...,...,...
9316853,ZYWLN4JVLA,170126,13:56:20.117,L,0195.3,RL,0425.8,261.7,ZYWLN4JVLA_1701.txt
9316854,ZYWLN4JVLA,170126,13:56:20.242,R,0105.5,LR,0214.8,19.5,ZYWLN4JVLA_1701.txt
9316855,ZYWLN4JVLA,170126,13:56:33.625,L,0168.0,LL,0332.0,15.6,ZYWLN4JVLA_1701.txt
9316856,ZYWLN4JVLA,170126,13:56:33.836,L,0097.7,LL,0281.3,113.3,ZYWLN4JVLA_1701.txt


In [126]:
#Doing thesame thing above for user_info 

user_info_list = []

user_folder = "/users/imbahndu/Desktop/Columbia DBM/tappy-keystroke-data-1.0.0/user_info/Archived users"

#Looping through each file 
for file in os.listdir(user_folder):
    if file.endswith(".txt"):
        #Opening them
        filename = os.path.join(user_folder, file)
        user_data = {}
        #Inspecting the way the user information is encrypted to decrypt it
        with open(filename, "r" ) as f:
            #looping through each file
            for line in f:
                if ":" in line:
                    #append the text before a key in the user_data dictionary and adding the association as the value
                    key, value = line.strip().split(":", 1)
                    print(key, value)
                    user_data[key.strip()] = value.strip()
                    #Adding the file name for traceabililty 
        user_data["UserKey"] = file.replace(".txt", " ")
        user_data["UserKey"] = file.replace("User_", " ")
                    #Appending final results
        user_info_list.append(user_data)
        print(user_data)
print(user_info_list)
user_df = pd.DataFrame(user_info_list)


BirthYear  1952
Gender  Female
Parkinsons  True
Tremors  True
DiagnosisYear  2000
Sided  Left
UPDRS  Don't know
Impact  Severe
Levadopa  True
DA  True
MAOB  False
Other  False
{'BirthYear': '1952', 'Gender': 'Female', 'Parkinsons': 'True', 'Tremors': 'True', 'DiagnosisYear': '2000', 'Sided': 'Left', 'UPDRS': "Don't know", 'Impact': 'Severe', 'Levadopa': 'True', 'DA': 'True', 'MAOB': 'False', 'Other': 'False', 'UserKey': ' 0EA27ICBLF.txt'}
BirthYear  1959
Gender  Female
Parkinsons  False
Tremors  False
DiagnosisYear   ------
Sided  None
UPDRS  Don't know
Impact   ------
Levadopa  False
DA  False
MAOB  False
Other  False
{'BirthYear': '1959', 'Gender': 'Female', 'Parkinsons': 'False', 'Tremors': 'False', 'DiagnosisYear': '------', 'Sided': 'None', 'UPDRS': "Don't know", 'Impact': '------', 'Levadopa': 'False', 'DA': 'False', 'MAOB': 'False', 'Other': 'False', 'UserKey': ' 0QAZFRHQHW.txt'}
BirthYear  1946
Gender  Female
Parkinsons  False
Tremors  False
DiagnosisYear   ------
Sided  None
U

In [127]:
print(user_df)

    BirthYear  Gender Parkinsons Tremors DiagnosisYear  Sided       UPDRS  \
0        1952  Female       True    True          2000   Left  Don't know   
1        1959  Female      False   False        ------   None  Don't know   
2        1946  Female      False   False        ------   None  Don't know   
3        1944    Male      False   False        ------   None  Don't know   
4        1953    Male       True    True          2017   Left  Don't know   
..        ...     ...        ...     ...           ...    ...         ...   
222      1957  Female       True    True          2008   Left  Don't know   
223      1942    Male       True   False          2016   None  Don't know   
224      1942    Male       True    True          2013   None  Don't know   
225            Female       True    True          2015  Right  Don't know   
226            Female       True   False                Right  Don't know   

     Impact Levadopa     DA   MAOB  Other          UserKey  
0    Severe   

In [128]:
user_df["UserKey"]

0       0EA27ICBLF.txt
1       0QAZFRHQHW.txt
2       0WTDIGPSBZ.txt
3       1HOEBIGASW.txt
4       1WMVCCU4RH.txt
            ...       
222     ZT9ASWFCFS.txt
223     ZWBPPNQCUX.txt
224     ZWHGXDUDLG.txt
225     ZY9CCHSPF2.txt
226     ZYWLN4JVLA.txt
Name: UserKey, Length: 227, dtype: object

In [None]:
#Replacing instances of .txt with an empty string to ease concatenation 
user_df["UserKey"] = user_df['UserKey'].str.replace(".txt", " ", case=False)
user_df["UserKey"]

0       0EA27ICBLF 
1       0QAZFRHQHW 
2       0WTDIGPSBZ 
3       1HOEBIGASW 
4       1WMVCCU4RH 
           ...     
222     ZT9ASWFCFS 
223     ZWBPPNQCUX 
224     ZWHGXDUDLG 
225     ZY9CCHSPF2 
226     ZYWLN4JVLA 
Name: UserKey, Length: 227, dtype: object

In [None]:
k#Getting rid of extra characters within the file
eystrokes_df['UserKey'] = keystrokes_df['Filename'].str.extract(r'([A-Z0-9]+)', expand=False)
keystrokes_df["UserKey"]

0          0EA27ICBLF
1          0EA27ICBLF
2          0EA27ICBLF
3          0EA27ICBLF
4          0EA27ICBLF
              ...    
9316853    ZYWLN4JVLA
9316854    ZYWLN4JVLA
9316855    ZYWLN4JVLA
9316856    ZYWLN4JVLA
9316857    ZYWLN4JVLA
Name: UserKey, Length: 9316858, dtype: object

In [137]:
#Making everything consistent to facilitate merging, removing whitespace and making everything upper case
keystrokes_df['UserKey'] = keystrokes_df['UserKey'].str.strip().str.upper()
user_df['UserKey'] = user_df['UserKey'].str.strip().str.upper()


In [132]:
# Preview which keys exist in both
print("UserKeys in keystrokes not in user_df:", set(keystrokes_df['UserKey']) - set(user_df['UserKey']))
print("UserKeys in user_df not in keystrokes:", set(user_df['UserKey']) - set(keystrokes_df['UserKey']))


UserKeys in keystrokes not in user_df: {'7OIX84KINL', 'TQCMYNHOSM', 'WHTIJRQSYC', 'JZVDCNBT1S', 'UURLOSDUOA', 'GDGG19NBH7', 'PHOPRDDRXM', 'YCQJAEI7RN', 'L9ULZGDITG', 'MFAQFDW3IH', 'UFMGBC58XD', '0NMZ9JBBHI', '6VH7AFDQ32', 'Z1QLXS3DGS', '9ALACHPPUR', 'ZFADOEIUOD', 'VDKQJZARYF', 'AEKGFUEQUU', 'SEVJ2WXCKH', 'M6MUS5WY6N', 'NKKUDKDMGQ', 'KE21ILBSOI', '4IE6CIRI0V', 'KLM4CVXNRB', 'QPRYN1F9BZ', '1BMKGIOSHF', 'QEYMRM1ZSM', 'UMZFEO7IAG', 'XZ9HNUO4KO', 'S5FSMHZEYI', 'TLR6R7C7AP', 'RWPUNZZ3EF', 'PESPWEVTGF', 'RTBOZILJFT', 'YACDBXUNSA', '3HYXJCTCNT', 'ZGZJSOXBWP', 'PVADT62JUM', 'UBO5EZ7IIH', 'HSOCXZDCZM', 'LIPRETG9WY', 'YNESMGNGWK', 'EAGGQIYQYT', 'A0K1FZANZN', 'OTUYARXZX2', '55KMBQR3HH', 'PRG54RZY7X', 'WHUETYQCMK', '9IA0YLPHU2'}
UserKeys in user_df not in keystrokes: {'5CECBJXGB3', 'QSFGD1TSPA', '1WMVCCU4RH', 'K4TGUFAPJ8', '0WTDIGPSBZ', '3LBXTMXULC', 'YWAWVFZZZE', 'YFZEXV85OS', 'NTWSB0ULD7', 'KCHR1NSQZO'}


In [133]:
#Unifying each key here, making sure everything is consistent
keystrokes_df['UserKey'] = keystrokes_df['UserKey'].str.split("_").str[0].str.strip().str.upper()
user_df['UserKey'] = user_df['UserKey'].str.strip().str.upper()

In [134]:
#Concatenate both files to have a unified pf
merged_df = keystrokes_df.merge(user_df, on="UserKey", how="left")
merged_df

Unnamed: 0,UserKey,Date,Timestamp,Hand,HoldTime,Direction,LatencyTime,FlightTime,Filename,BirthYear,...,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levadopa,DA,MAOB,Other
0,0EA27ICBLF,160722,18:41:04.336,L,0101.6,LL,0234.4,0156.3,0EA27ICBLF_1607.txt,1952,...,True,True,2000,Left,Don't know,Severe,True,True,False,False
1,0EA27ICBLF,160722,18:42:14.070,L,0085.9,LL,0437.5,0359.4,0EA27ICBLF_1607.txt,1952,...,True,True,2000,Left,Don't know,Severe,True,True,False,False
2,0EA27ICBLF,160722,18:42:14.273,L,0078.1,LL,0210.9,0125.0,0EA27ICBLF_1607.txt,1952,...,True,True,2000,Left,Don't know,Severe,True,True,False,False
3,0EA27ICBLF,160722,18:42:14.617,L,0062.5,LL,0359.4,0281.3,0EA27ICBLF_1607.txt,1952,...,True,True,2000,Left,Don't know,Severe,True,True,False,False
4,0EA27ICBLF,160722,18:42:15.586,S,0125.0,LS,0187.5,0093.8,0EA27ICBLF_1607.txt,1952,...,True,True,2000,Left,Don't know,Severe,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9316853,ZYWLN4JVLA,170126,13:56:20.117,L,0195.3,RL,0425.8,261.7,ZYWLN4JVLA_1701.txt,,...,True,False,,Right,Don't know,Medium,True,True,True,True
9316854,ZYWLN4JVLA,170126,13:56:20.242,R,0105.5,LR,0214.8,19.5,ZYWLN4JVLA_1701.txt,,...,True,False,,Right,Don't know,Medium,True,True,True,True
9316855,ZYWLN4JVLA,170126,13:56:33.625,L,0168.0,LL,0332.0,15.6,ZYWLN4JVLA_1701.txt,,...,True,False,,Right,Don't know,Medium,True,True,True,True
9316856,ZYWLN4JVLA,170126,13:56:33.836,L,0097.7,LL,0281.3,113.3,ZYWLN4JVLA_1701.txt,,...,True,False,,Right,Don't know,Medium,True,True,True,True


In [135]:
#Something is off about dataset above --> Double check

#Parse user info 

#Merge both together and perform ML stuff (hopefullly they have similar parameters -- rows , columns and stuff)

In [136]:
merged_df["Parkinsons"].value_counts()

Parkinsons
True     6470722
False    2544149
Name: count, dtype: int64

In [None]:
#Exporting the df file to a file directory for preprocessong

merged_df.to_csv("typing.csv")