In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.express as px
import glob, os
import seaborn as sns

In [3]:
user_list = glob.glob('Archived users/*.txt')

user_data = []

for user in user_list:
    user_dict = {}
    id = user.split('_')[1].split('.')[0]
    user_dict['id'] = id
    with open(user) as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            key = line.split(': ')[0]
            val = np.nan if len(line.split(': ')) == 1 else line.split(': ')[1]
            user_dict[key] = val
        user_data.append(user_dict)

In [4]:
user_data = pd.DataFrame(user_data)

In [5]:
user_data=user_data.drop(columns=[col for col in user_data.columns if ':' in col])
user_data

Unnamed: 0,id,Gender,Parkinsons,Tremors,Sided,UPDRS,Impact,Levadopa,DA,MAOB,Other,BirthYear,DiagnosisYear
0,PJU53Y7KVB,Male,True,True,Left,Don't know,Mild,True,False,False,False,,
1,G6OE5CXQPY,Male,False,False,,Don't know,------,False,False,False,False,1952,------
2,2X17VCRRQA,Male,True,True,Left,Don't know,Severe,True,False,False,False,1951,2003
3,I3U47MF5UF,Female,True,True,,Don't know,Medium,True,False,False,True,1947,2014
4,L7Q16SJ7KP,Male,True,True,Right,Don't know,Severe,True,False,True,False,1965,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,VIAXR21TSC,Female,True,True,Left,Don't know,Medium,True,False,False,True,1946,2012
223,IDZHIUK2W2,Female,True,True,,Don't know,Mild,True,False,False,False,1943,2013
224,SW61HF6XRY,Male,True,True,,Don't know,Severe,True,False,False,True,1953,2016
225,2JTCBKUP8T,Male,True,True,Right,Don't know,Medium,True,False,False,False,1958,2013


In [6]:
def is_medicated(df):
    return df['Levadopa'] or df['MAOB'] or df['Other'] or df['DA']

In [7]:
user_data['age'] = 2017 - user_data['BirthYear'].astype(float)
user_data['medicated'] = user_data.apply(is_medicated, axis=1)

In [8]:
user_data

Unnamed: 0,id,Gender,Parkinsons,Tremors,Sided,UPDRS,Impact,Levadopa,DA,MAOB,Other,BirthYear,DiagnosisYear,age,medicated
0,PJU53Y7KVB,Male,True,True,Left,Don't know,Mild,True,False,False,False,,,,True
1,G6OE5CXQPY,Male,False,False,,Don't know,------,False,False,False,False,1952,------,65.0,False
2,2X17VCRRQA,Male,True,True,Left,Don't know,Severe,True,False,False,False,1951,2003,66.0,True
3,I3U47MF5UF,Female,True,True,,Don't know,Medium,True,False,False,True,1947,2014,70.0,True
4,L7Q16SJ7KP,Male,True,True,Right,Don't know,Severe,True,False,True,False,1965,2007,52.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,VIAXR21TSC,Female,True,True,Left,Don't know,Medium,True,False,False,True,1946,2012,71.0,True
223,IDZHIUK2W2,Female,True,True,,Don't know,Mild,True,False,False,False,1943,2013,74.0,True
224,SW61HF6XRY,Male,True,True,,Don't know,Severe,True,False,False,True,1953,2016,64.0,True
225,2JTCBKUP8T,Male,True,True,Right,Don't know,Medium,True,False,False,False,1958,2013,59.0,True


In [9]:
user_data['Parkinsons'] = user_data['Parkinsons'] == 'True'
user_data['Tremors'] = user_data['Tremors'] == 'True'
user_data['medicated'] = user_data['medicated'] == 'True'
cols = ['id', 'Gender', 'Parkinsons', 'Tremors', 'Sided', 'UPDRS', 'Impact', 'age', 'medicated']
user_data = user_data[cols]
user_data

Unnamed: 0,id,Gender,Parkinsons,Tremors,Sided,UPDRS,Impact,age,medicated
0,PJU53Y7KVB,Male,True,True,Left,Don't know,Mild,,True
1,G6OE5CXQPY,Male,False,False,,Don't know,------,65.0,False
2,2X17VCRRQA,Male,True,True,Left,Don't know,Severe,66.0,True
3,I3U47MF5UF,Female,True,True,,Don't know,Medium,70.0,True
4,L7Q16SJ7KP,Male,True,True,Right,Don't know,Severe,52.0,True
...,...,...,...,...,...,...,...,...,...
222,VIAXR21TSC,Female,True,True,Left,Don't know,Medium,71.0,True
223,IDZHIUK2W2,Female,True,True,,Don't know,Mild,74.0,True
224,SW61HF6XRY,Male,True,True,,Don't know,Severe,64.0,True
225,2JTCBKUP8T,Male,True,True,Right,Don't know,Medium,59.0,True


In [10]:
user_data['UPDRS'] = user_data['UPDRS'].apply(lambda x: int(x) if x != 'Don\'t know' else 0)
impact_dict = {'Mild': 1, 'Medium': 2, 'Severe': 3, ' ------':0, np.nan:0}
user_data['Impact'] = user_data['Impact'].apply(lambda x: impact_dict[x]).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['UPDRS'] = user_data['UPDRS'].apply(lambda x: int(x) if x != 'Don\'t know' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['Impact'] = user_data['Impact'].apply(lambda x: impact_dict[x]).fillna(0)


In [11]:
user_data.to_csv('user_data.csv', index=False)

In [2]:
data_list = glob.glob('Tappy Data/*.txt')
exp_data = []
for data in data_list:
    with open(data) as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip().split('\t')
            if len(line) != 8:
                print(line)
            else:
                exp_data.append(line)

['OMCPRWYBSQ', '161027', '11:38:01.707', 'R', '0085.9', 'LR', '0601.6']
['QAH9IVALVC', '160829', '13:33:10.324', 'L', '0078.1', 'LL', '0250.0']


In [3]:
exp_data = pd.DataFrame(exp_data, columns=['id','date','time','hand', 'hold_time', 'direction', 'latency', 'flight_time'])
display(exp_data)

Unnamed: 0,id,date,time,hand,hold_time,direction,latency,flight_time
0,0EA27ICBLF,160722,18:41:04.336,L,0101.6,LL,0234.4,0156.3
1,0EA27ICBLF,160722,18:42:14.070,L,0085.9,LL,0437.5,0359.4
2,0EA27ICBLF,160722,18:42:14.273,L,0078.1,LL,0210.9,0125.0
3,0EA27ICBLF,160722,18:42:14.617,L,0062.5,LL,0359.4,0281.3
4,0EA27ICBLF,160722,18:42:15.586,S,0125.0,LS,0187.5,0093.8
...,...,...,...,...,...,...,...,...
9316851,ZYWLN4JVLA,170126,13:56:20.117,L,0195.3,RL,0425.8,0261.7
9316852,ZYWLN4JVLA,170126,13:56:20.242,R,0105.5,LR,0214.8,0019.5
9316853,ZYWLN4JVLA,170126,13:56:33.625,L,0168.0,LL,0332.0,0015.6
9316854,ZYWLN4JVLA,170126,13:56:33.836,L,0097.7,LL,0281.3,0113.3


In [4]:
def process_rows(df):
    if len(df['id']) != 10:
        df['id'] = np.nan
    if len(df['hold_time']) != 6:
        df['hold_time'] = np.nan
    else:
        df['hold_time'] = float(df['hold_time'])

    if len(df['latency']) != 6:
        df['latency'] = np.nan
    else:
        df['latency'] = float(df['latency'])
    if len(df['flight_time']) != 6:
        df['flight_time'] = np.nan
    else:
        df['flight_time'] = float(df['flight_time'])
    return df

In [5]:
exp_data = exp_data.apply(process_rows, axis=1)

In [16]:
exp_data.head()

Unnamed: 0,id,date,time,hand,hold_time,direction,latency,flight_time
0,NMMGWRY6SO,170301,08:45:42.125,L,187.5,LL,421.9,281.3
1,NMMGWRY6SO,170301,08:45:42.422,L,203.1,LL,281.3,93.8
2,NMMGWRY6SO,170301,08:48:29.031,L,203.1,LL,296.9,125.0
3,NMMGWRY6SO,170301,08:48:29.266,L,218.8,LL,218.8,15.6
4,NMMGWRY6SO,170301,08:48:29.484,R,187.5,LR,250.0,31.3
...,...,...,...,...,...,...,...,...
9316851,3HYXJCTCNT,170126,14:12:47.000,S,58.6,RS,214.8,160.2
9316852,3HYXJCTCNT,170126,14:12:49.422,R,85.9,LR,175.8,82.0
9316853,3HYXJCTCNT,170126,14:12:49.598,R,39.1,RR,222.7,136.7
9316854,3HYXJCTCNT,170126,14:12:51.160,L,43.0,LL,125.0,285.2


In [19]:
exp_data = exp_data.dropna(axis=0)

In [20]:
exp_data.to_csv('typing_data.csv', index=False)

In [21]:
big_data = pd.merge(exp_data, user_data, on='id', how='inner')