# Import Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Get data frame with all users for which there is general data
users = pd.read_csv("Users.csv")

In [3]:
print(users.columns)
users.head()

Index(['UserID', 'BirthYear', 'DA', 'DiagnosisYear', 'Gender', 'Impact',
       'Levadopa', 'MAOB', 'Other', 'Parkinsons', 'Sided', 'Tremors', 'UPDRS'],
      dtype='object')


Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,Sided,Tremors,UPDRS
0,User_0EA27ICBLF,1952.0,True,2000,Female,Severe,True,False,False,True,Left,True,Don't know
1,User_0QAZFRHQHW,1959.0,False,------,Female,------,False,False,False,False,,False,Don't know
2,User_0WTDIGPSBZ,1946.0,False,------,Female,------,False,False,False,False,,False,Don't know
3,User_1HOEBIGASW,1944.0,False,------,Male,------,False,False,False,False,,False,Don't know
4,User_1WMVCCU4RH,1953.0,False,2017,Male,Medium,False,False,False,True,Left,True,Don't know


# Modify DataFrame

Turn the DataFrame into more usable with all numbers. <br>
UserID: dropped first 5 digits, which are are only "User_" <br>
Parkinsons: 1 for diagnosed with Parkinsons, 0 for not <br>
DA: 1 for using, 0 for not <br>
Levadopa: 1 for using, 0 for not <br>
MAOB: 1 for using, 0 for not <br>
Other: 1 for using, 0 for not <br>
Gender: 1 for female, 0 for male <br>
Tremors: 1 for yes, 0 for no <br>
Impact: 0 - 3 for not applicable, mild, medium, severe <br>
Sided: 0 for none, 1 for left, 2 for right <br>
UPDRS: 0 for unknown, then 1-5 on scale <br>



In [4]:
# Modify the data frame to be more usable
conditions = [
    (users['Impact'] == ' Mild'),
    (users['Impact'] == ' Medium'),
    (users['Impact'] == ' Severe'),
    (users['Impact'] != ' Mild') & (users['Impact'] != ' Medium') & (users['Impact'] != 'Severe')
    ]
values = [1,2,3,0]

conditions1 = [
    (users['UPDRS'] == 'Don\'t know'), 
    (users['UPDRS'] == '1'),
    (users['UPDRS'] == '2'),
    (users['UPDRS'] == '3'),
    (users['UPDRS'] == '4'),
    (users['UPDRS'] == '5')
]
values1 = [0,1,2,3,4,5]

conditions2 = [
    (users['Sided'] == ' None'),
    (users['Sided'] == ' Left'),
    (users['Sided'] == ' Right'),
]
values2 = [0,1,2]


users['UserID'] = [e[5:] for e in users['UserID']]
users['Gender'] = np.where(users['Gender'] == ' Female', 1, 0)
users['Parkinsons'] = np.where(users['Parkinsons'] == ' True', 1, 0)
users['DA'] = np.where(users['DA'] == ' True', 1, 0)
users['Levadopa'] = np.where(users['Levadopa'] == ' True', 1, 0)
users['MAOB'] = np.where(users['MAOB'] == ' True', 1, 0)
users['Tremors'] = np.where(users['Tremors'] == ' True', 1, 0)
users['Impact'] = np.select(conditions, values)
users['Other'] = np.where(users['Other'] == ' True', 1, 0)
users['Sided'] = np.select(conditions2, values2)
users['UPDRS'] = np.select(conditions1, values1)
users['BirthYear'] = pd.to_numeric(users['BirthYear'], errors='coerce')
users['DiagnosisYear'] = pd.to_numeric(users['DiagnosisYear'], errors='coerce')

In [5]:
users.head(10)

Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,Sided,Tremors,UPDRS
0,0EA27ICBLF,1952.0,1,2000.0,1,3,1,0,0,1,1,1,0
1,0QAZFRHQHW,1959.0,0,,1,0,0,0,0,0,0,0,0
2,0WTDIGPSBZ,1946.0,0,,1,0,0,0,0,0,0,0,0
3,1HOEBIGASW,1944.0,0,,0,0,0,0,0,0,0,0,0
4,1WMVCCU4RH,1953.0,0,2017.0,0,2,0,0,0,1,1,1,0
5,1XNJCXS3EY,1936.0,0,,0,0,0,0,0,0,0,0,0
6,2JTCBKUP8T,1958.0,0,2013.0,0,2,1,0,0,1,2,1,0
7,2X17VCRRQA,1951.0,0,2003.0,0,3,1,0,0,1,1,1,0
8,310NXPGJPD,1961.0,0,2015.0,1,2,1,0,1,1,1,1,0
9,39KQRZCESF,1942.0,1,2013.0,0,2,1,0,0,1,2,1,0


# Data Breakdown
Here, we examine how the participants break down into the various categories.

In [6]:
parkinsons = users[users['Parkinsons'] == 1]
none = users[users['Parkinsons'] == 0]
print("There are",len(parkinsons),"observations in the parkinsons df.")
print("There are",len(none),"observations in the none df.")

There are 169 observations in the parkinsons df.
There are 58 observations in the none df.


In [7]:
da = users[users['DA'] == 1]
no_da = users.drop(da.index)
print("There are",len(da),"people who take dopamine agonist.")
print("There are",len(no_da),"people who do not (including those without parkinsons).")

There are 33 people who take dopamine agonist.
There are 194 people who do not (including those without parkinsons).


In [8]:
lev = users[users['Levadopa'] == 1]
no_lev = users[users['Levadopa'] == 0]
print("There are",len(lev),"taking levadopa.")
print("There are",len(no_lev),"people who don't take levadopa")

There are 112 taking levadopa.
There are 115 people who don't take levadopa


In [9]:
mao = users[users['MAOB'] == 1]
no_mao = users[users['MAOB'] == 0]
print("There are",len(mao),"taking MAO-B inhibitor")
print("There are",len(no_mao),"people who don't take MAO-B inhibitor")

There are 15 taking MAO-B inhibitor
There are 212 people who don't take MAO-B inhibitor


In [9]:
oth = users[users['Other'] == 1]
no_oth = users[users['Other'] == 0]
print("There are",len(oth),"taking other medication")
print("There are",len(no_oth),"people who don't take other medication")

There are 69 taking other medication
There are 158 people who don't take other medication


In [10]:
med1 = parkinsons[parkinsons['Levadopa'] == 0]
med2 = med1[med1['DA'] == 0]
med3 = med2[med2['MAOB'] == 0]
med4 = med3[med3['Other'] == 0]
print("There are",len(med4),"with PD taking no medication")

There are 22 with PD taking no medication


In [11]:
severe = users[users['Impact'] == 2]
print(len(severe))

74


# Start Working with Tappy Files 
First, we find how many users we have that also have tappy data. Then, we isolate some parameters from the tappy data for each person.

In [6]:
import os

In [7]:
# Get people in the User Data
people_users = users['UserID'].tolist()

# Get people in Tappy Data
tappy_files = os.listdir('ArchivedData')
tappy_users = tappy_files
i = 0
for file in tappy_files:
    tappy_users[i] = file[:10]
    i = i + 1

# Isolate the users we want and make new Data Frame
all_users = set(people_users).intersection(tappy_users)
print(len(all_users))

usersA = users[users['UserID'].isin(all_users)]
usersA = usersA[(usersA['Parkinsons'] == 0) | ((usersA['Levadopa'] == 0) & (usersA['Impact'] == 1))]
print(len(usersA))
usersA = usersA.reset_index(drop = True)
usersA.head()


217
87


Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,Sided,Tremors,UPDRS
0,0QAZFRHQHW,1959.0,0,,1,0,0,0,0,0,0,0,0
1,1HOEBIGASW,1944.0,0,,0,0,0,0,0,0,0,0,0
2,1XNJCXS3EY,1936.0,0,,0,0,0,0,0,0,0,0,0
3,3DIXPRIOSW,1936.0,0,,0,0,0,0,0,0,0,0,0
4,48DZPAJ5NS,1950.0,0,2010.0,0,1,0,0,1,1,0,0,0


In [8]:
from datetime import datetime

In [9]:
# Function creating proper data frame for a list of data files that in theory all correspond to same person
# Discard any times less than 0 or larger than 5 seconds
def extract_tappy_df(tappy_files):
    filesdf = [pd.read_csv('ArchivedData/' + file, delimiter = '\t', index_col = False, names = ['UserKey', 'Date', 'Timestamp', 'Hand', 'Hold time', 'Direction', 'Latency time', 'Flight time']) for file in tappy_files]
    files = filesdf
    # Transform data frame to be more usable, and get rid of bad rows
    for i in range(len(filesdf)):
        file = filesdf[i]
        file = file[(file['Hand'] == 'L') | (file['Hand'] == 'R')]
        file = file[(file['Direction'] == 'LL') | (file['Direction'] == 'LR') | (file['Direction'] == 'RL') | (file['Direction'] == 'RR')]
        date = file['Date']
        hold = file['Hold time']
        latency = file['Latency time']
        flight = file['Flight time']
        file['Date'] = pd.to_datetime(date, errors='coerce', format='%y%M%d').dt.date
        for column in ['Hold time', 'Latency time', 'Flight time']:
            file[column] = pd.to_numeric(file[column], errors = 'coerce')
        file = file.dropna(axis = 0)
        file = file[(file['Hold time'] > 0) & (file['Flight time'] > 0) & (file['Latency time'] > 0) & (file['Hold time'] < 9000) & (file['Flight time'] < 9000) & (file['Latency time'] < 9000)]
        #file = file[(file['Hold time'] > 0) & (file['Flight time'] > 0) & (file['Latency time'] > 0)]
        del file['Flight time']
        files[i] = file
    df = pd.concat(files)
    return df

In [11]:
# Function taking tappy data frame and extracting relevant features 
# Note those featurs are in flux
def extract_features(tappy_df):
    direction_group_mean = tappy_df.groupby('Direction').mean()
    direction_group_mean = direction_group_mean.reindex(['LL', 'LR', 'RL', 'RR'])
    direction_group_mean = direction_group_mean.sort_index()
    data_dir_mean = (direction_group_mean.values.flatten())
    direction_group_std = tappy_df.groupby('Direction').std()
    direction_group_std = direction_group_std.reindex(['LL', 'LR', 'RL', 'RR'])
    direction_group_std = direction_group_std.sort_index()
    data_dir_std = (direction_group_std.values.flatten())
    hand_group_mean = tappy_df.groupby('Hand').mean()
    hand_group_mean = hand_group_mean.reindex(['L', 'R'])
    hand_group_mean = hand_group_mean.sort_index()
    data_hand_mean = (hand_group_mean.values.flatten())
    hand_group_std = tappy_df.groupby('Hand').mean()
    hand_group_std = hand_group_mean.reindex(['L', 'R'])
    hand_group_std = hand_group_mean.sort_index()
    data_hand_std = (hand_group_mean.values.flatten())
    LR_hand_hold = data_hand_mean[0] - data_hand_mean[1]
    LL_RR_latency = data_dir_mean[1] - data_dir_mean[7]
    LR_RL_latency = data_dir_mean[3] - data_dir_mean[5]
    extra = np.array([LR_hand_hold, LL_RR_latency, LR_RL_latency])
    data = np.concatenate((data_dir_mean, data_dir_std, data_hand_mean, data_hand_std, extra), axis = 0)
    return data
    
    

In [12]:
# Function which given a user, returns an array of files associated to user
def get_user(user):
    filenames = os.listdir('ArchivedData')
    user_data = np.array([])
    for filename in filenames:
        if user in filename:
            user_data = np.append([filename], user_data)
    return user_data

In [13]:
# Various Cutoffs for user data
user_names = usersA['UserID']
LOW_CUTOFF = 500
MED_CUTOFF = 1000
HIGH_CUTOFF = 2000
for user in user_names: 
    files = get_user(user)
    tappy_data = extract_tappy_df(files)
    length = len(tappy_data)
    usersA['Length'] = length
usersB = usersA.copy()
usersC = usersA.copy()
usersD = usersA.copy()
usersB = usersB[usersB['Length'] > LOW_CUTOFF]
usersC = usersC[usersC['Length'] > MED_CUTOFF]
usersD = usersD[usersD['Length'] > HIGH_CUTOFF]
print(len(usersA))
print(len(usersB))
print(len(usersC))
print(len(usersD))
print(usersA['Length'].min())

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


87
87
87
87
28759


In [13]:
# Testing Code
file_name1 = '0EA27ICBLF_1607.txt'
file_name2 = '0EA27ICBLF_1608.txt'
tappy_data = extract_tappy_df([file_name1, file_name2])
tappy_info = extract_features(tappy_data)
print(tappy_data.dtypes)
print(extract_features(tappy_data))
print(type(tappy_info))
filenames = os.listdir('ArchivedData')
print(type(filenames))
print(len(filenames))
user = users['UserID'][0]
print(get_user(user))
print(len(tappy_data))

  exec(code_obj, self.user_global_ns, self.user_ns)


UserKey          object
Date             object
Timestamp        object
Hand             object
Hold time       float64
Direction        object
Latency time    float64
dtype: object
[  82.16903976  263.5863773    81.94680999  277.61054092   74.67114757
  416.85633053   79.29456295  273.8646238    19.1815824    97.92265875
   15.57313984   95.48794685   13.21831533   60.92484493   26.70117819
  117.42383615   79.43600301  319.45440058   79.69395102  274.42870196
   79.43600301  319.45440058   79.69395102  274.42870196 -240.01839757
  -10.2782465  -139.24578961]
<class 'numpy.ndarray'>
<class 'list'>
621
['0EA27ICBLF_1607.txt' '0EA27ICBLF_1608.txt']
102182


In [14]:
# Testing Code
user = users['UserID'][0]
files = get_user(user)
print(files)
tappy_data = extract_tappy_df(files)
print(tappy_data.head(10))
tappy_info = extract_features(tappy_data)
print(tappy_info)
print(len(tappy_info))

['0EA27ICBLF_1607.txt' '0EA27ICBLF_1608.txt']


  exec(code_obj, self.user_global_ns, self.user_ns)


       UserKey        Date     Timestamp Hand  Hold time Direction  \
0   0EA27ICBLF  2016-01-22  18:41:04.336    L      101.6        LL   
1   0EA27ICBLF  2016-01-22  18:42:14.070    L       85.9        LL   
2   0EA27ICBLF  2016-01-22  18:42:14.273    L       78.1        LL   
3   0EA27ICBLF  2016-01-22  18:42:14.617    L       62.5        LL   
6   0EA27ICBLF  2016-01-22  18:42:15.969    R       85.9        LR   
7   0EA27ICBLF  2016-01-22  18:42:16.875    R       85.9        RR   
8   0EA27ICBLF  2016-01-22  18:42:17.289    L       70.3        RL   
9   0EA27ICBLF  2016-01-22  18:42:17.727    L      101.6        LL   
12  0EA27ICBLF  2016-01-22  18:42:19.172    L       62.5        LL   
13  0EA27ICBLF  2016-01-22  18:42:20.156    L       70.3        LL   

    Latency time  
0          234.4  
1          437.5  
2          210.9  
3          359.4  
6          195.3  
7          359.4  
8          429.7  
9          406.3  
12         406.3  
13         289.1  
[  82.16903976  263.

In [14]:
# Get data for all users with data
user_names = usersA['UserID']
NUM_FEATURES = 27
NUM_USERS = len(user_names)
new_data = np.zeros((NUM_USERS, NUM_FEATURES))
i = 0
for user in user_names: 
    files = get_user(user)
    tappy_data = extract_tappy_df(files)
    tappy_info = extract_features(tappy_data)
    new_data[i] = tappy_info
    i = i + 1

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
print(user_names)
print(usersA['UserID'])
print(len(usersA))

0     0QAZFRHQHW
1     1HOEBIGASW
2     1XNJCXS3EY
3     3DIXPRIOSW
4     48DZPAJ5NS
         ...    
82    YIA9DW5AGQ
83    YQSGN9BMVK
84    YWMIQIQND3
85    YYPKGX6B24
86    Z2UPVHHGBE
Name: UserID, Length: 87, dtype: object
0     0QAZFRHQHW
1     1HOEBIGASW
2     1XNJCXS3EY
3     3DIXPRIOSW
4     48DZPAJ5NS
         ...    
82    YIA9DW5AGQ
83    YQSGN9BMVK
84    YWMIQIQND3
85    YYPKGX6B24
86    Z2UPVHHGBE
Name: UserID, Length: 87, dtype: object
87


In [24]:
# Data frame holding all the data
columns = ['LLHold', 'LLLatency','LRHold', 'LRLatency', 'RLHold', 'RLLatency', 'RRHold', 'RRLatency', 
           'LLHoldStd', 'LLLatencyStd','LRHoldStd', 'LRLatencyStd', 'RLHoldStd', 'RLLatencyStd', 'RRHoldStd', 'RRLatencyStd', 
           'LHold', 'LLatency', 'RHold', 'RLatency', 'LHoldStd', 'LLatencyStd', 'RHoldStd', 'RLatencyStd',
           'LRHold_Diff', 'LLRRLatency_Diff', 'LRRLLatency_Diff']
user_info = pd.DataFrame(new_data, columns=columns)
dataAll = pd.concat([usersA, user_info], axis=1)
dataAll.head(30)
#dataAll.iloc[0]

Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,...,LLatency,RHold,RLatency,LHoldStd,LLatencyStd,RHoldStd,RLatencyStd,LRHold_Diff,LLRRLatency_Diff,LRRLLatency_Diff
0,0QAZFRHQHW,1959.0,0,,1,0,0,0,0,0,...,416.769161,101.628974,388.125956,99.04781,416.769161,101.628974,388.125956,-317.72135,40.979771,-18.540793
1,1HOEBIGASW,1944.0,0,,0,0,0,0,0,0,...,456.151613,65.124138,479.8,66.280645,456.151613,65.124138,479.8,-389.870968,-4.588235,64.02619
2,1XNJCXS3EY,1936.0,0,,0,0,0,0,0,0,...,330.702278,105.731801,317.509004,152.804557,330.702278,105.731801,317.509004,-177.897722,25.711714,2.742036
3,3DIXPRIOSW,1936.0,0,,0,0,0,0,0,0,...,520.973945,166.612462,521.840426,147.707715,520.973945,166.612462,521.840426,-373.26623,34.890816,74.204668
4,48DZPAJ5NS,1950.0,0,2010.0,0,1,0,0,1,1,...,308.788745,126.101694,333.971404,124.899646,308.788745,126.101694,333.971404,-183.8891,-32.29788,14.376781
5,4XPHKKBXS6,,0,,0,0,0,0,0,0,...,548.636897,142.481709,525.671664,185.129769,548.636897,142.481709,525.671664,-363.507128,-4.639493,-75.820623
6,5PQVTWULAC,1943.0,1,2009.0,1,1,0,0,1,1,...,395.264856,114.319701,436.950672,133.727213,395.264856,114.319701,436.950672,-261.537643,-24.794478,24.435956
7,5USOYSDCXB,1941.0,0,2017.0,0,1,0,0,0,1,...,162.411699,74.382339,158.083945,72.752517,162.411699,74.382339,158.083945,-89.659182,1.05586,-5.769697
8,6LB9FQABZQ,1956.0,0,,1,0,0,0,0,0,...,171.297444,99.434913,186.55646,133.005183,171.297444,99.434913,186.55646,-38.292261,2.387666,36.452778
9,81NEUZEBXI,,0,,0,0,0,0,0,0,...,290.437371,129.685747,315.37432,131.231446,290.437371,129.685747,315.37432,-159.205925,-10.011062,42.836106


In [25]:
dataAll.columns


Index(['UserID', 'BirthYear', 'DA', 'DiagnosisYear', 'Gender', 'Impact',
       'Levadopa', 'MAOB', 'Other', 'Parkinsons', 'Sided', 'Tremors', 'UPDRS',
       'Length', 'LLHold', 'LLLatency', 'LRHold', 'LRLatency', 'RLHold',
       'RLLatency', 'RRHold', 'RRLatency', 'LLHoldStd', 'LLLatencyStd',
       'LRHoldStd', 'LRLatencyStd', 'RLHoldStd', 'RLLatencyStd', 'RRHoldStd',
       'RRLatencyStd', 'LHold', 'LLatency', 'RHold', 'RLatency', 'LHoldStd',
       'LLatencyStd', 'RHoldStd', 'RLatencyStd', 'LRHold_Diff',
       'LLRRLatency_Diff', 'LRRLLatency_Diff'],
      dtype='object')

In [26]:
dataAll.to_csv(r'ConvertedData.csv', index = False)

In [27]:
len(dataAll)

87

In [28]:
len(dataAll.index)

87