# Import Data

In [186]:
import pandas as pd
import numpy as np

In [187]:
users = pd.read_csv("Users.csv")

In [188]:
print(users.columns)
users.head()

Index(['UserID', 'BirthYear', 'DA', 'DiagnosisYear', 'Gender', 'Impact',
       'Levadopa', 'MAOB', 'Other', 'Parkinsons', 'Sided', 'Tremors', 'UPDRS'],
      dtype='object')


Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,Sided,Tremors,UPDRS
0,User_0EA27ICBLF,1952.0,True,2000,Female,Severe,True,False,False,True,Left,True,Don't know
1,User_0QAZFRHQHW,1959.0,False,------,Female,------,False,False,False,False,,False,Don't know
2,User_0WTDIGPSBZ,1946.0,False,------,Female,------,False,False,False,False,,False,Don't know
3,User_1HOEBIGASW,1944.0,False,------,Male,------,False,False,False,False,,False,Don't know
4,User_1WMVCCU4RH,1953.0,False,2017,Male,Medium,False,False,False,True,Left,True,Don't know


# Modify DataFrame

Turn the DataFrame into more usable with all numbers. <br>
UserID: dropped first 5 digits, which are are only "User_" <br>
Parkinsons: 1 for diagnosed with Parkinsons, 0 for not <br>
DA: 1 for using, 0 for not <br>
Levadopa: 1 for using, 0 for not <br>
MAOB: 1 for using, 0 for not <br>
Other: 1 for using, 0 for not <br>
Gender: 1 for female, 0 for male <br>
Tremors: 1 for yes, 0 for no <br>
Impact: 0 - 3 for not applicable, mild, medium, severe <br>
Sided: 0 for none, 1 for left, 2 for right <br>
UPDRS: 0 for unknown, then 1-5 on scale <br>



In [189]:
conditions = [
    (users['Impact'] == ' Mild'),
    (users['Impact'] == ' Medium'),
    (users['Impact'] == ' Severe'),
    (users['Impact'] != ' Mild') & (users['Impact'] != ' Medium') & (users['Impact'] != 'Severe')
    ]
values = [1,2,3,0]

conditions1 = [
    (users['UPDRS'] == 'Don\'t know'), 
    (users['UPDRS'] == '1'),
    (users['UPDRS'] == '2'),
    (users['UPDRS'] == '3'),
    (users['UPDRS'] == '4'),
    (users['UPDRS'] == '5')
]
values1 = [0,1,2,3,4,5]

conditions2 = [
    (users['Sided'] == ' None'),
    (users['Sided'] == ' Left'),
    (users['Sided'] == ' Right'),
]
values2 = [0,1,2]


users['UserID'] = [e[5:] for e in users['UserID']]
users['Gender'] = np.where(users['Gender'] == ' Female', 1, 0)
users['Parkinsons'] = np.where(users['Parkinsons'] == ' True', 1, 0)
users['DA'] = np.where(users['DA'] == ' True', 1, 0)
users['Levadopa'] = np.where(users['Levadopa'] == ' True', 1, 0)
users['MAOB'] = np.where(users['MAOB'] == ' True', 1, 0)
users['Tremors'] = np.where(users['Tremors'] == ' True', 1, 0)
users['Impact'] = np.select(conditions, values)
users['Other'] = np.where(users['Other'] == ' True', 1, 0)
users['Sided'] = np.select(conditions2, values2)
users['UPDRS'] = np.select(conditions1, values1)
users['BirthYear'] = pd.to_numeric(users['BirthYear'], errors='coerce')
users['DiagnosisYear'] = pd.to_numeric(users['DiagnosisYear'], errors='coerce')

In [190]:
users.head(10)

Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,Sided,Tremors,UPDRS
0,0EA27ICBLF,1952.0,1,2000.0,1,3,1,0,0,1,1,1,0
1,0QAZFRHQHW,1959.0,0,,1,0,0,0,0,0,0,0,0
2,0WTDIGPSBZ,1946.0,0,,1,0,0,0,0,0,0,0,0
3,1HOEBIGASW,1944.0,0,,0,0,0,0,0,0,0,0,0
4,1WMVCCU4RH,1953.0,0,2017.0,0,2,0,0,0,1,1,1,0
5,1XNJCXS3EY,1936.0,0,,0,0,0,0,0,0,0,0,0
6,2JTCBKUP8T,1958.0,0,2013.0,0,2,1,0,0,1,2,1,0
7,2X17VCRRQA,1951.0,0,2003.0,0,3,1,0,0,1,1,1,0
8,310NXPGJPD,1961.0,0,2015.0,1,2,1,0,1,1,1,1,0
9,39KQRZCESF,1942.0,1,2013.0,0,2,1,0,0,1,2,1,0


# Data Breakdown
Here, we examine how the participants break down into the various categories.

In [191]:
parkinsons = users[users['Parkinsons'] == 1]
none = users[users['Parkinsons'] == 0]
print("There are",len(parkinsons),"observations in the parkinsons df.")
print("There are",len(none),"observations in the none df.")

There are 169 observations in the parkinsons df.
There are 58 observations in the none df.


In [192]:
da = users[users['DA'] == 1]
no_da = users.drop(da.index)
print("There are",len(da),"people who take dopamine agonist.")
print("There are",len(no_da),"people who do not (including those without parkinsons).")

There are 33 people who take dopamine agonist.
There are 194 people who do not (including those without parkinsons).


In [193]:
lev = users[users['Levadopa'] == 1]
no_lev = users[users['Levadopa'] == 0]
print("There are",len(lev),"taking levadopa.")
print("There are",len(no_lev),"people who don't take levadopa")

There are 112 taking levadopa.
There are 115 people who don't take levadopa


In [194]:
mao = users[users['MAOB'] == 1]
no_mao = users[users['MAOB'] == 0]
print("There are",len(mao),"taking MAO-B inhibitor")
print("There are",len(no_mao),"people who don't take MAO-B inhibitor")

There are 15 taking MAO-B inhibitor
There are 212 people who don't take MAO-B inhibitor


In [195]:
oth = users[users['Other'] == 1]
no_oth = users[users['Other'] == 0]
print("There are",len(oth),"taking other medication")
print("There are",len(no_oth),"people who don't take other medication")

There are 69 taking other medication
There are 158 people who don't take other medication


In [196]:
med1 = parkinsons[parkinsons['Levadopa'] == 0]
med2 = med1[med1['DA'] == 0]
med3 = med2[med2['MAOB'] == 0]
med4 = med3[med3['Other'] == 0]
print("There are",len(med4),"with PD taking no medication")

There are 22 with PD taking no medication


# Start Working with Tappy Files 
First, we find how many users we have that also have tappy data. Then, we isolate some parameters from the tappy data for each person.

In [197]:
import os

In [198]:
# Get people in the User Data
people_users = users['UserID'].tolist()

# Get people in Tappy Data
tappy_files = os.listdir('ArchivedData')
tappy_users = tappy_files
i = 0
for file in tappy_files:
    tappy_users[i] = file[:10]
    i = i + 1

# Isolate the users we want and make new Data Frame
all_users = set(people_users).intersection(tappy_users)
print(len(all_users))

usersA = users[users['UserID'].isin(all_users)]
print(len(usersA))


217
217


In [199]:
from datetime import datetime

In [200]:
# Function creating proper data frame for a list of data files that in theory all correspond to same person
def extract_tappy_df(tappy_files):
    filesdf = [pd.read_csv('ArchivedData/' + file, delimiter = '\t', index_col = False, names = ['UserKey', 'Date', 'Timestamp', 'Hand', 'Hold time', 'Direction', 'Latency time', 'Flight time']) for file in tappy_files]
    files = filesdf
    # Transform data frame to be more usable, and get rid of bad rows
    for i in range(len(filesdf)):
        file = filesdf[i]
        file = file[(file['Hand'] == 'L') | (file['Hand'] == 'R') | (file['Hand'] == 'S')]
        file = file[(file['Direction'] == 'LL') | (file['Direction'] == 'LR') | (file['Direction'] == 'LS') |(file['Direction'] == 'RL') | (file['Direction'] == 'RR') | (file['Direction'] == 'RS') | (file['Direction'] == 'SL') | (file['Direction'] == 'SR') | (file['Direction'] == 'SS')]
        date = file['Date']
        hold = file['Hold time']
        latency = file['Latency time']
        flight = file['Flight time']
        file['Date'] = pd.to_datetime(date, errors='coerce', format='%y%M%d').dt.date
        file['Hold time'] = pd.to_numeric(hold, errors = 'coerce')
        file['Latency time'] = pd.to_numeric(latency, errors = 'coerce')
        file['Flight time'] = pd.to_numeric(flight, errors = 'coerce')
        files[i] = file
    df = pd.concat(files)
    return df

In [201]:
# Function taking tappy data frame and extracting relevant features 
# Note those featurs are in flux
def extract_features(tappy_df):
    hold = tappy_df['Hold time'].mean()
    latency = tappy_df['Latency time'].mean()
    flight = tappy_df['Flight time'].mean()
    direction_group = tappy_df.groupby('Direction').mean()
    direction_group = direction_group.reindex(['LL', 'LR', 'LS', 'RL', 'RR', 'RS', 'SL', 'SR', 'SS'])
    direction_group = direction_group.sort_index()
    data = (direction_group.values.flatten()).tolist()
    data.append(flight)
    data.append(latency)
    data.append(hold)
    return data
    
    

In [202]:
# Function which given a user, returns an array of files associated to user
def get_user(user):
    filenames = os.listdir('ArchivedData')
    user_data = np.array([])
    for filename in filenames:
        if user in filename:
            user_data = np.append([filename], user_data)
    return user_data

In [203]:
# Testing Code
file_name1 = '0EA27ICBLF_1607.txt'
file_name2 = '0EA27ICBLF_1608.txt'
tappy_data = extract_tappy_df([file_name1, file_name2])
tappy_info = extract_features(tappy_data)
print(tappy_data.dtypes)
print(extract_features(tappy_data))
print(type(tappy_info))
filenames = os.listdir('ArchivedData')
print(type(filenames))
print(len(filenames))
user = users['UserID'][0]
print(get_user(user))

  exec(code_obj, self.user_global_ns, self.user_ns)


UserKey          object
Date             object
Timestamp        object
Hand             object
Hold time       float64
Direction        object
Latency time    float64
Flight time     float64
dtype: object
[82.16961030642663, 267.5391156546443, 188.00043601427677, 81.94680998612874, 277.610540915398, 210.13678918169342, 113.96206896551627, 205.83332805446813, 128.49275505759658, 74.6712948249994, 416.85856130555703, 340.39779677030083, 79.29060099551167, 323.1659620229952, 204.18026801080228, 115.77737321196405, 234.72046967030928, 151.6053086514227, 69.82867977321429, 358.0364921581524, 233.15363770250107, 75.663629666011, 344.55260314342047, 233.54204322200496, 84.62584269662914, 146.22846441947564, 76.36029962546813, 206.88810924428103, 298.26319984847885, 87.0884642814558]
<class 'list'>
<class 'list'>
621
['0EA27ICBLF_1607.txt' '0EA27ICBLF_1608.txt']


In [204]:
# Testing Code
user = users['UserID'][0]
files = get_user(user)
print(files)
tappy_data = extract_tappy_df(files)
print(tappy_data.head(10))
tappy_info = extract_features(tappy_data)
print(tappy_info)
print(len(tappy_info))

['0EA27ICBLF_1607.txt' '0EA27ICBLF_1608.txt']
      UserKey        Date     Timestamp Hand  Hold time Direction  \
0  0EA27ICBLF  2016-01-22  18:41:04.336    L      101.6        LL   
1  0EA27ICBLF  2016-01-22  18:42:14.070    L       85.9        LL   
2  0EA27ICBLF  2016-01-22  18:42:14.273    L       78.1        LL   
3  0EA27ICBLF  2016-01-22  18:42:14.617    L       62.5        LL   
4  0EA27ICBLF  2016-01-22  18:42:15.586    S      125.0        LS   
5  0EA27ICBLF  2016-01-22  18:42:15.766    L       78.1        SL   
6  0EA27ICBLF  2016-01-22  18:42:15.969    R       85.9        LR   
7  0EA27ICBLF  2016-01-22  18:42:16.875    R       85.9        RR   
8  0EA27ICBLF  2016-01-22  18:42:17.289    L       70.3        RL   
9  0EA27ICBLF  2016-01-22  18:42:17.727    L      101.6        LL   

   Latency time  Flight time  
0         234.4        156.3  
1         437.5        359.4  
2         210.9        125.0  
3         359.4        281.3  
4         187.5         93.8  
5       

In [209]:
# Get data for all users with data
user_names = usersA['UserID']
NUM_FEATURES = 30
NUM_USERS = len(user_names)
new_data = np.zeros((NUM_USERS, NUM_FEATURES))
i = 0
for user in user_names: 
    files = get_user(user)
    tappy_data = extract_tappy_df(files)
    tappy_info = extract_features(tappy_data)
    new_data[i] = tappy_info
    i = i + 1

[[ 82.16961031 267.53911565 188.00043601 ... 206.88810924 298.26319985
   87.08846428]
 [ 97.65414013 406.71624204 309.67515924 ... 304.99617691 405.92413793
  102.87368816]
 [ 64.45294118 390.05882353 319.51176471 ... 396.24444444 463.91746032
   65.56825397]
 ...
 [109.72922327 353.76224312 241.76607454 ... 248.41788977 358.17113825
  111.24684389]
 [ 80.44666667 288.29333333 204.96666667 ... 246.69807692 337.6
   88.70961538]
 [101.82541322 314.88592133 545.18966942 ... 484.98432    329.00080128
  101.90512   ]]


In [232]:
# Data frame holding all the data
columns = ['LLHold', 'LLLatency', 'LLHold', 'LRHold', 'LRLatency', 'LRHold', 'LSHold', 'LSLatency', 'LSHold',
           'RLHold', 'RLLatency', 'RLHold', 'RRHold', 'RRLatency', 'RRHold', 'RSHold', 'RSLatency', 'RSHold',
           'SLHold', 'SLLatency', 'SLHold', 'SRHold', 'SRLatency', 'SRHold', 'SSHold', 'SSLatency', 'SSHold',
        'flightAvg', 'latencyAvg', 'holdAvg']
user_info = pd.DataFrame(new_data, columns=columns)

data_All = pd.concat([usersA, user_info], axis=1, join='inner')
data_All.head(10)

Unnamed: 0,UserID,BirthYear,DA,DiagnosisYear,Gender,Impact,Levadopa,MAOB,Other,Parkinsons,...,SLHold,SRHold,SRLatency,SRHold.1,SSHold,SSLatency,SSHold.1,flightAvg,latencyAvg,holdAvg
0,0EA27ICBLF,1952.0,1,2000.0,1,3,1,0,0,1,...,233.153638,75.66363,344.552603,233.542043,84.625843,146.228464,76.3603,206.888109,298.2632,87.088464
1,0QAZFRHQHW,1959.0,0,,1,0,0,0,0,0,...,333.9,101.220455,536.85,438.259091,59.507692,185.415385,119.307692,304.996177,405.924138,102.873688
3,1HOEBIGASW,1944.0,0,,0,0,0,0,0,0,...,237.620968,103.780645,355.1,269.406452,69.8875,247.925,161.9875,208.651721,325.569638,123.793469
5,1XNJCXS3EY,1936.0,0,,0,0,0,0,0,0,...,375.0,,,,230.5,263.7,48.8,326.358964,406.111687,90.442364
6,2JTCBKUP8T,1958.0,0,2013.0,0,2,1,0,0,1,...,375.726115,146.145679,567.071605,381.348148,267.55,328.1,185.55,372.102157,528.300888,157.756762
7,2X17VCRRQA,1951.0,0,2003.0,0,3,1,0,0,1,...,312.344357,154.357043,336.742957,259.888577,143.010188,269.166488,136.553619,241.129007,315.140509,178.433283
8,310NXPGJPD,1961.0,0,2015.0,1,2,1,0,1,1,...,404.966667,202.125,489.275,428.7,70.690909,340.209091,167.972727,362.335071,521.718863,153.256493
9,39KQRZCESF,1942.0,1,2013.0,0,2,1,0,0,1,...,236.784527,132.159082,461.76088,310.991864,133.906047,283.64186,154.672558,249.24067,391.230062,154.359388
10,3DIXPRIOSW,1936.0,0,,0,0,0,0,0,0,...,187.152398,125.322878,381.143542,248.288192,135.639048,250.711429,122.004762,190.283476,316.385886,132.308581
12,3MZWDTW7CC,,0,,1,0,0,0,0,1,...,457.404651,132.926531,632.536735,481.255102,142.6,251.95,140.65,356.03349,528.370549,159.590161
