# Import Dataset

In [None]:
!wget -r -N -c -np https://physionet.org/files/tappy/1.0.0/
from zipfile import ZipFile

# Locations of the two archives inside the directory tree created by the
# recursive wget download.
archived_data = './physionet.org/files/tappy/1.0.0/Archived-Data.zip'
archievd_users = './physionet.org/files/tappy/1.0.0/Archived-Users.zip'

# Unpack both archives into the current working directory.  The handle is
# deliberately not named `zip`: at notebook scope that name would stay bound
# after the `with` block and hide the builtin zip() in every later cell.
for archive_path in (archived_data, archievd_users):
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall()

# Import packages

In [None]:
import os
import re
import numpy as np
import pandas as pd
from functools import partial
from pandas.io.common import EmptyDataError
from scipy.stats import skew, kurtosis
from zipfile import ZipFile

# Read data

## Read users

### Get user file list

In [None]:
# Directory holding the user description files (trailing slash required:
# the readers build paths with plain string concatenation).
user_root = "./Archived users/"
# os.listdir() returns names in arbitrary order; sort them so that everything
# derived from this list has a reproducible row order.
user_fn_list = sorted(os.listdir(user_root))

### Read user files

In [None]:
def read_one_file(fn, root):
    """Parse one user description file into a flat dictionary.

    The file is expected to hold one ``key: value`` pair per line.  Every key
    is mapped to its whitespace-stripped value, and an ``'ID'`` entry is added
    from the file name (the word characters between an underscore and a
    following dot).

    Args:
        fn: File name, relative to ``root``.
        root: Directory prefix.  It is concatenated with ``fn`` as-is, so it
            must end with a path separator.

    Returns:
        dict: ``{'ID': ..., key: value, ...}`` with string values.

    Raises:
        IndexError: If ``fn`` does not contain the ``_<id>.`` pattern.
        ValueError: If a non-blank line has no ``": "`` separator.
    """
    # The ID depends only on the file name, so resolve it once instead of on
    # every line; this also makes it available for files without any lines.
    user_id = re.findall(r'_(\w+)\.', fn)[0]
    out = {'ID': user_id}
    with open(root + fn) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            # Split on the first separator only, so values may contain ": ".
            k, sep, v = line.partition(": ")
            if not sep:
                raise ValueError("%s: expected 'key: value', got %r" % (fn, line))
            out[k] = v.strip()
    # The ID taken from the file name wins over any 'ID' line in the file.
    out['ID'] = user_id
    return out

In [None]:
# Parse every listed user file; the result holds one dict per file.
users_list = [read_one_file(fn, root=user_root) for fn in user_fn_list]

In [None]:
# One row per parsed user file, one column per key found in those files.
users = pd.DataFrame(users_list)
# Both placeholder spellings stand for "no value": map them to NaN.
for placeholder in ('------', ''):
    users.replace(placeholder, np.nan, inplace=True)
# Turn the textual flags into booleans; anything other than the exact string
# 'True' (including NaN) becomes False.
for flag in ('Levadopa', 'MAOB', 'Parkinsons', 'Tremors', 'Other'):
    users[flag] = users[flag] == 'True'

## Read keys

### Get key file list

In [None]:
# Directory holding the keystroke files (trailing slash required: the
# readers build paths with plain string concatenation).
keys_root = "./Tappy Data/"
# os.listdir() returns names in arbitrary order; sort them so the row order
# of the concatenated keystroke table is reproducible.
keys_fn_list = sorted(os.listdir(keys_root))

### Read key files

In [None]:
# Column names of a keystroke file, in file order.
_KEY_COLUMNS = ['ID', 'Date', 'TS', 'Hand', 'HoldTime', 'Direction', 'LatencyTime', 'FlightTime']
# Per-position dtypes used by the strict (fast) read.
_KEY_DTYPES = {0: 'str', 1: 'str', 2: 'str', 3: 'str', 4: 'float', 5: 'str', 6: 'float', 7: 'float'}
_VALID_HANDS = ['L', 'R', 'S']
# Every ordered pair of hand codes: LL, LR, LS, RL, RR, RS, SL, SR, SS.
_VALID_DIRECTIONS = [first + second for first in _VALID_HANDS for second in _VALID_HANDS]
# Digits with at most one dot and at least one digit, i.e. exactly the
# "digits and dots only" strings that float() can parse.
_PLAIN_NUMBER = re.compile(r'\d+\.?\d*|\.\d+')


def _read_key_table(path, dtype):
    """Read the first eight tab-separated columns of ``path``, skipping malformed lines."""
    options = dict(delimiter='\t', header=None, usecols=range(8),
                   low_memory=False, dtype=dtype)
    try:
        # Spelling understood by pandas >= 1.3: warn about bad lines, skip them.
        return pd.read_csv(path, on_bad_lines='warn', **options)
    except TypeError:
        # Older pandas rejects the unknown keyword; use the spelling it knows.
        return pd.read_csv(path, error_bad_lines=False, **options)


def _row_mask(series, predicate):
    """Return a boolean NumPy array holding ``predicate(value)`` for each value."""
    return np.fromiter((bool(predicate(value)) for value in series),
                       dtype=bool, count=len(series))


def _read_key_file_lenient(path):
    """Read ``path`` as text, keep only well-formed rows, then convert the timings."""
    try:
        raw = _read_key_table(path, str)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=list(_KEY_COLUMNS))
    raw.columns = list(_KEY_COLUMNS)

    def is_plain_number(value):
        return _PLAIN_NUMBER.fullmatch(str(value)) is not None

    keep = (_row_mask(raw['ID'], lambda value: len(str(value)) == 10)
            & _row_mask(raw['Date'], lambda value: len(str(value)) == 6)
            & _row_mask(raw['TS'], lambda value: len(str(value)) == 12)
            & raw['Hand'].isin(_VALID_HANDS).to_numpy()
            & _row_mask(raw['HoldTime'], is_plain_number)
            & raw['Direction'].isin(_VALID_DIRECTIONS).to_numpy()
            & _row_mask(raw['LatencyTime'], is_plain_number)
            & _row_mask(raw['FlightTime'], is_plain_number))
    # Copy so the conversions below write to an independent frame rather
    # than to a slice of `raw`.
    df = raw[keep].copy()
    # Each timing column is converted from its OWN text values.
    for column in ('HoldTime', 'LatencyTime', 'FlightTime'):
        df[column] = df[column].astype(float)
    return df


def read_one_key_file(fn, root):
    """Load one tab-separated keystroke file into a DataFrame.

    A strict read with fixed dtypes is tried first.  If that fails with a
    ``ValueError`` (for example a non-numeric value in a timing column) the
    file is re-read as text and only rows passing these checks are kept:
    ``ID`` is 10 characters, ``Date`` 6 and ``TS`` 12; ``Hand`` is one of
    L/R/S; ``Direction`` is a pair of those codes; and the three timing
    fields consist of digits with at most one dot.  The surviving rows keep
    their original row labels.

    Args:
        fn: File name, relative to ``root``.
        root: Directory prefix.  It is concatenated with ``fn`` as-is, so it
            must end with a path separator.

    Returns:
        pandas.DataFrame: Columns ``ID``, ``Date``, ``TS``, ``Hand``,
        ``HoldTime``, ``Direction``, ``LatencyTime``, ``FlightTime``; the
        three timing columns are floats.  An empty file yields an empty
        frame with these columns.
    """
    path = root + fn
    try:
        df = _read_key_table(path, _KEY_DTYPES)
    except pd.errors.EmptyDataError:
        # EmptyDataError subclasses ValueError, so it has to be handled first;
        # otherwise an empty file would take the fallback path below.
        return pd.DataFrame(columns=list(_KEY_COLUMNS))
    except ValueError:
        return _read_key_file_lenient(path)
    df.columns = list(_KEY_COLUMNS)
    return df

In [None]:
# Load every listed keystroke file; the result holds one DataFrame per file.
keys_list = [read_one_key_file(fn, root=keys_root) for fn in keys_fn_list]

In [None]:
# Stack the per-file frames row-wise into one table and renumber the rows.
keys = pd.concat(objs=keys_list, axis=0, ignore_index=True)

# Visualize data

## Visualize users

In [None]:
# Preview the first five user rows.
users.head(n=5)

## Visualize keys

In [None]:
# Preview the first five keystroke rows.
keys.head(n=5)

# Filter data

## Filter users

In [None]:
# Users with at least 2000 rows in `keys`.  The comparison has to be applied
# as a filter: the index of the boolean result itself lists every ID,
# whether or not it reaches the threshold.
rows_per_user = keys.groupby('ID').size()
user_w_sufficient_data = set(rows_per_user[rows_per_user >= 2000].index)
# Eligible users are not on Levadopa and are either flagged as not having
# Parkinson's, or flagged as having it with an 'Impact' of 'Mild'.
has_parkinsons = users['Parkinsons']
is_eligible = (((has_parkinsons & (users['Impact'] == 'Mild')) | ~has_parkinsons)
               & ~users['Levadopa'])
user_eligible = set(users.loc[is_eligible, 'ID'])
valid_users = user_w_sufficient_data.intersection(user_eligible)

## Filter keys

In [None]:
# Keep the rows of valid users whose hold and latency times both lie strictly
# between 0 and 2000.  Series.isin replaces np.in1d, which NumPy 2.0
# deprecated, and takes the set directly.
valid_keys = keys[(keys['HoldTime'] > 0)
                  & (keys['HoldTime'] < 2000)
                  & (keys['LatencyTime'] > 0)
                  & (keys['LatencyTime'] < 2000)
                  & keys['ID'].isin(valid_users)]

# Process data

## Process by hold time

In [None]:
# Hold-time statistics per (user, hand), ignoring rows whose hand is 'S'.
# 'mean' and 'std' are passed by name: pandas has been substituting its own
# implementations when handed np.mean / np.std, and that substitution is
# deprecated, so the names keep today's results (including the sample
# standard deviation) and the same column labels.
hold_by_user = (
    valid_keys[valid_keys['Hand'] != 'S']
    .groupby(['ID', 'Hand'])['HoldTime']
    .agg(['mean', 'std', skew, kurtosis])
)

In [None]:
# Move the 'Hand' index level into the columns, giving one row per user.
hold_by_user_flat = hold_by_user.unstack()
# Collapse each (statistic, hand) column pair into one "<statistic>_<hand>" label.
hold_by_user_flat.columns = list(map(lambda parts: '_'.join(parts).strip(),
                                     hold_by_user_flat.columns.values))
# Left-minus-right difference of the mean hold time.
hold_by_user_flat['mean_hold_diff'] = hold_by_user_flat['mean_L'].sub(hold_by_user_flat['mean_R'])

## Process by latency time

In [None]:
# Latency statistics per (user, direction), restricted to the four
# directions LL, LR, RL and RR.  Series.isin replaces the deprecated
# np.in1d.  'mean' and 'std' are passed by name because pandas' substitution
# of its own implementations for np.mean / np.std is deprecated; the names
# keep today's results and the same column labels.
latency_by_user = (
    valid_keys[valid_keys['Direction'].isin(['LL', 'LR', 'RL', 'RR'])]
    .groupby(['ID', 'Direction'])['LatencyTime']
    .agg(['mean', 'std', skew, kurtosis])
)

In [None]:
# Move the 'Direction' index level into the columns, giving one row per user.
latency_by_user_flat = latency_by_user.unstack()
# Collapse each (statistic, direction) pair into one "<statistic>_<direction>" label.
latency_by_user_flat.columns = list(map(lambda parts: '_'.join(parts).strip(),
                                        latency_by_user_flat.columns.values))
# Differences of mean latency: between the two cross directions, then
# between the two same-letter directions.
latency_by_user_flat['mean_LR_RL_diff'] = latency_by_user_flat['mean_LR'].sub(latency_by_user_flat['mean_RL'])
latency_by_user_flat['mean_LL_RR_diff'] = latency_by_user_flat['mean_LL'].sub(latency_by_user_flat['mean_RR'])

## Gather full processed data 

In [None]:
# Place the hold-time and latency features side by side, aligned on the row index.
combined = pd.concat(objs=[hold_by_user_flat, latency_by_user_flat], axis='columns')

In [None]:
# Attach the 'Parkinsons' flag to each user's features, index the result by
# 'ID', and drop every row that still contains a missing value.
full_set = (
    combined.reset_index()
    .merge(users[['ID', 'Parkinsons']], on='ID')
    .set_index('ID')
    .dropna()
)

# Export processed data

In [None]:
# Preview the first five rows of the final table.
full_set.head(n=5)

In [None]:
# Write the final table, including its 'ID' index, to the working directory.
full_set.to_csv(path_or_buf='output.csv')