In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw activity recordings into a DataFrame and display it.
raw_csv_path = "pd_hoa_activities.csv"
df = pd.read_csv(raw_csv_path)
df

Unnamed: 0,pid,task,duration,age,class
0,0,1,146,72,HOA
1,0,2,210,72,HOA
2,0,3,241,72,HOA
3,0,4,328,72,HOA
4,0,5,229,72,HOA
...,...,...,...,...,...
670,74,5,235,78,PD
671,74,6,41,78,PD
672,74,7,11,78,PD
673,74,8,9,78,PD


In [3]:
# Replace the "?" placeholder with NaN so that pandas treats those cells as
# missing values. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = df.replace("?", np.nan)

In [4]:
# Tally how many rows carry each distinct value of the "class" column.
class_counts = df["class"].value_counts()
print(class_counts)

HOA            396
PD             189
hoa             27
healthy         27
parkinson's      9
Parkinson's      9
pd               9
Parkinson        9
Name: class, dtype: int64


In [5]:
# Clean class
def clean_class(df):
    """Normalise the free-text labels in ``df["class"]`` to "HOA" or "PD".

    Matching is case-insensitive and substring based: a label containing
    "hoa" or "healthy" becomes "HOA"; otherwise a label containing "pd" or
    "parkinson" becomes "PD". Any other value (including missing values) is
    left untouched instead of being silently relabelled as "PD".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "class" column. The column is overwritten in place.

    Returns
    -------
    None
    """
    # Compare on a lower-cased string view so non-string entries are handled too.
    labels = df["class"].astype(str).str.lower()

    is_hoa = (
        labels.str.contains("hoa", regex=False, na=False)
        | labels.str.contains("healthy", regex=False, na=False)
    )
    # "parkinson" also covers "parkinson's"; HOA matches take precedence over
    # PD matches.
    is_pd = ~is_hoa & (
        labels.str.contains("pd", regex=False, na=False)
        | labels.str.contains("parkinson", regex=False, na=False)
    )

    # Plain boolean arrays avoid index alignment, so any index layout works.
    df["class"] = (
        df["class"]
        .mask(is_hoa.to_numpy(), "HOA")
        .mask(is_pd.to_numpy(), "PD")
    )
    
# Run the label clean-up on df; clean_class has no return value and assigns
# the cleaned labels back to df["class"].
clean_class(df)

In [6]:
# Show how many rows carry each value of the "class" column.
class_counts = df["class"].value_counts()
print(class_counts)

HOA    450
PD     225
Name: class, dtype: int64


In [7]:
# Clean task
# Maps each task code found in the "task" column to its descriptive name.
# Keys are strings, so codes are matched as exact string values.
activities = {
    "1": "Water plants",
    "2": "Fill medication dispenser",
    "3": "Wash counter top",
    "4": "Sweep and dust",
    "5": "Cook",
    "6": "Wash hands",
    "7": "Perform the Timed Up and Go (TUG) test",
    "8": "Perform TUG with questions being asked",
    "dot": "A day out task",
}


def clean_activities(df):
    """Replace the task codes in ``df["task"]`` with their descriptive names.

    Values that are not keys of ``activities`` (including missing values) are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "task" column. The column is overwritten in place.

    Returns
    -------
    None
    """
    # Assign the result back to the frame instead of calling replace(...,
    # inplace=True) on the Series returned by df["task"]: with pandas
    # Copy-on-Write that Series is independent of df, so the in-place form
    # would leave df unchanged.
    df["task"] = df["task"].replace(activities)

# Called for its effect on df["task"]; clean_activities has no return value.
clean_activities(df)

In [8]:
# Discard every row that has a missing value in any column, then display the frame.
df.dropna(axis="index", how="any", inplace=True)
df

Unnamed: 0,pid,task,duration,age,class
0,0,Water plants,146,72,HOA
1,0,Fill medication dispenser,210,72,HOA
2,0,Wash counter top,241,72,HOA
3,0,Sweep and dust,328,72,HOA
4,0,Cook,229,72,HOA
...,...,...,...,...,...
670,74,Cook,235,78,PD
671,74,Wash hands,41,78,PD
672,74,Perform the Timed Up and Go (TUG) test,11,78,PD
673,74,Perform TUG with questions being asked,9,78,PD


In [9]:
# Cast duration to 64-bit integers, then list each column next to its dtype.
duration_as_int = df["duration"].astype(np.int64)
df["duration"] = duration_as_int
for column_name, column_dtype in df.dtypes.items():
    print(column_name, column_dtype)

pid int64
task object
duration int64
age int64
class object


In [10]:
# Renumber the rows 0..n-1 after the earlier dropna. drop=True discards the
# old index; without it reset_index inserts the old index as an extra "index"
# column, which would end up in the exported CSV and pile up further columns
# each time this cell is re-run.
df = df.reset_index(drop=True)

In [11]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_csv_path = "cleaned_pd_hoa_activities.csv"
df.to_csv(cleaned_csv_path, index=False)