# Cleaning Data

### Purpose/Driving Question

In this notebook we use the reading functions developed in the readingData notebook to create a cleaned dataset for each of the participants and users in our data. This means we can simply read the cleaned versions each time, rather than repeating the preprocessing in every notebook.

### Importing Packages and Defining Functions

In [1]:
import pandas as pd
import xmltodict

All of the following functions were originally defined in the readingData notebook; we reproduce them here.

In [2]:
def read_Pacer_data(filename):
    """Read a Pacer step export and return step totals per hour.

    Parameters
    ----------
    filename : str or path-like
        CSV file containing at least a "date" column (formatted like
        '%m/%d/%Y, %H:%M:%S %z') and a "steps" column.

    Returns
    -------
    pandas.DataFrame
        One row per (Date, Hour) pair present in the file, with flat
        columns "Date", "Hour" and "Steps" (steps summed within the hour).
    """
    #Read in the data
    dat = pd.read_csv(filename)
    #Select necessary columns (copy so the new columns are not added to a slice)
    dat = dat[["date","steps"]].copy()
    #Extract datetime data
    dat["datetime"] = pd.to_datetime(dat["date"], format = '%m/%d/%Y, %H:%M:%S %z')
    dat["Date"] = dat["datetime"].dt.date
    dat["Hour"] = dat["datetime"].dt.hour
    #Aggregate over the hours
    dat = dat.groupby(["Date","Hour"])["steps"].agg("sum").reset_index()
    #Relabel columns with a flat list; a nested list would create a MultiIndex
    dat.columns = ["Date", "Hour", "Steps"]

    return dat

def read_QS_data(filename):
    """Read a QS Access style CSV and return its step counts by date and hour.

    The file must provide a "Start" column (formatted like
    '%d-%b-%Y %H:%M') and a "Steps (count)" column. The result has one row
    per input row, with columns "Date", "Hour" and "Steps".
    """
    raw = pd.read_csv(filename)
    #Parse the start of each interval once and derive both fields from it
    start_times = pd.to_datetime(raw["Start"], format = '%d-%b-%Y %H:%M')
    tidy = pd.DataFrame({
        "Date": start_times.dt.date,
        "Hour": start_times.dt.hour,
        "Steps": raw["Steps (count)"],
    })

    return tidy

def read_XML_data(filename):
    """Read an XML health export and return step totals per hour.

    Parameters
    ----------
    filename : str or path-like
        XML file whose root element is ``HealthData`` and which contains
        ``Record`` elements carrying ``startDate`` (formatted like
        '%Y-%m-%d %H:%M:%S %z') and ``value`` attributes.

    Returns
    -------
    pandas.DataFrame
        One row per hour between the first and last record, with columns
        "Date", "Hour" and "Steps" (values summed within the hour).

    Notes
    -----
    Every ``Record`` is summed regardless of its type.
    NOTE(review): presumably the file holds step-count records only -- confirm.
    """
    #Read in XML file
    with open(filename, 'r') as xml_file:
        input_data = xmltodict.parse(xml_file.read())
    #Extract record data from XML
    record_list = input_data['HealthData']['Record']
    #xmltodict returns a single dict (not a list) when there is only one Record
    if isinstance(record_list, dict):
        record_list = [record_list]
    df = pd.DataFrame(record_list)
    #Convert start dates to datetime objects and steps to numeric
    date_format = '%Y-%m-%d %H:%M:%S %z'
    df['@startDate'] = pd.to_datetime(df['@startDate'], format = date_format)
    df['@value'] = pd.to_numeric(df['@value'])
    #Sum up values for each hour. Only the value column is aggregated, because
    #summing the remaining string/datetime attributes fails on recent pandas.
    hourly = df[["@startDate", "@value"]].resample("h", on="@startDate").sum().reset_index()
    #Extract date and hour information, and relabel columns
    hourly["Date"] = hourly["@startDate"].dt.date
    hourly["Hour"] = hourly["@startDate"].dt.hour
    hourly["Steps"] = hourly["@value"]
    dat = hourly[["Date","Hour","Steps"]]

    return dat

def read_CLEAN_data(filename):
    """Load a previously cleaned CSV, converting "Date" to date objects.

    The "Date" column is parsed with the '%Y-%m-%d' format and replaced by
    the corresponding ``datetime.date`` values; all other columns are
    returned exactly as read.
    """
    cleaned = pd.read_csv(filename)
    #Parse the stored strings first, then keep only the calendar date
    parsed_dates = pd.to_datetime(cleaned["Date"], format = '%Y-%m-%d')
    cleaned["Date"] = parsed_dates.dt.date

    return cleaned

In [3]:
def read_step_data(filename, read_type):
    """Read a step-count file with the reader matching ``read_type``.

    Parameters
    ----------
    filename : str or path-like
        File to read; passed unchanged to the selected reader.
    read_type : str
        Case-insensitive source format: "pacer", "qs" / "qsaccess", "xml",
        or "clean" / "cleaned".

    Returns
    -------
    pandas.DataFrame
        Whatever the selected reader returns.

    Raises
    ------
    ValueError
        If ``read_type`` is not one of the supported formats.
    """
    read_type = read_type.lower()
    if read_type == "pacer":
        return read_Pacer_data(filename)
    if read_type in ("qsaccess", "qs"):
        return read_QS_data(filename)
    if read_type == "xml":
        return read_XML_data(filename)
    if read_type in ("clean", "cleaned"):
        return read_CLEAN_data(filename)
    #ValueError is a subclass of Exception, so callers catching Exception still work
    raise ValueError("Not a valid file type to read! Use pacer, qs, xml or clean")

### Cleaning Datasets

For each dataset, we read it in with `read_step_data` using the matching file type, and then write the cleaned result to its own CSV file.

In [4]:
#User 1: read the QS Access export and save the cleaned copy
readfile = "../../data/Participant_ID_A/User1.csv"
readtype = "qs"
writefile = "../../data/cleaned/user1.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [5]:
#User 2: read the QS Access export and save the cleaned copy
readfile = "../../data/Participant_ID_B/User2.csv"
readtype = "qs"
writefile = "../../data/cleaned/user2.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [6]:
#Participant 1: read the Pacer export and save the cleaned copy
readfile = "../../data/Participant_ID_01/DetailedSteps_2020_10_24_1932.csv"
readtype = "pacer"
writefile = "../../data/cleaned/participant1.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [7]:
#Participant 2: read the XML export and save the cleaned copy
readfile = "../../data/Participant_ID_02/export.xml"
readtype = "xml"
writefile = "../../data/cleaned/participant2.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [8]:
#Participant 3: read the XML export and save the cleaned copy
readfile = "../../data/Participant_ID_03/export.xml"
readtype = "xml"
writefile = "../../data/cleaned/participant3.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [9]:
#User 3: read the QS Access export; the cleaned copy is written as serena.csv
readfile = "../../data/Participant_ID_C/User3.csv"
readtype = "qs"
writefile = "../../data/cleaned/serena.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [10]:
#Participant 6: read the Pacer export and save the cleaned copy
readfile = "../../data/Participant_ID_06/DetailedSteps_2020_11_09_1153.csv"
readtype = "pacer"
writefile = "../../data/cleaned/participant6.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)

In [11]:
#Participant 7: read the CSV with the QS reader and save the cleaned copy
readfile = "../../data/Participant_ID_07/Health Data.csv"
readtype = "qs"
writefile = "../../data/cleaned/participant7.csv"
read_step_data(filename=readfile, read_type=readtype).to_csv(path_or_buf=writefile, index=False)