In [1]:
import pandas as pd
import os
import datetime
import numpy as np

In [2]:
# Input locations (absolute paths on the analyst's machine).
participant_file = '/Users/yasaman/UWEXP/script-input/sensors/participants.csv'
fitbit_data_path = '/Users/yasaman/UWEXP/data/fitbit-dumps'

# Label written into the `institution` column of every loaded sleep frame.
institution = 'UW'

# dtype per CSV column, forwarded to pandas.read_csv when a sleep file is loaded.
# The three daily totals are read as integers; every other field is kept as a
# string at load time.
data_types = dict(
    participantID="str",
    date="str",
    totalTimeInBed="int64",
    totalMinutesAsleep="int64",
    totalSleepRecords="int64",
    logid=str,
    minutesToFallAsleep=str,
    minutesAwake=str,
    timeInBed=str,
    minutesAsleep=str,
    efficiency=str,
    isMainSleep=str,
    startTime=str,
    duration=str,
    minutesAfterWakeup=str,
)

In [3]:
# The participants file has no header row: a single column, named PID here.
pids = pd.read_csv(participant_file, names=['PID'], header=None)
# Expected sleep-export file name for each id, left-padded with zeros to 3 characters.
file_names = pids['PID'].map('PID{0:0>3}_sleep.csv'.format)

In [4]:
def read_and_clean_sleep(path, name, types, institution):
    """Load one sleep CSV and normalise its layout.

    Parameters
    ----------
    path : str
        Directory that contains the sleep CSV files.
    name : str
        File name inside ``path``. Characters ``name[3:6]`` are used as the
        participant id (e.g. ``'PID007-2_sleep.csv'`` -> ``'007'``).
    types : dict
        Column -> dtype mapping forwarded to ``pandas.read_csv``.
    institution : str
        Value stored in the leading ``institution`` column.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``totalTimeInBed`` is greater than 0. ``participantID`` is
        renamed to ``PID`` and overwritten with the id taken from the file
        name, index labels are converted to ``str``, and ``institution`` is
        the first column followed by the file's columns in their original order.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises for a missing or malformed file.
    """
    full_path = os.path.join(path, name)
    sleep = pd.read_csv(full_path,
                        header=0,
                        dtype=types,
                        parse_dates=['date', 'startTime'])
    # NOTE when column types are not passed, pandas automatically reads
    #      TRUE/FALSE as booleans. When the type is passed as str, the values
    #      are the strings 'True' and 'False'.
    # Filter and rename without inplace: each step yields a fresh frame, so the
    # column assignments below never write into a slice of the raw frame.
    sleep = (sleep.loc[sleep['totalTimeInBed'] > 0]
                  .rename(index=str, columns={'participantID': 'PID'}))
    # The id embedded in the file name replaces whatever the file itself stored.
    sleep['PID'] = name[3:6]
    # Capture the column order before adding `institution` so it can lead.
    columns = ['institution'] + list(sleep.columns)
    sleep['institution'] = institution
    return sleep[columns]

In [5]:
# Participants whose export is not stored under the default file name. Each
# default name maps to the file(s) that should be read in its place.
export_overrides = {
    'PID007_sleep.csv': ['PID007-2_sleep.csv'],
    'PID136_sleep.csv': ['PID136-1_sleep.csv', 'PID136-2_sleep.csv'],
}

sleep_aggregated = []
for name in file_names:
    # Every listed file is checked on its own, so a missing part is reported
    # like any other missing file instead of raising inside read_csv.
    for export_name in export_overrides.get(name, [name]):
        if os.path.isfile(os.path.join(fitbit_data_path, export_name)):
            sleep_aggregated.append(
                read_and_clean_sleep(fitbit_data_path, export_name, data_types, institution)
            )
        else:
            print('file {} does not exist'.format(export_name))

file PID017_sleep.csv does not exist
file PID018_sleep.csv does not exist
file PID071_sleep.csv does not exist
file PID081_sleep.csv does not exist
file PID083_sleep.csv does not exist
file PID100_sleep.csv does not exist
file PID105_sleep.csv does not exist
file PID121_sleep.csv does not exist
file PID133_sleep.csv does not exist
file PID135_sleep.csv does not exist
file PID147_sleep.csv does not exist
file PID151_sleep.csv does not exist
file PID152_sleep.csv does not exist
file PID177_sleep.csv does not exist
file PID182_sleep.csv does not exist
file PID185_sleep.csv does not exist
file PID187_sleep.csv does not exist
file PID192_sleep.csv does not exist
file PID200_sleep.csv does not exist
file PID204_sleep.csv does not exist
file PID207_sleep.csv does not exist


In [6]:
# Stack the per-file frames row-wise into one long table.
sleep_aggregated_vertical = pd.concat(sleep_aggregated, axis=0)
# Make sure the output folder exists so a fresh checkout can run this cell.
os.makedirs('results', exist_ok=True)
sleep_aggregated_vertical.to_csv('results/aggregated_vertical_sleep.csv', index=False)

In [7]:
# Snapshot of every column present in the long table.
columns = sleep_aggregated_vertical.columns.tolist()

# Descriptive fields (defined here; not referenced by the cells visible below).
info_columns = ['institution', 'logid', 'isMainSleep']

# Daily totals.
total_columns = ['totalTimeInBed', 'totalMinutesAsleep', 'totalSleepRecords']

# Per-record fields.
record_columns = [
    'minutesToFallAsleep', 'minutesAwake', 'timeInBed', 'minutesAsleep',
    'efficiency', 'startTime', 'duration', 'minutesAfterWakeup',
]

In [8]:
# isMainSleep holds the strings 'True' / 'False', hence the string comparisons.
is_main_record = sleep_aggregated_vertical['isMainSleep'] == 'True'
is_other_record = sleep_aggregated_vertical['isMainSleep'] == 'False'
key_columns = ['PID', 'date']

# Explicit copies keep both frames independent of the aggregated table.
main_sleep = sleep_aggregated_vertical.loc[
    is_main_record, key_columns + total_columns + record_columns
].copy()
# NOTE I tested and there is no pid-date pair that has more than one main sleep record
other_sleep = sleep_aggregated_vertical.loc[
    is_other_record, key_columns + record_columns
].copy()

In [9]:
# Integer form of startTime so it can be averaged like any other number.
# The averaged value is later passed to pd.to_datetime, which reads plain
# integers as nanoseconds, so force nanosecond resolution here instead of
# relying on whatever resolution pandas picked when parsing the column.
start_ns = other_sleep['startTime'].values.astype('datetime64[ns]').astype(np.int64)
# assign() returns a new frame, so this never writes into a slice.
other_sleep = other_sleep.assign(timestamp=start_ns)
# Everything except the keys and the datetime column was read as text.
cols = [col for col in other_sleep.columns if col not in ('PID', 'date', 'startTime')]
other_sleep[cols] = other_sleep[cols].apply(pd.to_numeric)

In [10]:
# Group once and derive both the record count and the per-day means from it.
records_by_day = other_sleep.groupby(by=['PID', 'date'])
counts = records_by_day.size()
other_sleep = (
    records_by_day.mean()
    .assign(counts_other_sleep=counts)
    .reset_index()
)
# NOTE (original author) the index is reset here and set back to PID and date
#      in a later cell to avoid having datetime type for date in other_sleep.
# Rebuild startTime from the averaged integer timestamp (nanoseconds), then
# drop the helper column.
mean_start_ns = other_sleep['timestamp'].astype('int64')
other_sleep['startTime'] = pd.to_datetime(mean_start_ns, unit='ns')
other_sleep = other_sleep.drop(columns='timestamp')

In [11]:
# Suffix the per-record columns so the two frames can sit side by side.
main_columns = dict((col, col + '_main') for col in record_columns)
other_columns = dict((col, col + '_other_aggregated') for col in record_columns)

# Both frames end up keyed by (PID, date); set_index replaces the previous
# index, so only the column labels need renaming.
main_sleep = main_sleep.rename(columns=main_columns).set_index(['PID', 'date'])
other_sleep = other_sleep.rename(columns=other_columns).set_index(['PID', 'date'])

In [12]:
# Align both frames on their shared (PID, date) index and place them side by
# side. pd.concat keeps the union of keys, so a day present in only one frame
# gets NaN in the other frame's columns.
sleep_aggregated_horizontal = pd.concat([main_sleep, other_sleep], axis=1).reset_index()
# Make sure the output folder exists so a fresh checkout can run this cell.
os.makedirs('results', exist_ok=True)
sleep_aggregated_horizontal.to_csv('results/aggregated_horizontal_sleep.csv', index=False)