# Fitbit Exploration

In [1]:
import os
import sys
sys.path.append('../')

from src.features import build_features
from src.visualization import visualize
from src.reports import make_report

import pandas as pd
import numpy as np

from datetime import datetime, timedelta
import json
import ast

# Data Import

## Functions
Below are a set of useful functions for handling the Fitbit Data

In [2]:
def get_device_df(info_df):
    '''
    Take dictionary-like entries for fitbit info dataframe for each row in a dataframe and makes a new dataframe
    
    Inputs:
    - info_df: the fitbit info dataframe with the dictionary-like entries
    
    Returns a dataframe for the device column
    '''
    
    overall_dict = {}
    for row in range(len(info_df)):
        Dict = ast.literal_eval(info_df['devices'][row])
        if type(Dict) == dict:
            Dict = Dict
        elif type(Dict) in [tuple,list] and len(Dict) > 1:
            Dict = Dict[0]
        else:
            continue

        for key in Dict.keys():
            overall_dict.setdefault(key, [])
            overall_dict[key].append(Dict[key])
        # adding in the date of recording
        overall_dict.setdefault('date', [])
        overall_dict['date'].append(info_df.index[row])
        
    df = pd.DataFrame(overall_dict)
    df['date'] = pd.to_datetime(df['date'],errors='coerce')
    return df.set_index('date')

In [3]:
def process_fitbit_intraday(raw_df,resample_rate=60):
    '''
    
    '''
    try:
        df = raw_df.resample(f'{resample_rate}T').mean()
    except TypeError:
        print(f"\t\tDataframe is most likely empty ({len(raw_df)})")
        return raw_df
    return df

## Single Participant
Starting with one participant to see how to work with the data

In [4]:
pt = "2xtqkfz1"
#pt_dir = f"/Volumes/HEF_Dissertation_Research/utx000/extension/data/fitbit/{pt}"
pt_dir = f"../data/raw/utx000/fitbit/{pt}"
single_info = pd.read_csv(f"{pt_dir}/fitbit/fitbit_info.csv")
single_daily = pd.read_csv(f"{pt_dir}/fitbit/fitbit_daily_records.csv")
single_intra = pd.read_csv(f"{pt_dir}/fitbit/fitbit_intraday_records.csv",index_col=0,parse_dates=True)

### Info CSV
Below is an output of the ```fitbit_info.csv```

In [5]:
single_info.head()

Unnamed: 0,date,devices,friends,friends_leaderboard
0,2020-10-02 22:00:08.291453,"[{'battery': 'High', 'batteryLevel': 93, 'devi...",[],"[{'id': '87GWSF', 'type': 'inactive-user'}]"


<div class="alert alert-block alert-danger"> 
    The get_device_df function does not work, but it doesn't give us any good information anyway so I will leave it unchanged

In [6]:
#devices = get_device_df(single_info)
#devices.head()

The ```fitbit_info.csv``` seems to have good information for debugging purposes and tracking battery levels, but overall won't be useful for any analysis purposes.

### Daily CSV

In [7]:
single_daily.head()

Unnamed: 0,date,activities_calories,activities_caloriesBMR,activities_steps,activities_distance,activities_minutesSedentary,activities_minutesLightlyActive,activities_minutesFairlyActive,activities_minutesVeryActive,activities_activityCalories,body_bmi,body_fat,body_weight,foods_log_caloriesIn,foods_log_water,activities_heart,sleep
0,2020-05-01 00:00:00,1320.0,1320.0,0,0.0,1440,0,0,0,0.0,19.993023,0.0,119.99,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",
1,2020-05-02 00:00:00,1320.0,1320.0,0,0.0,1440,0,0,0,0.0,19.993023,0.0,119.99,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",
2,2020-05-03 00:00:00,1320.0,1320.0,0,0.0,1440,0,0,0,0.0,19.993023,0.0,119.99,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",
3,2020-05-04 00:00:00,1320.0,1320.0,0,0.0,1440,0,0,0,0.0,19.993023,0.0,119.99,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",
4,2020-05-05 00:00:00,1320.0,1320.0,0,0.0,1440,0,0,0,0.0,19.993023,0.0,119.99,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",


In [8]:
single_daily["sleep"].unique()

array([nan,
       "[{'dateOfSleep': '2020-07-10', 'duration': 19740000, 'efficiency': 91, 'endTime': '2020-07-10T09:11:00.000', 'infoCode': 0, 'isMainSleep': True, 'levels': {'data': [{'dateTime': '2020-07-10T03:42:00.000', 'level': 'wake', 'seconds': 720}, {'dateTime': '2020-07-10T03:54:00.000', 'level': 'light', 'seconds': 960}, {'dateTime': '2020-07-10T04:10:00.000', 'level': 'deep', 'seconds': 2550}, {'dateTime': '2020-07-10T04:52:30.000', 'level': 'light', 'seconds': 2880}, {'dateTime': '2020-07-10T05:40:30.000', 'level': 'wake', 'seconds': 240}, {'dateTime': '2020-07-10T05:44:30.000', 'level': 'light', 'seconds': 1860}, {'dateTime': '2020-07-10T06:15:30.000', 'level': 'deep', 'seconds': 600}, {'dateTime': '2020-07-10T06:25:30.000', 'level': 'light', 'seconds': 1170}, {'dateTime': '2020-07-10T06:45:00.000', 'level': 'wake', 'seconds': 1050}, {'dateTime': '2020-07-10T07:02:30.000', 'level': 'light', 'seconds': 30}, {'dateTime': '2020-07-10T07:03:00.000', 'level': 'rem', 'seconds':

### Intraday CSV

In [9]:
single_intra.head()

Unnamed: 0_level_0,activities_calories,activities_steps,activities_distance,activities_heart
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-20 14:57:29,,,,70.0
2020-05-20 14:58:00,1.0087,,,
2020-05-20 14:58:28,,,,70.0
2020-05-20 14:58:43,,,,70.0
2020-05-20 14:58:58,,,,70.0


## All Participants
Now that we have looked at the individual data files, now we can start to import all the data for all participants.

In [10]:
def import_fitbit(filename, pt_dir=f"/Volumes/HEF_Dissertation_Research/utx000/extension/data/fitbit/",verbose=False):
    '''
    
    '''
    print(f"\tReading from file {filename}")
    df = pd.DataFrame()
    for pt in os.listdir(pt_dir):
        if pt[0] != ".":
            if verbose:
                print(f"\t\tReading for participant {pt}")
            try:
                temp = pd.read_csv(f"{pt_dir}{pt}/fitbit/fitbit_{filename}.csv", index_col=0, parse_dates=True)
                if filename[:4] == "intr":
                    temp = process_fitbit_intraday(temp)
                
                temp["beiwe"] = pt
                df = df.append(temp)
            except FileNotFoundError:
                print(f"\t\tFile {filename} not found for participant {pt}")
                
    return df

In [11]:
#fitbit_info = import_fitbit("info")
#fitbit_intra = import_fitbit("intraday_records")

### Daily Fitbit
Below is an output of the ```daily_fitbit.csv``` file and a closer look at some of the more interesting entries.

In [12]:
pt_dir = f"../data/raw/utx000/fitbit/"
fitbit_daily = import_fitbit("daily_records",pt_dir)
fitbit_daily.head()

	Reading from file daily_records
		File daily_records not found for participant 789gcb6u


Unnamed: 0_level_0,activities_calories,activities_caloriesBMR,activities_steps,activities_distance,activities_minutesSedentary,activities_minutesLightlyActive,activities_minutesFairlyActive,activities_minutesVeryActive,activities_activityCalories,body_bmi,body_fat,body_weight,foods_log_caloriesIn,foods_log_water,activities_heart,sleep,beiwe
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-05-01,1881.0,1881.0,0,0.0,1440,0,0,0,0.0,23.754,0.0,180.0,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",,hfttkth7
2020-05-02,1881.0,1881.0,0,0.0,1440,0,0,0,0.0,23.754,0.0,180.0,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",,hfttkth7
2020-05-03,1881.0,1881.0,0,0.0,1440,0,0,0,0.0,23.754,0.0,180.0,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",,hfttkth7
2020-05-04,1881.0,1881.0,0,0.0,1440,0,0,0,0.0,23.754,0.0,180.0,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",,hfttkth7
2020-05-05,1881.0,1881.0,0,0.0,1440,0,0,0,0.0,23.754,0.0,180.0,0.0,0.0,"{'customHeartRateZones': [], 'heartRateZones':...",,hfttkth7


The ```fitbit_daily.csv``` has a lot of good summary data including sleep summary values and sleep data by the minute.

In [27]:
def get_sleep_df(daily_df,verbose=False):
    '''
    Creates a dataframe with the daily sleep data summarized
    
    Inputs:
    - daily_df: dataframe created from the daily fitbit csv file
    
    Returns a dataframe of the daily sleep data
    '''
    overall_dict = {"dateOfSleep": [],"duration": [],"efficiency":[],"endTime": [],"infoCode": [],"isMainSleep": [],"levels": [],"logId": [],
                    "minutesAfterWakeup": [],"minutesAsleep": [],"minutesAwake": [],"minutesToFallAsleep": [],"startTime": [],"timeInBed": [],
                    "type": [],"date": [],"beiwe": [],"awakeCount": [],"awakeDuration": [],"awakeningsCount": [],"minuteData": [],"restlessCount": [],"restlessDuration": []}
    for row in range(len(daily_df)):
        # in case Fitbit didn't record sleep records for that night - value is NaN
        pt = daily_df['beiwe'][row]
        if verbose:
            print(f"\t\tWorking for Participant {pt}")
        # pts with classic sleep data
        if "awakeCount" in daily_df.columns:
            continue
        if type(daily_df['sleep'][row]) == float:
            continue
        else:
            Dict = ast.literal_eval(daily_df['sleep'][row])
            if type(Dict) == dict:
                Dict = Dict
            else:
                Dict = Dict[0]
            for key in overall_dict.keys():
                #overall_dict.setdefault(key, [])
                if key in ["date","beiwe"]:
                    pass
                elif key in Dict.keys():
                    overall_dict[key].append(Dict[key])
                else:
                    overall_dict[key].append(np.nan)
            # adding in the date of recording
            overall_dict.setdefault('date', [])
            overall_dict['date'].append(daily_df.index[row])
            # adding beiwe id
            overall_dict.setdefault('beiwe', [])
            overall_dict['beiwe'].append(daily_df['beiwe'][row])

    for key, val in overall_dict.items():
        print(f"{key}: {len(overall_dict[key])}")
    df = pd.DataFrame(overall_dict)
    df['date'] = pd.to_datetime(df['date'],errors='coerce')
    # removing classic sleep stage data
    return df.set_index('date')

In [28]:
daily_slp = get_sleep_df(fitbit_daily,verbose=False)
daily_slp.head()

dateOfSleep: 4424
duration: 4424
efficiency: 4424
endTime: 4424
infoCode: 4424
isMainSleep: 4424
levels: 4424
logId: 4424
minutesAfterWakeup: 4424
minutesAsleep: 4424
minutesAwake: 4424
minutesToFallAsleep: 4424
startTime: 4424
timeInBed: 4424
type: 4424
date: 4424
beiwe: 4424
awakeCount: 4424
awakeDuration: 4424
awakeningsCount: 4424
minuteData: 4424
restlessCount: 4424
restlessDuration: 4424


Unnamed: 0_level_0,dateOfSleep,duration,efficiency,endTime,infoCode,isMainSleep,levels,logId,minutesAfterWakeup,minutesAsleep,...,startTime,timeInBed,type,beiwe,awakeCount,awakeDuration,awakeningsCount,minuteData,restlessCount,restlessDuration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-14,2020-05-14,24360000,97,2020-05-14T07:13:00.000,0.0,True,{'data': [{'dateTime': '2020-05-14T00:27:00.00...,27170628887,0,379,...,2020-05-14T00:27:00.000,406,stages,hfttkth7,,,,,,
2020-05-15,2020-05-15,29580000,87,2020-05-15T08:06:30.000,0.0,True,{'data': [{'dateTime': '2020-05-14T23:53:30.00...,27187368950,8,392,...,2020-05-14T23:53:30.000,493,stages,hfttkth7,,,,,,
2020-05-16,2020-05-16,19740000,95,2020-05-16T04:57:00.000,0.0,True,{'data': [{'dateTime': '2020-05-15T23:28:00.00...,27197675883,7,287,...,2020-05-15T23:28:00.000,329,stages,hfttkth7,,,,,,
2020-05-17,2020-05-17,26820000,96,2020-05-17T09:28:30.000,0.0,True,{'data': [{'dateTime': '2020-05-17T02:01:30.00...,27214680283,8,403,...,2020-05-17T02:01:30.000,447,stages,hfttkth7,,,,,,
2020-05-18,2020-05-18,24960000,92,2020-05-18T07:20:00.000,0.0,True,{'data': [{'dateTime': '2020-05-18T00:24:00.00...,27227113904,0,351,...,2020-05-18T00:24:00.000,416,stages,hfttkth7,,,,,,


In [31]:
temp = daily_slp.drop(["dateOfSleep","infoCode","logId","type","awakeCount","awakeDuration","awakeningsCount","minuteData","restlessCount","restlessDuration"],axis=1)
temp.columns

Index(['duration', 'efficiency', 'endTime', 'isMainSleep', 'levels',
       'minutesAfterWakeup', 'minutesAsleep', 'minutesAwake',
       'minutesToFallAsleep', 'startTime', 'timeInBed', 'beiwe'],
      dtype='object')

In [29]:
daily_slp[daily_slp["type"] == "classic"]

Unnamed: 0_level_0,dateOfSleep,duration,efficiency,endTime,infoCode,isMainSleep,levels,logId,minutesAfterWakeup,minutesAsleep,...,startTime,timeInBed,type,beiwe,awakeCount,awakeDuration,awakeningsCount,minuteData,restlessCount,restlessDuration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-24,2020-05-24,5040000,89,2020-05-24T15:02:00.000,2.0,False,{'data': [{'dateTime': '2020-05-24T13:38:00.00...,27333500560,0,75,...,2020-05-24T13:38:00.000,84,classic,hfttkth7,,,,,,
2020-05-28,2020-05-28,5580000,86,2020-05-28T16:12:00.000,2.0,False,{'data': [{'dateTime': '2020-05-28T14:39:00.00...,27390407346,0,80,...,2020-05-28T14:39:00.000,93,classic,hfttkth7,,,,,,
2020-06-16,2020-06-16,5640000,87,2020-06-16T15:48:30.000,2.0,True,{'data': [{'dateTime': '2020-06-16T14:14:30.00...,27656420594,0,82,...,2020-06-16T14:14:30.000,94,classic,hfttkth7,,,,,,
2020-08-11,2020-08-11,7140000,91,2020-08-11T16:58:00.000,2.0,False,{'data': [{'dateTime': '2020-08-11T14:58:30.00...,28420475063,1,107,...,2020-08-11T14:58:30.000,119,classic,hfttkth7,,,,,,
2020-08-24,2020-08-24,3960000,97,2020-08-24T15:58:00.000,2.0,False,{'data': [{'dateTime': '2020-08-24T14:52:00.00...,28589437636,0,64,...,2020-08-24T14:52:00.000,66,classic,hfttkth7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-26,2020-08-26,3600000,100,2020-08-26T10:43:00.000,2.0,False,{'data': [{'dateTime': '2020-08-26T09:43:00.00...,28612292769,0,60,...,2020-08-26T09:43:00.000,60,classic,e8js2jdf,,,,,,
2020-08-29,2020-08-29,4140000,96,2020-08-29T16:23:30.000,2.0,False,{'data': [{'dateTime': '2020-08-29T15:14:00.00...,28667043947,0,66,...,2020-08-29T15:14:00.000,69,classic,e8js2jdf,,,,,,
2020-08-30,2020-08-30,4320000,100,2020-08-30T16:53:30.000,2.0,False,{'data': [{'dateTime': '2020-08-30T15:41:00.00...,28667962761,0,72,...,2020-08-30T15:41:00.000,72,classic,e8js2jdf,,,,,,
2020-09-04,2020-09-04,4140000,88,2020-09-04T16:07:30.000,2.0,False,{'data': [{'dateTime': '2020-09-04T14:58:30.00...,28743621965,0,61,...,2020-09-04T14:58:30.000,69,classic,e8js2jdf,,,,,,


In [36]:
def get_minute_sleep_df(daily_sleep):
    '''
    Creates a dataframe for the minute sleep data
    
    Input(s):
    - daily_sleep: dataframe holding the daily sleep data with a column called minuteData
    
    Returns:
    - sleep_stages: a dataframe with sleep stage data for every stage transition
    - summary: a dataframe with the nightly sleep stage information
    '''
    
    data_dict = {'startDate':[],'endDate':[],'dateTime':[],'level':[],'seconds':[],'beiwe':[]}
    summary_dict = {'startDate':[],'endDate':[],'deep_count':[],'deep_minutes':[],'light_count':[],'light_minutes':[],
                    'rem_count':[],'rem_minutes':[],'wake_count':[],'wake_minutes':[],'beiwe':[]}
    for row in range(len(daily_sleep)):
        d0 = pd.to_datetime(daily_sleep['startTime'][row])
        d1 = pd.to_datetime(daily_sleep.index[row])
        print("new row")
        print(daily_sleep['type'][row])
        print(daily_sleep['levels'][row])
        sleep_dict = daily_sleep['levels'][row]
        for key in sleep_dict.keys():
            if key == 'data': # data without short wake periods
                temp_data = sleep_dict['data']
                for temp_data_dict in temp_data:
                    for data_key in temp_data_dict.keys():
                        data_dict[data_key].append(temp_data_dict[data_key])
                    data_dict['startDate'].append(d0.date())
                    data_dict['endDate'].append(d1.date())
                    data_dict['beiwe'].append(daily_sleep['beiwe'][row])
            elif key == 'summary': # nightly summary data - already in dictionary form
                for summary_key in sleep_dict['summary'].keys():
                    stage_dict = sleep_dict['summary'][summary_key]
                    for stage_key in ['count','minutes']:
                        summary_dict[f'{summary_key}_{stage_key}'].append(stage_dict[stage_key])
                    
                summary_dict['startDate'].append(d0.date())
                summary_dict['endDate'].append(d1.date())
                summary_dict['beiwe'].append(daily_sleep['beiwe'][row])
            else: # shortData or data with short wake periods - don't need
                pass
            
    sleep_stages = pd.DataFrame(data_dict)
    sleep_stages.columns = ['start_date','end_date','time','stage','time_at_stage','beiwe'] # renaming columns
    # adding column for numeric value of sleep stage 
    def numeric_from_str_sleep_stage(row):
        if row['stage'] == 'wake':
            return 0
        elif row['stage'] == 'light':
            return 1
        elif row['stage'] == 'deep':
            return 2
        elif row['stage'] == 'rem':
            return 3
        else:
            return -1
        
    sleep_stages['value'] = sleep_stages.apply(lambda row: numeric_from_str_sleep_stage(row), axis=1)
    
    summary = pd.DataFrame(summary_dict)
    return sleep_stages, summary

In [37]:
stages, summary = get_minute_sleep_df(daily_slp)
stages.head()

new row
stages
{'data': [{'dateTime': '2020-05-14T00:27:00.000', 'level': 'wake', 'seconds': 510}, {'dateTime': '2020-05-14T00:35:30.000', 'level': 'light', 'seconds': 420}, {'dateTime': '2020-05-14T00:42:30.000', 'level': 'deep', 'seconds': 1590}, {'dateTime': '2020-05-14T01:09:00.000', 'level': 'light', 'seconds': 1290}, {'dateTime': '2020-05-14T01:30:30.000', 'level': 'rem', 'seconds': 840}, {'dateTime': '2020-05-14T01:44:30.000', 'level': 'light', 'seconds': 780}, {'dateTime': '2020-05-14T01:57:30.000', 'level': 'deep', 'seconds': 360}, {'dateTime': '2020-05-14T02:03:30.000', 'level': 'light', 'seconds': 600}, {'dateTime': '2020-05-14T02:13:30.000', 'level': 'deep', 'seconds': 810}, {'dateTime': '2020-05-14T02:27:00.000', 'level': 'light', 'seconds': 270}, {'dateTime': '2020-05-14T02:31:30.000', 'level': 'rem', 'seconds': 450}, {'dateTime': '2020-05-14T02:39:00.000', 'level': 'light', 'seconds': 1230}, {'dateTime': '2020-05-14T02:59:30.000', 'level': 'deep', 'seconds': 270}, {'date

KeyError: 'asleep_count'

In [15]:
summary.head()

Unnamed: 0,startDate,endDate,deep_count,deep_minutes,light_count,light_minutes,rem_count,rem_minutes,wake_count,wake_minutes,beiwe
0,2020-05-01,2020-05-01,5,119,43,313,18,135,53,73,15tejjtw
1,2020-05-02,2020-05-02,4,115,46,336,12,106,47,94,15tejjtw
2,2020-05-03,2020-05-03,4,90,22,157,5,54,20,60,15tejjtw
3,2020-05-04,2020-05-04,5,104,40,323,8,88,37,93,15tejjtw
4,2020-05-06,2020-05-06,4,101,30,211,19,121,34,74,15tejjtw
