In [1]:
import csv
import sys
import os
import glob
import pandas as pd
#Add the following to ignore SettingWithCopyWarning that occurs when sorting the data in 128
pd.options.mode.chained_assignment = None

In [2]:
#Grab all csv files that were exported from fitbit 
path = r"C:\Users\jwkon\Data Learning\FitData\Fit_Project(Jan-May)\Raw Data"
allFiles = glob.glob(os.path.join(path,"*.csv"))

file = []
rows = []
for f in allFiles:
    with open(f, 'r') as csvfile:
        csvfile = csvfile.read().split('\n')
        csvreader = csv.reader(csvfile)
        fields = next(csvreader)
        for row in csvreader:
            rows.append(row)
data = pd.DataFrame(rows)

In [3]:
#Filter only elements that contain data from January-May
sleep = data[data[0].str.contains('2020-01|2020-02|2020-03|2020-04|2020-05', na=False)]
sleep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,2020-04-01,2158,3404,1.52,0,876,215,0,0,724
2,2020-04-02,2777,12610,5.5,0,0,253,77,12,1467
3,2020-04-03,2360,6783,2.98,0,789,218,33,0,941
4,2020-04-04,2306,5610,2.5,8,822,237,0,0,880
5,2020-04-05,2330,5978,2.66,0,746,216,19,7,924
...,...,...,...,...,...,...,...,...,...,...
482,2020-05-04 11:30PM,2020-05-05 6:39AM,383,46,23,429,51,269,63,
483,2020-05-03 11:53PM,2020-05-04 7:43AM,412,58,21,470,78,300,34,
484,2020-05-03 12:20AM,2020-05-03 8:40AM,422,78,25,500,79,292,51,
485,2020-05-02 2:55AM,2020-05-02 9:03AM,304,64,20,368,70,190,44,


In [4]:
#Filter the dataset with the assumption that the first sleep column will display time. So, we check if it contains AM or PM
sleep = sleep[sleep[0].str.contains('AM|PM', na=False)]
sleep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
34,2020-04-30 12:29AM,2020-04-30 7:26AM,348,69,21,417,61,249,38,
35,2020-04-29 1:49PM,2020-04-29 3:24PM,84,11,1,95,,,,
36,2020-04-28 9:28PM,2020-04-29 6:54AM,469,97,25,566,109,288,72,
37,2020-04-28 5:28AM,2020-04-28 7:37AM,126,3,1,129,,,,
38,2020-04-27 11:17PM,2020-04-28 3:47AM,215,55,13,270,30,152,33,
...,...,...,...,...,...,...,...,...,...,...
482,2020-05-04 11:30PM,2020-05-05 6:39AM,383,46,23,429,51,269,63,
483,2020-05-03 11:53PM,2020-05-04 7:43AM,412,58,21,470,78,300,34,
484,2020-05-03 12:20AM,2020-05-03 8:40AM,422,78,25,500,79,292,51,
485,2020-05-02 2:55AM,2020-05-02 9:03AM,304,64,20,368,70,190,44,


In [6]:
#There are more rows than the activity data and requires more filtering to be done. We will filter the second column by searching for elements that contain PM
awake_pm = sleep[sleep[1].str.contains('PM', na=False)]
awake_pm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
35,2020-04-29 1:49PM,2020-04-29 3:24PM,84,11,1,95,,,,
40,2020-04-25 4:45PM,2020-04-25 5:52PM,57,10,3,67,,,,
100,2020-02-29 9:16PM,2020-02-29 10:31PM,59,16,5,75,,,,
120,2020-02-09 12:13PM,2020-02-09 1:48PM,87,8,3,95,,,,
129,2020-02-01 1:20PM,2020-02-01 3:03PM,90,7,2,103,,,,


In [9]:
#Combining both dataframes and dropping duplicates lets us filter out the extra 21 rows
sleep = awake_pm.append(sleep)
sleep = sleep[~sleep.index.duplicated(keep=False)]

In [10]:
sleep.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
35,2020-04-29 1:49PM,2020-04-29 3:24PM,84,11,1,95,,,,
40,2020-04-25 4:45PM,2020-04-25 5:52PM,57,10,3,67,,,,
100,2020-02-29 9:16PM,2020-02-29 10:31PM,59,16,5,75,,,,
120,2020-02-09 12:13PM,2020-02-09 1:48PM,87,8,3,95,,,,
129,2020-02-01 1:20PM,2020-02-01 3:03PM,90,7,2,103,,,,


In [13]:
# Sleep data is reversed, so we need to order by ascending
sleep.sort_values(by=[1], inplace=True, ascending=True)

In [14]:
# Search for Sleep column header (The fitbit export contains a csv with both activity and sleep data. Finding the 'Sleep' column helps us identify the start of the sleep dataset)
sleep_col = data[data[0].str.contains('Sleep', na=False)]
sleep_col.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
32,Sleep,,,,,,,,,


In [15]:
#Row 32 contains the 'Sleep' column. We assume the column headers start on the next row so we assign it as such.
sleep.columns = [data.iloc[33]]

In [16]:
#Preview column headers
sleep.head(1)

33,Start Time,End Time,Minutes Asleep,Minutes Awake,Number of Awakenings,Time in Bed,Minutes REM Sleep,Minutes Light Sleep,Minutes Deep Sleep,NaN
203,2020-01-01 1:16AM,2020-01-01 8:28AM,334,98,20,432,47,242,45,


In [17]:
#Drop NaN and index header
sleep = sleep.iloc[:, :-1]
sleep = sleep.rename_axis(None, axis=1)
sleep = sleep.reset_index(drop=True)

In [18]:
#Preview dataframe
sleep

Unnamed: 0,Start Time,End Time,Minutes Asleep,Minutes Awake,Number of Awakenings,Time in Bed,Minutes REM Sleep,Minutes Light Sleep,Minutes Deep Sleep
0,2020-01-01 1:16AM,2020-01-01 8:28AM,334,98,20,432,47,242,45
1,2020-01-01 11:00PM,2020-01-02 7:27AM,414,93,33,507,50,346,18
2,2020-01-02 11:03PM,2020-01-03 5:32AM,331,58,27,389,31,278,22
3,2020-01-03 8:53PM,2020-01-04 6:06AM,464,89,36,553,84,341,39
4,2020-01-04 8:55PM,2020-01-05 7:47AM,526,126,46,652,79,401,46
...,...,...,...,...,...,...,...,...,...
162,2020-05-28 10:25PM,2020-05-29 4:38AM,325,48,22,373,56,259,10
163,2020-05-29 9:32PM,2020-05-30 5:18AM,394,72,27,466,52,285,57
164,2020-05-31 3:02PM,2020-05-31 4:05PM,59,4,1,63,,,
165,2020-05-30 7:52PM,2020-05-31 5:00AM,441,107,32,548,71,320,50


In [41]:
sleep.to_csv('Organized_Sleep_Data(Jan-May).csv', index=False)