In [1]:
import csv
import sys
import os
import glob
import pandas as pd
# Silence pandas' SettingWithCopyWarning globally. It is otherwise raised when a
# filtered slice of the raw data (the `sleep` frame) is modified in place later on.
pd.options.mode.chained_assignment = None

# **Import the exported Fitbit CSV files and concatenate them into one DataFrame**

In [2]:
# Folder that holds the raw Fitbit CSV exports.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
path = r"C:\Users\jwkon\Data Learning\FitData\Fit_Project\Raw Data"
allFiles = glob.glob(os.path.join(path,"*.csv"))

# Collect the rows of every export in one list and build the DataFrame once at the end.
rows = []
for f in allFiles:
    # newline='' hands line splitting to the csv module, so quoted fields that
    # contain line breaks are parsed as one field instead of being cut apart.
    with open(f, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the first line of the file; the default keeps an empty file from
        # raising StopIteration.
        fields = next(csvreader, None)
        # Blank lines are kept as empty rows, so row positions inside a file are preserved.
        rows.extend(csvreader)
data = pd.DataFrame(rows)

#### The sleep rows are extracted under the assumption that their first column holds a time, so we keep the rows whose first column contains AM or PM

In [3]:
# Keep only the rows whose first column carries an AM/PM marker.
has_meridiem = data[0].str.contains('AM|PM', na=False)
sleep = data.loc[has_meridiem]
sleep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
34,2020-04-30 12:29AM,2020-04-30 7:26AM,348,69,21,417,61,249,38,
35,2020-04-29 1:49PM,2020-04-29 3:24PM,84,11,1,95,,,,
36,2020-04-28 9:28PM,2020-04-29 6:54AM,469,97,25,566,109,288,72,
37,2020-04-28 5:28AM,2020-04-28 7:37AM,126,3,1,129,,,,
38,2020-04-27 11:17PM,2020-04-28 3:47AM,215,55,13,270,30,152,33,
...,...,...,...,...,...,...,...,...,...,...
482,2020-05-04 11:30PM,2020-05-05 6:39AM,383,46,23,429,51,269,63,
483,2020-05-03 11:53PM,2020-05-04 7:43AM,412,58,21,470,78,300,34,
484,2020-05-03 12:20AM,2020-05-03 8:40AM,422,78,25,500,79,292,51,
485,2020-05-02 2:55AM,2020-05-02 9:03AM,304,64,20,368,70,190,44,


This gives more rows than the activity data has, so more filtering is required.
We select the rows whose second column contains PM.

In [4]:
# Rows whose second column contains PM; missing values count as "no match".
second_col_has_pm = data[1].str.contains('PM', na=False)
awake_pm = data.loc[second_col_has_pm]
awake_pm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
35,2020-04-29 1:49PM,2020-04-29 3:24PM,84,11,1,95,,,,
40,2020-04-25 4:45PM,2020-04-25 5:52PM,57,10,3,67,,,,
100,2020-02-29 9:16PM,2020-02-29 10:31PM,59,16,5,75,,,,
120,2020-02-09 12:13PM,2020-02-09 1:48PM,87,8,3,95,,,,
129,2020-02-01 1:20PM,2020-02-01 3:03PM,90,7,2,103,,,,


In [5]:
# Remove from `sleep` every row that also appears in `awake_pm` (second column contains PM).
# This is done as an index-based anti-join instead of DataFrame.append + duplicated():
# append was removed in pandas 2.0, and the append/duplicated approach re-added the PM
# rows when the cell was run a second time. Both frames are slices of `data`, so their
# index labels refer to the same rows.
sleep = sleep[~sleep.index.isin(awake_pm.index)]

Dropping the rows that appear in both dataframes (those whose second column contains PM) filters out the extra 21 rows

In [6]:
# Display the remaining sleep rows to check the result of the filtering.
sleep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
34,2020-04-30 12:29AM,2020-04-30 7:26AM,348,69,21,417,61,249,38,
36,2020-04-28 9:28PM,2020-04-29 6:54AM,469,97,25,566,109,288,72,
37,2020-04-28 5:28AM,2020-04-28 7:37AM,126,3,1,129,,,,
38,2020-04-27 11:17PM,2020-04-28 3:47AM,215,55,13,270,30,152,33,
39,2020-04-26 12:54AM,2020-04-26 9:46AM,439,93,30,532,83,309,47,
...,...,...,...,...,...,...,...,...,...,...
482,2020-05-04 11:30PM,2020-05-05 6:39AM,383,46,23,429,51,269,63,
483,2020-05-03 11:53PM,2020-05-04 7:43AM,412,58,21,470,78,300,34,
484,2020-05-03 12:20AM,2020-05-03 8:40AM,422,78,25,500,79,292,51,
485,2020-05-02 2:55AM,2020-05-02 9:03AM,304,64,20,368,70,190,44,


In [7]:
# Sleep data is reversed, so we need to order by ascending (column 1).
# Sort on the parsed timestamps rather than the raw strings: as text, "10:31AM"
# sorts before "7:37AM" on the same date. Values that do not match the format
# become NaT and end up last.
end_time = pd.to_datetime(sleep[1], format='%Y-%m-%d %I:%M%p', errors='coerce')
# Rebind instead of sorting in place, so the filtered slice is never mutated.
sleep = sleep.loc[end_time.sort_values(kind='mergesort').index]

In [8]:
# Find the rows whose first column mentions 'Sleep' and show the first one.
mentions_sleep = data[0].str.contains('Sleep', na=False)
sleep_col = data.loc[mentions_sleep]
sleep_col.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
32,Sleep,,,,,,,,,


In [9]:
# Use the sleep section's header row as the column labels. The row is located by its
# first cell ('Start Time') instead of a hard-coded position, so the lookup keeps
# working if the layout of the first export changes. Raises IndexError if no such
# row exists.
header_row = data.loc[data[0].eq('Start Time')].iloc[0]
# The single-element list wrapper is kept so the resulting column index has the
# same structure as before.
sleep.columns = [header_row]

In [10]:
# Show the first row to confirm the column labels were applied.
sleep.head(1)

33,Start Time,End Time,Minutes Asleep,Minutes Awake,Number of Awakenings,Time in Bed,Minutes REM Sleep,Minutes Light Sleep,Minutes Deep Sleep,NaN
203,2020-01-01 1:16AM,2020-01-01 8:28AM,334,98,20,432,47,242,45,


Drop the trailing NaN column and the column-axis name, and reset the row index

In [11]:
# Strip the last column, clear the column-axis name, and renumber the rows from 0.
sleep = (
    sleep
    .iloc[:, :-1]
    .rename_axis(None, axis=1)
    .reset_index(drop=True)
)

In [12]:
# Display the cleaned sleep table.
sleep

Unnamed: 0,Start Time,End Time,Minutes Asleep,Minutes Awake,Number of Awakenings,Time in Bed,Minutes REM Sleep,Minutes Light Sleep,Minutes Deep Sleep
0,2020-01-01 1:16AM,2020-01-01 8:28AM,334,98,20,432,47,242,45
1,2020-01-01 11:00PM,2020-01-02 7:27AM,414,93,33,507,50,346,18
2,2020-01-02 11:03PM,2020-01-03 5:32AM,331,58,27,389,31,278,22
3,2020-01-03 8:53PM,2020-01-04 6:06AM,464,89,36,553,84,341,39
4,2020-01-04 8:55PM,2020-01-05 7:47AM,526,126,46,652,79,401,46
...,...,...,...,...,...,...,...,...,...
208,2020-07-26 10:14PM,2020-07-27 6:09AM,406,69,24,475,89,264,53
209,2020-07-28 12:26AM,2020-07-28 7:21AM,367,48,24,415,79,237,51
210,2020-07-28 10:13PM,2020-07-29 6:46AM,457,56,28,513,99,267,91
211,2020-07-29 10:24PM,2020-07-30 6:04AM,399,61,29,460,66,289,44


### **Save to CSV**

In [13]:
# Write the table to 'Organized_Sleep_Data.csv' (relative path); index=False leaves out the row index.
sleep.to_csv('Organized_Sleep_Data.csv', index=False)