#### Load garmin datasets

In [None]:
#1182 - 3486
import os
import pandas as pd
import numpy as np

directory_3month = './garmin_data/3_month/'
directory_6month = './garmin_data/6_month/'
files_3month = os.listdir(directory_3month)
files_6month = os.listdir(directory_6month)


def _participant_id(filename):
    """Return the numeric participant id encoded in *filename*, or None.

    The id is the first whitespace-separated token of the name; when the
    name contains no whitespace it is the part before the first '.'.
    Names whose candidate id is not purely numeric yield None.
    """
    tokens = filename.split()
    if not tokens:
        # whitespace-only name: nothing to parse
        return None
    if len(tokens) == 1:
        tokens = tokens[0].split(".")
    candidate = tokens[0]
    return candidate if candidate.isnumeric() else None


def _load_garmin_directory(directory, filenames):
    """Read every participant CSV among *filenames* found in *directory*.

    Returns a tuple ``(frames, ids)``: ``frames`` maps participant id to
    its dataframe and ``ids`` lists each id once, in first-seen order.
    """
    frames = {}
    ids = []
    for name in filenames:
        participant = _participant_id(name)
        if participant is None:
            continue
        # Several files can resolve to the same id (e.g. 'ID.csv' and
        # 'ID (1).csv').  Record the id only once so that later loops over
        # the id list never visit a participant twice; the file read last
        # still replaces the earlier frame.
        if participant not in frames:
            ids.append(participant)
        frames[participant] = pd.read_csv(os.path.join(directory, name))
    return frames, ids


# dictionaries to store the dataframes in: key = mobile code, value = dataframe
all_df_3month, garmin_id_3month = _load_garmin_directory(directory_3month, files_3month)
all_df_6month, garmin_id_6month = _load_garmin_directory(directory_6month, files_6month)

In [None]:
# Show the participant ids loaded for the 3-month and 6-month exports.
for loaded_ids in (garmin_id_3month, garmin_id_6month):
    print(loaded_ids)

['32455277', '20763027', '27361835', '32937810', '25230030', '17180706', '33075391', '11822993', '34865333', '20126808', '17309235', '23916703', '26141560', '27148444', '22541511']
['22541511', '32937810', '23916703', '25230030', '33075391', '25230030', '34865333', '27148444', '11822993', '17309235', '27361835', '17180706', '26141560', '32455277', '20763027', '20126808']


#### Filter 6 month data for participants that actually stayed around for 6 months

In [None]:
# Keep only participants whose 6-month export has data past July, i.e. at
# least one record whose timestamp starts with '2022-08'.
# The ids to keep are collected in a separate list: removing items from
# garmin_id_6month while looping over it would skip the element that follows
# each removal.
kept_ids = []
for garmin_id in garmin_id_6month:
    cur_df = all_df_6month.get(garmin_id)
    if cur_df is None or garmin_id in kept_ids:
        # repeated id: its frame was already dropped or already accepted
        continue
    # Evaluate the mask directly instead of storing it as a scratch column,
    # so the participant's dataframe is left unmodified.
    if cur_df['dte_tme'].str.startswith('2022-08', na=False).any():
        kept_ids.append(garmin_id)
    else:
        all_df_6month.pop(garmin_id)

# update the existing list object in place
garmin_id_6month[:] = kept_ids

    

In [None]:
# Use the first loaded 3-month participant as the example dataframe.
first_id = garmin_id_3month[0]
df = all_df_3month.get(first_id)
print(first_id)

32455277


#### What do the garmin data types mean?  
To clean up the data, we need to decide what to do with readings whose value is 0. For some data types, we might want to keep the 0, but not for others (e.g. a heart rate of 0 doesn't make sense).  
- calories: estimate of total calories burned for current date (from [here](https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data))
- hr: heart rate (from [here](https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data))
- steps: estimate of steps taken for current date (from [here](https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data))
- pulseOx: estimate of the percentage of oxygenated hemoglobin in the blood (blood oxygen saturation)
    - "Generally speaking, this value should be 95% or higher in most settings, but this value can be influenced by altitude, activity and an individual’s health. Numbers below 90% may be considered low, according to the Mayo Clinic." This is from [here](https://www.garmin.com/en-US/garmin-technology/health-science/pulse-ox/)
- ibi: this stands for inter-beat interval and it's the time interval between individual heart beats (from [here](https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data))
- stress: stress level is estimated from heart-rate variability, i.e. the variation in the ibi from beat to beat. 
    - "If you are too active to determine stress level (such as during a workout), a stress level will not be recorded." This is from [here](https://support.garmin.com/en-US/?faq=WT9BmhjacO4ZpxbCc0EKn9#:~:text=0%E2%80%9325%3A%20Resting%20state,76%E2%80%93100%3A%20High%20stress)
- floorsClimbed: it's the estimate of floors climbed for current date (from [here](https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data))
- intensityMinutes: it's how long the watch thinks you've been active for
    - "Intensity minutes are earned based on your current heart rate when compared to your average resting heart rate or the number of steps taken per minute. For example, you can earn intensity minutes once a brisk walk or run is detected. If no heart rate sensor is present, only moderate intensity minutes will be earned. Your age, weight, height, and resting heart rate calculated by your Garmin watch also factor into intensity minutes." This is from [here](https://support.garmin.com/en-US/?faq=pNU9nnDzzGAHmEavp9rpY8#:~:text=Intensity%20minutes%20are%20earned%20based,intensity%20minutes%20will%20be%20earned.)

#### Cleaning up stress  
According to an [article](https://support.garmin.com/en-US/?faq=WT9BmhjacO4ZpxbCc0EKn9#:~:text=0%E2%80%9325%3A%20Resting%20state,76%E2%80%93100%3A%20High%20stress) written by Garmin support, stress levels correspond to the following scale:
- 0-25: Resting state
- 26-50: Low stress
- 51-75: Medium stress
- 76-100: High stress  
We're going to replace the stress values by using this scale. We labeled the scale like so:
- 1 = resting state
- 2 = low stress
- 3 = medium stress
- 4 = high stress

In [None]:
# Select the stress rows first and copy afterwards: temp4 is then an
# independent frame (assigning to its columns neither touches df nor acts on
# a filtered slice), and only the stress rows are duplicated instead of the
# whole dataframe.
temp4 = df.loc[df['data_type'] == 'stress'].copy()
temp4

Unnamed: 0,ts,dte_tme,rsp_id,data_type,val
782,1.651259e+09,2022-04-29 15:11:02,32683,stress,73
785,1.651259e+09,2022-04-29 15:11:03,32683,stress,73
797,1.651259e+09,2022-04-29 15:11:08,32683,stress,73
808,1.651259e+09,2022-04-29 15:11:12,32683,stress,71
816,1.651259e+09,2022-04-29 15:11:16,32683,stress,68
...,...,...,...,...,...
3156122,1.660611e+09,2022-08-15 20:47:30,32683,stress,12
3156126,1.660611e+09,2022-08-15 20:47:32,32683,stress,12
3156133,1.660611e+09,2022-08-15 20:47:35,32683,stress,10
3156139,1.660611e+09,2022-08-15 20:47:37,32683,stress,10


In [None]:
def cleanStress(val):
    """Map a raw Garmin stress score to its category label.

    Labels: 1 = resting state (0-25), 2 = low stress (26-50),
    3 = medium stress (51-75), 4 = high stress (76-100).
    A value outside 0-100, including NaN, is not a valid score and yields
    NaN instead of being counted as high stress.
    """
    # Ordered comparisons (rather than membership in an integer range) also
    # bucket fractional scores correctly; NaN fails this test and is
    # returned as NaN.
    if not 0 <= val <= 100:
        return float('nan')
    if val <= 25:
        return 1
    if val <= 50:
        return 2
    if val <= 75:
        return 3
    return 4
# Replace every raw stress score in temp4 with the label cleanStress assigns.
temp4['val'] = temp4['val'].map(cleanStress)
temp4

Unnamed: 0,ts,dte_tme,rsp_id,data_type,val
782,1.651259e+09,2022-04-29 15:11:02,32683,stress,3
785,1.651259e+09,2022-04-29 15:11:03,32683,stress,3
797,1.651259e+09,2022-04-29 15:11:08,32683,stress,3
808,1.651259e+09,2022-04-29 15:11:12,32683,stress,3
816,1.651259e+09,2022-04-29 15:11:16,32683,stress,3
...,...,...,...,...,...
3156122,1.660611e+09,2022-08-15 20:47:30,32683,stress,1
3156126,1.660611e+09,2022-08-15 20:47:32,32683,stress,1
3156133,1.660611e+09,2022-08-15 20:47:35,32683,stress,1
3156139,1.660611e+09,2022-08-15 20:47:37,32683,stress,1


In [None]:
# Write the stress labels held in temp4 back into a copy of the full frame.
# temp4 is a row subset of df that kept df's index, so index-aligned
# assignment updates exactly the stress rows.  (Merging on
# ts/dte_tme/data_type and then dropping the right-hand 'val' column threw
# the labels away, and could multiply rows when those keys are not unique.)
res = df.copy()
res.loc[temp4.index, 'val'] = temp4['val']
res

Unnamed: 0,ts,dte_tme,rsp_id,data_type,val
0,1.651256e+09,2022-04-29 14:15:38,32683,hr,0
1,1.651256e+09,2022-04-29 14:15:38,32683,steps,0
2,1.651256e+09,2022-04-29 14:15:38,32683,calories,323
3,1.651256e+09,2022-04-29 14:15:38,32683,floorsClimbed,0
4,1.651256e+09,2022-04-29 14:15:38,32683,intensityMinutes,0
...,...,...,...,...,...
3156175,1.660611e+09,2022-08-15 20:47:38,32683,ibi,818
3156176,1.660611e+09,2022-08-15 20:47:39,32683,ibi,885
3156177,1.660611e+09,2022-08-15 20:47:40,32683,hr,74
3156178,1.660611e+09,2022-08-15 20:47:40,32683,stress,8


#### Making a class that can clean all the dataframes

In [None]:
class Clean():
    """Aggregate raw Garmin readings into one row per participant per day.

    ``all_df`` maps a participant id to a dataframe with at least the
    columns ``dte_tme`` (timestamp string), ``data_type`` and ``val``.
    """

    # data types summarised by their daily maximum
    DAILY_MAX_TYPES = ['calories', 'steps', 'floorsClimbed', 'intensityMinutes']

    def __init__(self, all_df):
        self.all_df = all_df

    def newDate(self, df):
        """Return a copy of *df* where ``dte_tme`` is replaced by ``new_date``.

        ``new_date`` is the first whitespace-separated token of ``dte_tme``.
        """
        tmp = df.copy()
        tmp['new_date'] = tmp['dte_tme'].str.split().str[0]
        tmp = tmp.drop(['dte_tme'], axis=1)
        return tmp

    @staticmethod
    def _rows_of(df, data_types):
        """Return the new_date/data_type/val columns of the rows whose
        data_type is listed in *data_types*."""
        wanted = df['data_type'].isin(data_types)
        return df.loc[wanted, ['new_date', 'data_type', 'val']]

    # helper for combineDF
    def cleanStress(self, df):
        """Return the mean stress value per day (new_date, data_type, val)."""
        stress = self._rows_of(df, ['stress'])
        return stress.groupby(['new_date', 'data_type'], as_index=False)['val'].mean()

    # helper for combineDF
    def getMax(self, df):
        """Return the daily maximum of every type in DAILY_MAX_TYPES."""
        totals = self._rows_of(df, self.DAILY_MAX_TYPES)
        return totals.groupby(['new_date', 'data_type'], as_index=False)['val'].max()

    # helper for combineDF
    def cleanHR(self, df):
        """Return the mean heart rate per day, ignoring readings of 0.

        A heart rate of 0 is treated as missing: it is masked to NaN so it
        does not enter the mean.  Only ``hr`` rows are considered and *df*
        itself is never modified, so zeros of other data types (steps,
        stress, ...) stay untouched for the other helpers.  A day whose hr
        readings are all 0 gets NaN.
        """
        hr = self._rows_of(df, ['hr'])
        hr = hr.assign(val=hr['val'].where(hr['val'] != 0))
        return hr.groupby(['new_date', 'data_type'], as_index=False)['val'].mean()

    def combineDF(self, df):
        """Stack the daily maxima, heart-rate means and stress means of *df*
        into one long frame sorted by ``new_date``."""
        frames = [self.getMax(df), self.cleanHR(df), self.cleanStress(df)]
        res = pd.concat(frames)
        res = res.sort_values(by=['new_date'])
        return res

    def transform(self, combined_df, key):
        """Pivot *combined_df* to one row per date and one column per data type.

        The result is indexed by ``new_date``; ``Participant Id`` (set to
        *key*) and ``date`` are inserted as the first two columns.
        """
        tmp = combined_df.pivot(index='new_date', columns='data_type', values='val')
        tmp.insert(0, "Participant Id", key)
        # Take the dates from the pivot's own index so that every row gets
        # its matching date even when combined_df is not sorted by new_date.
        tmp.insert(1, "date", tmp.index.to_numpy())
        return tmp

    def process(self):
        """Clean every participant frame and return the dict of results.

        Each entry of ``self.all_df`` is replaced by its cleaned frame, so
        the dict handed to the constructor is updated in place and is the
        object returned.
        """
        for key, df in self.all_df.items():
            res = self.newDate(df)
            combined_res = self.combineDF(res)
            transformed_res = self.transform(combined_res, key)
            # overwriting the value of an existing key is safe while iterating
            self.all_df[key] = transformed_res

        return self.all_df
        

In [None]:
# Aggregate every 6-month participant frame.
# NOTE: process() stores each result back into the dict it was constructed
# with and returns that dict, so cleaned_6month and all_df_6month are the
# same object afterwards.
cleaner_6month = Clean(all_df_6month)
cleaned_6month = cleaner_6month.process()
# print(cleaned_6month)

In [None]:
# Aggregate every 3-month participant frame.
# NOTE: process() stores each result back into the dict it was constructed
# with and returns that dict, so cleaned_3month and all_df_3month are the
# same object afterwards.
cleaner_3month = Clean(all_df_3month)
cleaned_3month = cleaner_3month.process()

#### Merge all of the dataframes into one

In [None]:
# Gather the cleaned per-participant frames of each export, stack them into
# one dataframe per export and write each to its CSV file.
df_list1 = list(cleaned_3month.values())
df_list2 = list(cleaned_6month.values())

final_3month = pd.concat(df_list1)
final_3month.to_csv('./3month_garmin_jessica.csv')

final_6month = pd.concat(df_list2)
final_6month.to_csv('./6month_garmin_jessica.csv')

This is a link that describes what each data type is: https://ilumivu.freshdesk.com/support/solutions/articles/9000169512-garmin-data  
  
To get the average resting heart rate, we should compute the mean using only the hr values recorded while "intensityMinutes" is 0. 
