--------------
# Creating the dataframe to be used for predicting demographics
---------------------

In [41]:
import json
import warnings
from datetime import datetime
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import seaborn as sns
from pymongo import MongoClient

# Installed before the remaining imports so warnings raised while importing
# them are suppressed as well.
warnings.filterwarnings("ignore")
from dataprep.eda import create_report

import feature_engineering_functions
import preprocessing_functions

In [42]:
# Load the unprocessed daily Fitbit frame from the project's data folder.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
fitbit = pd.read_pickle('../data/daily_fitbit_df_unprocessed.pkl')
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,sleep_rem_ratio,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,1.341772,8833.0,83.0,1349.0,0.0,0.0,<30,MALE,<19,
1,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,,15.8,80.0,0.833333,0.725,...,1.197531,9727.0,56.0,1374.0,4.0,0.0,<30,MALE,<19,
2,621e2e8e67b776a24055b564,2021-05-26,34.611011,57.481,119.212,,14.6,84.0,0.966667,0.725,...,1.670732,8253.0,85.0,1350.0,0.0,0.0,<30,MALE,<19,
3,621e2e8e67b776a24055b564,2021-05-27,34.408304,57.493,111.709,,14.8,82.0,0.933333,0.725,...,1.588235,9015.0,90.0,1282.0,0.0,0.0,<30,MALE,<19,
4,621e2e8e67b776a24055b564,2021-05-28,34.178922,56.75,103.034,,15.2,81.0,0.866667,0.725,...,1.090909,12949.0,146.0,1274.0,4.0,0.0,<30,MALE,<19,


In [43]:
# List the columns of the freshly loaded frame.
fitbit.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'badgeType', 'calories',
       'filteredDemographicVO2Max', 'distance', 'activityType', 'bpm',
       'lightly_active_minutes', 'moderately_active_minutes',
       'very_active_minutes', 'sedentary_minutes', 'mindfulness_session',
       'scl_avg', 'resting_hr', 'sleep_duration', 'minutesToFallAsleep',
       'minutesAsleep', 'minutesAwake', 'minutesAfterWakeup',
       'sleep_efficiency', 'sleep_deep_ratio', 'sleep_wake_ratio',
       'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert'],
      dtype='object')

---------------------------------------------------------
# Fitbit Basic Preprocessing
---------------------------------------------------------
1. Select the experiment days
2. Drop duplicates
3. Convert columns whose data types were incorrectly inferred as categorical

In [44]:
# Basic cleaning via the project helper. Per the section notes this covers
# experiment-day selection, de-duplication and dtype fixes (helper body not shown here).
fitbit = preprocessing_functions.fitbit_basic_preprocessing(fitbit)

In [45]:
# Reduce every timestamp to its calendar day (stored as a midnight datetime),
# then order the rows chronologically.
day_only = pd.to_datetime(fitbit["date"]).dt.date
fitbit["date"] = pd.to_datetime(day_only)
fitbit = fitbit.sort_values("date")

In [46]:
def fitbit_intraday_sleep(df, db_name):
    """Attach main-sleep start/end timestamps from MongoDB to the daily frame.

    Reads MongoDB credentials from ``../credentials.json`` (keys ``username``
    and ``password``), connects to the server at 127.0.0.1 and fetches every
    document of the ``fitbit`` collection with ``type == 'sleep'`` and
    ``data.mainSleep == True``. The ``dateOfSleep``, ``startTime`` and
    ``endTime`` fields of those documents are left-joined onto ``df`` by
    ``id`` and ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily frame with ``id`` and ``date`` columns. ``date`` has to hold
        midnight datetimes to match the join key built here.
    db_name : str
        Name of the MongoDB database holding the ``fitbit`` collection.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``startTime`` and ``endTime`` columns added; they are null
        for rows without a matching main-sleep record.
    """
    # Forward slashes resolve on Windows and POSIX alike, unlike a
    # backslash-separated literal.
    with open('../credentials.json') as f:
        credentials = json.load(f)

    # pymongo requires user names and passwords inside a URI to be
    # percent-escaped. Assumes credentials.json stores the raw (unescaped)
    # values — TODO confirm.
    username = quote_plus(str(credentials['username']))
    password = quote_plus(str(credentials['password']))

    client = MongoClient('mongodb://%s:%s@127.0.0.1' % (username, password))
    try:
        col = client[db_name].fitbit
        # Materialise the cursor while the connection is still open.
        records = list(col.find(
            {"$and": [
                {'type': 'sleep'},
                {'data.mainSleep': True}
            ]},
            {'_id': 0, 'id': 1, 'data.dateOfSleep': 1, 'data.startTime': 1, 'data.endTime': 1}
        ))
    finally:
        # Release the connection pool even when the query fails.
        client.close()

    if not records:
        # No sleep documents at all: keep the output schema, with null timestamps.
        return df.assign(startTime=None, endTime=None)

    # Each record carries the projected fields as a nested dict under 'data';
    # lift them into flat columns and drop the nested column.
    sleep = pd.DataFrame(records)
    sleep = sleep.assign(
        date=sleep['data'].str.get('dateOfSleep'),
        startTime=sleep['data'].str.get('startTime'),
        endTime=sleep['data'].str.get('endTime'),
    ).drop(columns=['data'])
    # Same day-level normalisation as applied to the daily frame's 'date' column.
    sleep['date'] = pd.to_datetime(pd.to_datetime(sleep['date']).dt.date)

    # NOTE(review): several main-sleep records for one (id, date) would
    # duplicate that day's row in the left join — confirm against the data.
    return df.merge(sleep, how='left', on=['id', 'date'])

In [47]:
# Attach main-sleep start/end times read from the 'rais' MongoDB database
fitbit = fitbit_intraday_sleep(fitbit, 'rais')
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,83.0,1349.0,0.0,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,278.0,766.0,29.0,1.0,>=30,MALE,>=30,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,175.0,1130.0,9.0,0.0,<30,MALE,21.0,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,<30,MALE,21.0,,,


# date engineering

In [48]:
# Date features via the project helper; the output below shows the added
# *_sin / *_cos columns for month, weekday, week and day.
fitbit = preprocessing_functions.date_engineering(fitbit)
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,startTime,endTime,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428


# One-hot encoding (this step also bins the raw BMI values into categories)

In [49]:
# BMI distribution before encoding: a mix of numeric values and range labels.
fitbit['bmi'].value_counts()

20.0    643
24.0    583
<19     458
22.0    457
21.0    456
23.0    392
19.0    301
26.0    264
>=25    261
>=30    257
27.0    196
29.0    171
28.0    130
25.0    129
Name: bmi, dtype: int64

In [50]:
# Encoding via the project helper. Judging by the surrounding outputs it replaces
# 'badgeType' / 'activityType' with indicator columns and maps BMI to category labels.
fitbit = preprocessing_functions.one_hot_encoding(fitbit)
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Martial Arts,Run,Spinning,Sport,Swim,Treadmill,Walk,Weights,Workout,Yoga/Pilates
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,,,,


In [51]:
# BMI distribution after encoding: category labels only.
fitbit['bmi'].value_counts()

Normal         3161
Overweight     1151
Underweight     458
Obese           257
Name: bmi, dtype: int64

In [52]:
# List the columns after date engineering and encoding.
fitbit.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'calories', 'filteredDemographicVO2Max',
       'distance', 'bpm', 'lightly_active_minutes',
       'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes',
       'mindfulness_session', 'scl_avg', 'resting_hr', 'sleep_duration',
       'minutesToFallAsleep', 'minutesAsleep', 'minutesAwake',
       'minutesAfterWakeup', 'sleep_efficiency', 'sleep_deep_ratio',
       'sleep_wake_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert', 'startTime', 'endTime',
       'month_sin', 'weekday_sin', 'week_sin', 'day_sin', 'month_cos',
      

In [53]:
# Checkpoint: persist the frame as it stands before the post-preprocessing
# and imputation cells below.
fitbit.to_pickle("../data/unified_dataframe/dataframe_demographics_unpreprocessed.pkl")

# post preprocessing

In [54]:
# Post-processing via project helpers (their implementations are not shown here).
# Presumably these add the 'wear_day', 'early_features' and 'used_during_night'
# columns that the exclusion list further down refers to — TODO confirm.
fitbit = preprocessing_functions.wear_days(fitbit)
fitbit = preprocessing_functions.use_EDA_SpO2_ECG(fitbit)
fitbit = preprocessing_functions.use_during_sleep(fitbit)

In [55]:
# List the columns after the post-processing helpers.
fitbit.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'calories', 'filteredDemographicVO2Max',
       'distance', 'bpm', 'lightly_active_minutes',
       'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes',
       'mindfulness_session', 'scl_avg', 'resting_hr', 'sleep_duration',
       'minutesToFallAsleep', 'minutesAsleep', 'minutesAwake',
       'minutesAfterWakeup', 'sleep_efficiency', 'sleep_deep_ratio',
       'sleep_wake_ratio', 'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert', 'startTime', 'endTime',
       'month_sin', 'weekday_sin', 'week_sin', 'day_sin', 'month_cos',
      

In [56]:
# Columns kept out of numeric coercion and median imputation: identifiers,
# timestamps, cyclical date encodings, indicator columns, the usage flags,
# 'filteredDemographicVO2Max' and the demographic columns (age, gender, bmi).
columns_excluded = [
    'id', 'date', 'startTime', 'endTime',
    'month_sin', 'weekday_sin', 'week_sin', 'day_sin', 'month_cos',
    'weekday_cos', 'week_cos', 'day_cos', 'DAILY_FLOORS', 'DAILY_STEPS',
    'GOAL_BASED_WEIGHT_LOSS', 'LIFETIME_DISTANCE', 'LIFETIME_FLOORS',
    'LIFETIME_WEIGHT_GOAL_SETUP', 'Aerobic Workout', 'Bike', 'Bootcamp',
    'Circuit Training', 'Elliptical', 'Hike', 'Interval Workout',
    'Martial Arts', 'Run', 'Spinning', 'Sport', 'Swim', 'Treadmill', 'Walk',
    'Weights', 'Workout', 'Yoga/Pilates', 'wear_day', 'early_features',
    'used_during_night', 'filteredDemographicVO2Max', 'age',
    'gender', 'bmi',
]

# Fail fast if an expected column is absent instead of silently skipping it.
missing_columns = [name for name in columns_excluded if name not in fitbit.columns]
if missing_columns:
    raise KeyError("%s not in fitbit columns" % missing_columns)

# Only the labels are needed, so filter the column index directly rather than
# copying the data; the frame's column order is preserved.
excluded_lookup = set(columns_excluded)
cols = [name for name in fitbit.columns if name not in excluded_lookup]

In [57]:
# continuous features
# Coerce each remaining column to numeric with a single vectorised call
# (values that cannot be parsed become NaN), then fill the gaps with the
# column median.
for col in cols:
    numeric = pd.to_numeric(fitbit[col], errors='coerce')
    fitbit[col] = numeric.fillna(numeric.median())

In [58]:
# categorical features
# Missing values in these columns are replaced by the column's most frequent value.
# NOTE(review): 'mindfulness_session' and 'heart_rate_alert' are not in the
# exclusion list of the continuous-feature step, so they have already been
# numeric-coerced and median-filled by the time this cell runs — confirm intended.
categorical = ['mindfulness_session', 'heart_rate_alert', 'DAILY_FLOORS', 'DAILY_STEPS',
               'GOAL_BASED_WEIGHT_LOSS', 'LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP',
               'Aerobic Workout', 'Bike', 'Bootcamp', 'Circuit Training', 'Elliptical', 'Hike', 'Interval Workout',
               'Martial Arts', 'Run', 'Spinning', 'Sport', 'Swim', 'Treadmill', 'Walk', 'Weights', 'Workout',
               'Yoga/Pilates']
for col in categorical:
    modes = fitbit[col].mode()
    if modes.empty:
        # mode() drops nulls, so an all-null column has no mode; name the
        # column instead of failing with a bare IndexError on .iloc[0].
        raise ValueError("cannot mode-impute %r: column has no non-null values" % col)
    fitbit[col] = fitbit[col].fillna(modes.iloc[0])

In [59]:
# Optional: open a dataprep EDA report of the imputed frame in the browser (left disabled).
#create_report(fitbit).show_browser()

# adding all new features

In [60]:
# Append engineered features via the project helper, called with 'daily'.
# The frame displayed next ends with per-hour step columns (Steps_hour14 … Steps_hour23 are visible).
fitbit = feature_engineering_functions.add_features(fitbit, 'daily')

In [61]:
# Display the final frame (the rendering is truncated to the first and last rows).
fitbit

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Steps_hour14,Steps_hour15,Steps_hour16,Steps_hour17,Steps_hour18,Steps_hour19,Steps_hour20,Steps_hour21,Steps_hour22,Steps_hour23
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,...,191.0,33.0,342.0,1712.0,1838.0,160.0,155.0,37.0,31.0,53.0
1,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,95.9,15.8,80.0,0.833333,0.725,...,120.0,24.0,446.0,377.0,3165.0,332.0,97.0,9.0,24.0,58.0
2,621e2e8e67b776a24055b564,2021-05-26,34.611011,57.481,119.212,95.9,14.6,84.0,0.966667,0.725,...,85.0,477.0,390.0,2821.0,293.0,158.0,66.0,0.0,70.0,74.0
3,621e2e8e67b776a24055b564,2021-05-27,34.408304,57.493,111.709,95.9,14.8,82.0,0.933333,0.725,...,58.0,296.0,401.0,21.0,3054.0,503.0,411.0,96.0,63.0,8.0
4,621e2e8e67b776a24055b564,2021-05-28,34.178922,56.750,103.034,95.9,15.2,81.0,0.866667,0.725,...,250.0,82.0,1363.0,3014.0,81.0,104.0,1984.0,18.0,141.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,621e375b67b776a240290cdc,2021-07-22,33.956446,62.554,34.369,95.9,14.8,0.0,0.000000,0.000,...,7.0,60.0,7.0,163.0,92.0,106.0,2034.0,3453.0,49.0,170.0
5023,621e375b67b776a240290cdc,2021-07-23,34.011607,55.542,30.813,95.9,16.6,81.0,0.700000,0.800,...,1013.0,810.0,371.0,833.0,1411.0,93.0,424.0,4029.0,1083.0,42.0
5024,621e375b67b776a240290cdc,2021-07-24,33.687826,69.579,19.407,95.9,17.0,84.0,0.833333,0.900,...,1656.0,1276.0,1903.0,534.0,491.0,583.0,84.0,1103.0,621.0,737.0
5025,621e375b67b776a240290cdc,2021-07-25,34.112386,65.899,22.892,95.9,18.0,84.0,0.833333,0.900,...,42.0,1235.0,716.0,1130.0,1163.0,888.0,245.0,1401.0,54.0,1627.0


In [62]:
# Persist the imputed, feature-enriched frame.
fitbit.to_pickle("../data/unified_dataframe/dataframe_demographics_preprocessed.pkl")