In [2]:
import gc
import os
import json
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from datetime import date
from bson import ObjectId
import matplotlib.dates as md
from datetime import datetime
from pymongo import MongoClient
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

Connect securely to the database

In [3]:
# setup mongo connection for reading extra data
with open('.\\..\\credentials.json') as f:
    data = json.load(f)
    username = data['username']
    password = data['password']
client = MongoClient('mongodb://%s:%s@127.0.0.1' % (username, password))
db = client.rais_anonymized
col = db.fitbit

Find all users

In [17]:
users = db.fitbit.distinct('id')

Integrate all the fitbit data one by one with hourly granularity

Computed Temperature
Not applicable in hourly granularity.

Daily Heart Rate Variability Summary
Not applicable in hourly granularity.

Daily SpO2
Not applicable in hourly granularity.

Respiratory Rate Summary
Not applicable in hourly granularity.

Stress Score
Not applicable in hourly granularity.

Wrist Temperature

In [18]:
users_wrist_temperature = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Wrist Temperature"},
            {"id": user}
        ]},
            {"id": 1, "data.recorded_time": 1, "data.temperature": 1, "_id": 0}
        )
    ))

    users_wrist_temperature = pd.concat([users_wrist_temperature, user_data], axis=0)

# split data column (json format) into two columns
users_wrist_temperature["date"] = users_wrist_temperature["data"].apply(lambda d: d["recorded_time"])
users_wrist_temperature["temperature"] = users_wrist_temperature["data"].apply(lambda d: d["temperature"])
users_wrist_temperature.drop(["data"], inplace=True, axis=1)

# convert timestamp date object and then to datetime64[ns]
users_wrist_temperature["date"] = pd.to_datetime(users_wrist_temperature["date"], infer_datetime_format=True)
users_wrist_temperature["hour"] = users_wrist_temperature["date"].dt.hour
users_wrist_temperature["date"] = pd.to_datetime(users_wrist_temperature["date"].dt.date, infer_datetime_format=True)

# group by date and then take the average
users_wrist_temperature = users_wrist_temperature.groupby(['id', 'date', 'hour']).mean()
users_wrist_temperature.reset_index(drop=False, inplace=True)

# merge
df = users_wrist_temperature
df

Unnamed: 0,id,date,hour,temperature
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455
...,...,...,...,...
74746,621e375b67b776a240290cdc,2021-07-29,19,-1.698178
74747,621e375b67b776a240290cdc,2021-07-29,20,-3.432095
74748,621e375b67b776a240290cdc,2021-07-29,21,-2.635845
74749,621e375b67b776a240290cdc,2021-07-29,22,-1.439095


Badge

In [20]:
users_badges = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "badge"},
            {"id": user}
        ]},
            {"id": 1, "data.dateTime": 1, "data.badgeType": 1, "_id": 0}
        )
    ))

    users_badges = pd.concat([users_badges, user_data], axis=0)

# split data column (json format) into two columns
users_badges["date"] = users_badges["data"].apply(lambda d: d["dateTime"])
users_badges["badgeType"] = users_badges["data"].apply(lambda d: d["badgeType"])
users_badges.drop(["data"], inplace=True, axis=1)

# convert timestamp date object and then to datetime64[ns]
users_badges["date"] = pd.to_datetime(users_badges["date"])
users_badges["hour"] = users_badges["date"].dt.hour
users_badges["date"] = pd.to_datetime(users_badges["date"].dt.date)

# group by ID and date to have only unique dates per participant
users_badges = users_badges.groupby(['id', 'date', 'hour']).badgeType.apply(list).reset_index(drop=False)
users_badges.badgeType = users_badges.badgeType.apply(lambda l: list(set(l)) if isinstance(l, list) else l)
users_badges.badgeType = users_badges.badgeType.apply(lambda l: np.NaN if l == [np.nan] else l)

# merge
df = df.merge(users_badges, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,temperature,badgeType
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,
...,...,...,...,...,...
74914,621e375367b776a24021e950,2022-01-18,0,,[LIFETIME_FLOORS]
74915,621e375367b776a24021e950,2022-01-19,0,,[DAILY_FLOORS]
74916,621e375367b776a24021e950,2022-01-20,0,,[DAILY_FLOORS]
74917,621e375b67b776a240290cdc,2021-07-31,0,,"[DAILY_FLOORS, DAILY_STEPS]"


Calories

In [23]:
if os.path.exists('data/users_calories_hourly.pkl'):
    print("Reading from pickle...")
    f = open("data/users_calories_hourly.pkl", "rb")
    # disable garbage collector
    gc.disable()
    # read pickle
    users_calories = pickle.load(f)
    # enable garbage collector again
    gc.enable()
    f.close()
else:
    users_calories = pd.DataFrame(columns=["id", "data"])
    for user in tqdm(users):
        user_data = pd.DataFrame(list(
            db.fitbit.find({"$and": [
                {"type": "calories"},
                {"id": user}
            ]},
                {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0}
            )
        ))

        users_calories = pd.concat([users_calories, user_data], axis=0)

    print("Column Split...")
    # split data column (json format) into two columns
    users_calories.reset_index(drop=True, inplace=True)
    users_calories["date"] = users_calories["data"].apply(lambda d: d["dateTime"])
    users_calories["calories"] = users_calories["data"].apply(lambda d: d["value"])
    users_calories.drop(["data"], inplace=True, axis=1)
    print("Column Split Completed")
    print("Date Conversion...")
    # convert timestamp date object and then to datetime64[ns]
    users_calories["date"] = pd.to_datetime(users_calories["date"], infer_datetime_format=True)
    users_calories["hour"] = users_calories["date"].dt.hour
    users_calories["date"] = pd.to_datetime(users_calories["date"].dt.date, format="%Y/%m/%d")
    print("Date Conversion Completed")
    users_calories.to_pickle('data/users_calories_hourly.pkl')

# group by date and then take the average
users_calories.calories = users_calories.calories.astype(float)
users_calories = users_calories.groupby(['id', 'date', 'hour']).sum()
users_calories.reset_index(drop=False, inplace=True)

# merge
df = df.merge(users_calories, how='outer', on=['id', 'date', 'hour'])
df

100%|██████████| 71/71 [03:38<00:00,  3.07s/it]


Column Split...
Column Split Completed
Date Conversion...
Date Conversion Completed


Unnamed: 0,id,date,hour,temperature,badgeType,calories
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,,89.04
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,,65.62
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,,65.40
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,,69.01
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,,65.73
...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,54.00
159015,621e375b67b776a240290cdc,2021-08-17,8,,,54.00
159016,621e375b67b776a240290cdc,2021-08-17,9,,,54.00
159017,621e375b67b776a240290cdc,2021-08-17,10,,,54.00


VO2 Max
Not applicable at hourly granularity.

Distance

In [24]:
if os.path.exists('data/users_distance_hourly.pkl'):
    print("Reading from pickle...")
    f = open("data/users_distance_hourly.pkl", "rb")
    # disable garbage collector
    gc.disable()
    # read pickle
    users_distance = pickle.load(f)
    # enable garbage collector again
    gc.enable()
    f.close()
else:
    users_distance = pd.DataFrame(columns=["id", "data"])
    for user in tqdm(users):
        user_data = pd.DataFrame(list(
            db.fitbit.find({"$and": [
                {"type": "distance"},
                {"id": user}
            ]},
                {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0}
            )
        ))

        users_distance = pd.concat([users_distance, user_data], axis=0)

    print("Column Split...")
    # split data column (json format) into two columns
    users_distance.reset_index(drop=True, inplace=True)
    users_distance["date"] = users_distance["data"].apply(lambda d: d["dateTime"])
    users_distance["distance"] = users_distance["data"].apply(lambda d: d["value"])
    users_distance.drop(["data"], inplace=True, axis=1)
    print("Column Split Completed")
    print("Date Conversion...")
    # convert timestamp date object and then to datetime64[ns]
    users_distance["date"] = pd.to_datetime(users_distance["date"], infer_datetime_format=True)
    users_distance["hour"] = users_distance["date"].dt.hour
    users_distance["date"] = pd.to_datetime(users_distance["date"].dt.date, infer_datetime_format=True)
    print("Date Conversion Completed")
    users_distance.to_pickle('data/users_distance_hourly.pkl')

# group by date and then take the average
users_distance.distance = users_distance.distance.astype(float)
users_distance = users_distance.groupby(['id', 'date', 'hour']).sum()
users_distance.reset_index(drop=False, inplace=True)
users_distance.distance = users_distance.distance / 100  # converts cm to m

# merge
df = df.merge(users_distance, how='outer', on=['id', 'date', 'hour'])
df

100%|██████████| 71/71 [00:42<00:00,  1.69it/s]


Column Split...
Column Split Completed
Date Conversion...
Date Conversion Completed


Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,,89.04,98.3
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,,65.62,0.0
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,,65.40,
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,,69.01,11.0
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,,65.73,0.0
...,...,...,...,...,...,...,...
159014,621e375b67b776a240290cdc,2021-08-17,7,,,54.00,
159015,621e375b67b776a240290cdc,2021-08-17,8,,,54.00,
159016,621e375b67b776a240290cdc,2021-08-17,9,,,54.00,
159017,621e375b67b776a240290cdc,2021-08-17,10,,,54.00,


Exercise

In [25]:
users_exercise = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "exercise"},
            {"id": user}
        ]},
            {"id": 1, "data.originalStartTime": 1, "data.activityTypeId": 1, "_id": 0}
        )
    ))

    users_exercise = pd.concat([users_exercise, user_data], axis=0)

# split data column (json format) into two columns
users_exercise["date"] = users_exercise["data"].apply(lambda d: d["originalStartTime"])
users_exercise["activityType"] = users_exercise["data"].apply(lambda d: d["activityTypeId"])
users_exercise.drop(["data"], inplace=True, axis=1)

# convert timestamp date object and then to datetime64[ns]
users_exercise["date"] = pd.to_datetime(users_exercise["date"], infer_datetime_format=True)
users_exercise["hour"] = users_exercise["date"].dt.hour
users_exercise["date"] = pd.to_datetime(users_exercise["date"].dt.date, infer_datetime_format=True)

# Get distinct activity types
activity_types = pd.DataFrame(list(
        db.fitbit.find(
            {"type": "exercise"},
            {"id": 1, "data.activityTypeId": 1, "data.activityName": 1, "_id": 0}
        )
    ))
activity_types["activityTypeId"] = activity_types.data.apply(lambda d: d["activityTypeId"])
activity_types["activityName"] = activity_types.data.apply(lambda d: d["activityName"])
activity_types.drop(["data", "id"], inplace=True, axis=1)
activity_types = activity_types.drop_duplicates().reset_index(drop=True)
ACTIVITIES = {
    90013: "Walk",
    15000: "Sport",
    3001: "Aerobic Workout",
    52000: "Yoga/Pilates",
    90024: "Swim",
    90001: "Bike",
    20047: "Elliptical",
    2131: "Weights",
    55001: "Spinning",
    1071: "Bike",
    90009: "Run",
    20049: "Treadmill",
    53000: "Yoga/Pilates",
    55002: "Martial Arts",
    2040: "Circuit Training",
    2065: "Stairclimber",
    3000: "Workout",
    90012: "Hike",
    12339646: "Run",
    12350445: "Walk",
    23418750: "Swim",
    55003: "Bootcamp",
    15430: "Martial Arts",
    20057: "Interval Workout",
    15675: "Tennis",
    61980497: "Workout"
}
users_exercise["activityType"] = users_exercise["activityType"].apply(lambda a: ACTIVITIES.get(a))
users_exercise = users_exercise.groupby(['id', 'date', 'hour']).activityType.apply(list).reset_index(drop=False)
users_exercise.activityType = users_exercise.activityType.apply(lambda l: list(set(l)) if isinstance(l, list) else l)

# merge
df = df.merge(users_exercise, how='outer', on=['id', 'date', 'hour'])
df.to_pickle("data/temp_df_1_hourly.pkl")
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,,89.04,98.3,
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,,65.62,0.0,
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,,65.40,,
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,,69.01,11.0,
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,,65.73,0.0,
...,...,...,...,...,...,...,...,...
159039,621e323667b776a240f19134,2021-06-13,21,,,,,[Walk]
159040,621e323667b776a240f19134,2021-06-28,20,,,,,[Walk]
159041,621e339967b776a240e502de,2021-10-09,12,,,,,[Hike]
159042,621e339967b776a240e502de,2021-10-10,9,,,,,[Spinning]


Heart Rate

In [26]:
if os.path.exists('data/temp_df_1_hourly.pkl'):
    print("Reading DataFrame from pickle...")
    f = open("data/temp_df_1_hourly.pkl", "rb")
    # disable garbage collector
    gc.disable()
    # read pickle
    df = pickle.load(f)
    # enable garbage collector again
    gc.enable()
    f.close()
    print("Reading completed.")

Reading DataFrame from pickle...
Reading completed.


In [27]:
if os.path.exists('data/users_hr_hourly.pkl'):
    print("Reading hourly heart rate data from pickle...")
    f = open("data/users_hr_hourly.pkl", "rb")
    # disable garbage collector
    gc.disable()
    # read pickle
    users_hr = pickle.load(f)
    # enable garbage collector again
    gc.enable()
    f.close()
    print("Reading completed.")
else:
    warnings.warn("\nTo read and aggregate heart rate data from MongoDB you need to ensure index existence for both query (type, id) and projection (data.dateTime, data.value.bpm) fields (compound index of four fields)...\n")
    rows = 0
    users_hr = pd.DataFrame(columns=["id", "date", "bpm"])
    for user in tqdm(users):
        user_data = pd.DataFrame(list(
            db.fitbit.find({"$and": [
                {"id": user},
                {"type": "heart_rate"}
            ]},
                {"id": 1, "data.dateTime": 1, "data.value.bpm": 1, "_id": 0}
            )
        ))

        # split data column (json format) into two columns
        # user_data.reset_index(drop=True, inplace=True)
        user_data["date"] = user_data["data"].apply(lambda d: d["dateTime"])
        user_data["bpm"] = user_data["data"].apply(lambda d: d["value"].get("bpm"))
        user_data.drop(["data"], inplace=True, axis=1)
        # convert timestamp date object and then to datetime64[ns]
        # user_data["date"] = pd.to_datetime(user_data["date"], format="%Y-%m-%dT%H:%M:%S")
        user_data["date"] = pd.to_datetime(user_data["date"], infer_datetime_format=True)
        user_data['hour'] = user_data['date'].dt.hour
        user_data['date'] = pd.to_datetime(user_data['date'].dt.date, infer_datetime_format=True)

        # group by date and hour and then take the average
        user_data.bpm = user_data.bpm.astype(float)
        user_data = user_data.groupby(['id', 'date', 'hour']).mean()
        user_data.reset_index(drop=False, inplace=True)

        users_hr = pd.concat([users_hr, user_data], axis=0)
    users_hr.to_pickle('data/users_hr_hourly.pkl')

# convert timestamp date object and then to datetime64[ns]
users_hr["date"] = pd.to_datetime(pd.to_datetime(users_hr["date"], format="%Y/%m/%d  %H:%M:%S").dt.date, format="%Y/%m/%d")

# merge
df = df.merge(users_hr, how='outer', on=['id', 'date', 'hour'])
df

To read and aggregate heart rate data from MongoDB you need to ensure index existence for both query (type, id) and projection (data.dateTime, data.value.bpm) fields (compound index of four fields)...

100%|██████████| 71/71 [16:10<00:00, 13.67s/it]


Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,,89.04,98.3,,66.874763
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,,65.62,0.0,,58.711364
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,,65.40,,,55.133495
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,,69.01,11.0,,60.488613
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,,65.73,0.0,,59.110638
...,...,...,...,...,...,...,...,...,...
159039,621e323667b776a240f19134,2021-06-13,21,,,,,[Walk],
159040,621e323667b776a240f19134,2021-06-28,20,,,,,[Walk],
159041,621e339967b776a240e502de,2021-10-09,12,,,,,[Hike],
159042,621e339967b776a240e502de,2021-10-10,9,,,,,[Spinning],


Lightly Active Minutes
Not applicable at hourly granularity.

Moderately Active Minutes
Not applicable at hourly granularity.

Very Active Minutes
Not applicable at hourly granularity.

Sedentary Minutes
Not applicable at hourly granularity.

In [28]:
df.to_pickle('data/temp_df_2_hourly.pkl')
df = pd.read_pickle('data/temp_df_2_hourly.pkl')

Mindfulness Sessions

In [29]:
def try_parsing_date(text):
    for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M%z'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            pass
    raise ValueError('no valid date format found for {}'.format(text))

In [30]:
users_mindfulness = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "mindfulness_sessions"},
            {"id": user}
        ]},
            {"id": 1, "data.start_date_time": 1, "data.session_type": 1, "_id": 0}
        )
    ))

    users_mindfulness = pd.concat([users_mindfulness, user_data], axis=0)

# split data column (json format) into two columns
users_mindfulness["date"] = users_mindfulness["data"].apply(lambda d: d["start_date_time"])
users_mindfulness["activityType"] = users_mindfulness["data"].apply(lambda d: d["session_type"])
users_mindfulness.drop(["data"], inplace=True, axis=1)

# convert timestamp date object and then to datetime64[ns]
# users_mindfulness["date"] = pd.to_datetime(users_mindfulness["date"], format='%Y-%m-%dT%H:%M:%S%z')
users_mindfulness["date"] = pd.to_datetime(users_mindfulness["date"], infer_datetime_format=True)
users_mindfulness["hour"] = users_mindfulness["date"].dt.hour
users_mindfulness["date"] = pd.to_datetime(users_mindfulness["date"].dt.date, infer_datetime_format=True)

users_mindfulness["mindfulness_session"] = True  # instead of storing the session type, only store if user engaged in session
users_mindfulness.drop(['activityType'], axis=1, inplace=True)
users_mindfulness = users_mindfulness.drop_duplicates()

# merge
df = df.merge(users_mindfulness, how='outer', on=['id', 'date', 'hour'])
df.mindfulness_session.fillna('False', inplace=True)
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session
0,621e2e8e67b776a24055b564,2021-05-24,0,-1.466659,,89.04,98.3,,66.874763,False
1,621e2e8e67b776a24055b564,2021-05-24,1,0.210289,,65.62,0.0,,58.711364,False
2,621e2e8e67b776a24055b564,2021-05-24,2,-1.359378,,65.40,,,55.133495,False
3,621e2e8e67b776a24055b564,2021-05-24,3,-0.316295,,69.01,11.0,,60.488613,False
4,621e2e8e67b776a24055b564,2021-05-24,4,0.416455,,65.73,0.0,,59.110638,False
...,...,...,...,...,...,...,...,...,...,...
159039,621e323667b776a240f19134,2021-06-13,21,,,,,[Walk],,False
159040,621e323667b776a240f19134,2021-06-28,20,,,,,[Walk],,False
159041,621e339967b776a240e502de,2021-10-09,12,,,,,[Hike],,False
159042,621e339967b776a240e502de,2021-10-10,9,,,,,[Spinning],,False


Mindfulness EDA Data Sessions

In [31]:
users_eda = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "mindfulness_eda_data_sessions"},
            {"id": user}
        ]},
            {"id": 1, "data.timestamp": 1, "data.scl_avg": 1, "_id": 0}
        )
    ))

    users_eda = pd.concat([users_eda, user_data], axis=0)

# split data column (json format) into two columns
users_eda["date"] = users_eda["data"].apply(lambda d: d["timestamp"])
users_eda["scl_avg"] = users_eda["data"].apply(lambda d: d["scl_avg"])
users_eda.drop(["data"], inplace=True, axis=1)

users_eda.reset_index(drop=True, inplace=True)
# convert timestamp date object and then to datetime64[ns]
users_eda["date"] = pd.to_datetime(users_exercise["date"], infer_datetime_format=True)
users_eda["hour"] = users_eda["date"].dt.hour
users_eda["date"] = pd.to_datetime(users_eda["date"].dt.date, infer_datetime_format=True)

# group by date and then take the average
users_eda = users_eda.groupby(['id', 'date', 'hour']).mean()
users_eda.reset_index(drop=False, inplace=True)

# merge
df = df.merge(users_eda, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,scl_avg
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,
...,...,...,...,...,...,...,...,...,...,...,...
159309,621e2f9167b776a240011ccb,2021-10-18,0.0,,,,,,,,14.363017
159310,621e2f9167b776a240011ccb,2021-10-19,0.0,,,,,,,,13.025694
159311,621e2f9167b776a240011ccb,2021-10-21,0.0,,,,,,,,15.787319
159312,621e2f9167b776a240011ccb,2022-01-20,0.0,,,,,,,,15.145738


Resting Heart Rate
Not applicable at hourly granularity.

Sleep

Steps

In [33]:
if not os.path.exists("data/users_steps_hourly.pkl"):
    steps_dataframe = pd.DataFrame(columns=["date", "hour", "steps", "id"])
    for user in tqdm(users):
        user_dataframe = pd.DataFrame(list(
            db.fitbit.find(
                {"type": "steps",
                 "id": user},
                {"data.dateTime": 1, "data.value": 1, "id": 1, "_id": 0}
            )
        ))
        user_dataframe['date'] = user_dataframe['data'].apply(lambda d: d['dateTime'])
        user_dataframe['steps'] = user_dataframe['data'].apply(lambda d: d['value'])
        user_dataframe.drop(["data"], inplace=True, axis=1)

        # basic preprocessing for steps - transformations
        user_dataframe['steps'] = pd.to_numeric(user_dataframe['steps'])  # was string
        user_dataframe["date"] = pd.to_datetime(user_dataframe["date"], infer_datetime_format=True)
        user_dataframe["hour"] = user_dataframe["date"].dt.hour
        user_dataframe["date"] = pd.to_datetime(user_dataframe["date"].dt.date, infer_datetime_format=True)

        steps_dataframe = pd.concat([steps_dataframe, user_dataframe], axis=0, ignore_index=True)

    # group by date and then take the sum
    steps_dataframe = steps_dataframe.groupby(['id', 'date', 'hour']).sum()
    steps_dataframe.reset_index(drop=False, inplace=True)
    steps_dataframe.to_pickle("data/users_steps_hourly.pkl")

infile = open('data/users_steps_hourly.pkl','rb')
steps_daily = pickle.load(infile)
infile.close()

# merge
df = df.merge(steps_daily, how='outer', on=['id', 'date', 'hour'])
df

100%|██████████| 71/71 [01:02<00:00,  1.14it/s]


Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,scl_avg,steps
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,,134.0
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,,0.0
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,,
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,,15.0
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
159309,621e2f9167b776a240011ccb,2021-10-18,0.0,,,,,,,,14.363017,
159310,621e2f9167b776a240011ccb,2021-10-19,0.0,,,,,,,,13.025694,
159311,621e2f9167b776a240011ccb,2021-10-21,0.0,,,,,,,,15.787319,
159312,621e2f9167b776a240011ccb,2022-01-20,0.0,,,,,,,,15.145738,


Time in Heart Rate Zones

In [34]:
users_time_in_heart_rate_zones = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "time_in_heart_rate_zones"},
            {"id": user}
        ]},
            {"id": 1, "data.dateTime": 1, "data.value": 1, "_id": 0}
        )
    ))

    users_time_in_heart_rate_zones = pd.concat([users_time_in_heart_rate_zones, user_data], axis=0)

# split data column (json format) into two columns
users_time_in_heart_rate_zones["date"] = users_time_in_heart_rate_zones["data"].apply(lambda d: d["dateTime"])
users_time_in_heart_rate_zones["minutes_in_default_zone_1"] = users_time_in_heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_1"))
users_time_in_heart_rate_zones["minutes_below_default_zone_1"] = users_time_in_heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("BELOW_DEFAULT_ZONE_1"))
users_time_in_heart_rate_zones["minutes_in_default_zone_2"] = users_time_in_heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_2"))
users_time_in_heart_rate_zones["minutes_in_default_zone_3"] = users_time_in_heart_rate_zones["data"].apply(lambda d: d["value"].get("valuesInZones").get("IN_DEFAULT_ZONE_3"))
users_time_in_heart_rate_zones.drop(["data"], inplace=True, axis=1)

# convert timestamp date object and then to datetime64[ns]
users_time_in_heart_rate_zones["date"] = pd.to_datetime(users_time_in_heart_rate_zones["date"], format="%m/%d/%y %H:%M:%S")
users_time_in_heart_rate_zones["hour"] = users_time_in_heart_rate_zones["date"].dt.hour
users_time_in_heart_rate_zones["date"] = pd.to_datetime(users_time_in_heart_rate_zones["date"].dt.date, infer_datetime_format=True)

# merge
df = df.merge(users_time_in_heart_rate_zones, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,scl_avg,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,,134.0,83.0,1349.0,0.0,0.0
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,,0.0,,,,
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,,15.0,,,,
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159377,621e2f9167b776a240011ccb,2021-10-18,0.0,,,,,,,,14.363017,,,,,
159378,621e2f9167b776a240011ccb,2021-10-19,0.0,,,,,,,,13.025694,,,,,
159379,621e2f9167b776a240011ccb,2021-10-21,0.0,,,,,,,,15.787319,,,,,
159380,621e2f9167b776a240011ccb,2022-01-20,0.0,,,,,,,,15.145738,,,,,


Profile

In [35]:
def get_age(date_of_birth):
    today = date.today()
    return today.year - date_of_birth.year - ((today.month, today.day) < (date_of_birth.month, date_of_birth.day))

In [36]:
users_profiles = pd.DataFrame(columns=["id", "data"])
for user in users:
    user_data = pd.DataFrame(list(
        db.fitbit.find({"$and": [
            {"type": "Profile"},
            {"id": user}
        ]},
            # {"id": 1, "data.date_of_birth": 1, "data.gender": 1, "data.height": 1, "data.weight": 1,  "_id": 0}
            {"id": 1, "data.age": 1, "data.gender": 1, "data.bmi": 1,  "_id": 0}
        )
    ))

    users_profiles = pd.concat([users_profiles, user_data], axis=0)

users_profiles["age"] = users_profiles["data"].apply(lambda d: d["age"] if "age" in d else np.NaN)
users_profiles["gender"] = users_profiles["data"].apply(lambda d: d["gender"] if "gender" in d else np.NaN)
users_profiles["bmi"] = users_profiles["data"].apply(lambda d: d["bmi"] if "bmi" in d else np.NaN)

users_profiles.drop(['data'], axis=1, inplace=True)

# merge
df = df.merge(users_profiles, how='left', on=['id'])
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,scl_avg,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,,134.0,83.0,1349.0,0.0,0.0,<30,MALE,<19
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,,0.0,,,,,<30,MALE,<19
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,,,,,,,<30,MALE,<19
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,,15.0,,,,,<30,MALE,<19
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,,0.0,,,,,<30,MALE,<19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159377,621e2f9167b776a240011ccb,2021-10-18,0.0,,,,,,,,14.363017,,,,,,>=30,FEMALE,20.0
159378,621e2f9167b776a240011ccb,2021-10-19,0.0,,,,,,,,13.025694,,,,,,>=30,FEMALE,20.0
159379,621e2f9167b776a240011ccb,2021-10-21,0.0,,,,,,,,15.787319,,,,,,>=30,FEMALE,20.0
159380,621e2f9167b776a240011ccb,2022-01-20,0.0,,,,,,,,15.145738,,,,,,>=30,FEMALE,20.0


In [37]:
df.to_pickle('./data/hourly_fitbit_df_unprocessed.pkl')
df = pd.read_pickle('./data/hourly_fitbit_df_unprocessed.pkl')
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,scl_avg,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,,134.0,83.0,1349.0,0.0,0.0,<30,MALE,<19
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,,0.0,,,,,<30,MALE,<19
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,,,,,,,<30,MALE,<19
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,,15.0,,,,,<30,MALE,<19
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,,0.0,,,,,<30,MALE,<19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159377,621e2f9167b776a240011ccb,2021-10-18,0.0,,,,,,,,14.363017,,,,,,>=30,FEMALE,20.0
159378,621e2f9167b776a240011ccb,2021-10-19,0.0,,,,,,,,13.025694,,,,,,>=30,FEMALE,20.0
159379,621e2f9167b776a240011ccb,2021-10-21,0.0,,,,,,,,15.787319,,,,,,>=30,FEMALE,20.0
159380,621e2f9167b776a240011ccb,2022-01-20,0.0,,,,,,,,15.145738,,,,,,>=30,FEMALE,20.0


Integrating SEMA data

Step goals

In [38]:
# user goals in SEMA
SEMA_GOALS_TO_MIN = {
    "2000": 0,
    "4999": 2000,
    "7999": 5000,
    "9999": 8000,
    "14999": 10000,
    "19999": 15000,
    "24999": 20000,
    "25000": 25000,
    "NO_GOAL": 0
}

SEMA_GOALS_TO_MAX = {
    "2000": 2000,
    "4999": 5000,
    "7999": 8000,
    "9999": 10000,
    "14999": 15000,
    "19999": 20000,
    "24999": 25000,
    "25000": 30000,
    "NO_GOAL": 0
}

SEMA_LABELS = {
    "2000": "Less than 2000",
    "4999": "2000-4999",
    "7999": "5000-7999",
    "9999": "8000-9999",
    "14999": "10000-14999",
    "19999": "15000-19999",
    "24999": "20000-24999",
    "25000": "More than 25000",
    "NO_GOAL": "No Goal",
    np.nan: "No Goal",
    None: "No Goal"
}

In [39]:
if not os.path.exists("data/users_step_goals_hourly.pkl"):
    # Get user self-reported goals from SEMA
    sema_goals = pd.DataFrame(columns=["_id", "user_id", "data"])
    for user in users:
        user_data = pd.DataFrame(list(
                        db.sema.find({ "$and": [
                            { "data.STEPS": { "$ne": "<no-response>" } },
                            {"user_id": user}
                        ] },
                            {"data.STEPS": 1, "id": 1, "user_id": 1, "data.STARTED_TS": 1}
                        )
                    ))

        sema_goals = pd.concat([sema_goals, user_data], axis=0)

    # split data column (json format) into two columns
    sema_goals["timestamp"] = sema_goals["data"].apply(lambda d: d["STARTED_TS"])
    sema_goals["step_goal"] = sema_goals["data"].apply(lambda d: d["STEPS"])
    sema_goals.drop(["data", "_id"], inplace=True, axis=1)

    # convert timestamp to day format
    sema_goals["date"] = pd.to_datetime(sema_goals["timestamp"], infer_datetime_format=True)
    sema_goals["hour"] = sema_goals["date"].dt.hour
    sema_goals["date"] = pd.to_datetime(sema_goals["date"].dt.date, infer_datetime_format=True) # convert from object to datetime
    sema_goals.drop(["timestamp"], inplace=True, axis=1)

    # add min goal and max goal columns
    sema_goals['min_goal'] = sema_goals.step_goal.apply(lambda s: SEMA_GOALS_TO_MIN.get(s))
    sema_goals['max_goal'] = sema_goals.step_goal.apply(lambda s: SEMA_GOALS_TO_MAX.get(s))

    # add goal labels
    sema_goals['step_goal_label'] = sema_goals['step_goal'].apply(lambda v: SEMA_LABELS[v])
    sema_goals[['date', 'hour', 'user_id', 'step_goal', 'min_goal', 'max_goal', 'step_goal_label']].to_pickle('./data/users_step_goals_hourly.pkl')

users_step_goals = pd.read_pickle('./data/users_step_goals_hourly.pkl')
users_step_goals['id'] = users_step_goals.user_id.copy()
users_step_goals.drop(['user_id'], axis=1, inplace=True)

# merge
df = df.merge(users_step_goals, how='outer', on=['id', 'date', 'hour'])
df

Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,...,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,step_goal,min_goal,max_goal,step_goal_label
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,...,1349.0,0.0,0.0,<30,MALE,<19,,,,
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,...,,,,<30,MALE,<19,,,,
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,...,,,,<30,MALE,<19,,,,
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,...,,,,<30,MALE,<19,,,,
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,...,,,,<30,MALE,<19,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159430,621e36f967b776a240e5e7c9,2021-05-03,11.0,,,,,,,,...,,,,,,,7999,5000.0,8000.0,5000-7999
159431,621e36f967b776a240e5e7c9,2021-05-06,11.0,,,,,,,,...,,,,,,,9999,8000.0,10000.0,8000-9999
159432,621e36f967b776a240e5e7c9,2021-05-15,13.0,,,,,,,,...,,,,,,,14999,10000.0,15000.0,10000-14999
159433,621e36f967b776a240e5e7c9,2021-05-16,12.0,,,,,,,,...,,,,,,,14999,10000.0,15000.0,10000-14999


Integrating Emotions & Location

In [41]:
if not os.path.exists('./data/sema_mood_place_hourly.pkl'):
    # Get user self-reported goals from SEMA
    users = db.sema.distinct('user_id')

    sema_mood = pd.DataFrame(columns=["_id", "user_id", "data"])
    for user in users:
        user_data = pd.DataFrame(list(
                        db.sema.find({
                            "$or": [
                                {
                                    "$and": [
                                        { "data.MOOD": { "$ne": "<no-response>" } },
                                        {"data.MOOD": { "$ne": None }},
                                        {"user_id": user}
                                    ]
                                },
                                {
                                    "$and": [
                                        { "data.PLACE": { "$ne": "<no-response>" } },
                                        {"data.PLACE": { "$ne": None }},
                                        {"user_id": user}
                                    ]
                                }
                            ]
                        },
                            {"data.MOOD": 1, "data.PLACE": 1, "id": 1, "_id": 0, "user_id": 1, "data.STARTED_TS": 1}
                        )
                    ))

        sema_mood = pd.concat([sema_mood, user_data], axis=0)

    sema_mood["date"] = pd.to_datetime(sema_mood["data"].apply(lambda d: d["STARTED_TS"]), infer_datetime_format=True)
    sema_mood["hour"] = sema_mood["date"].dt.hour
    sema_mood["date"] = pd.to_datetime(sema_mood["date"].dt.date, infer_datetime_format=True)
    sema_mood["data.MOOD"] = sema_mood["data"].apply(lambda d: d["MOOD"])
    sema_mood["data.PLACE"] = sema_mood["data"].apply(lambda d: d["PLACE"])
    sema_mood.drop(["_id", "data"], axis=1, inplace=True)
    sema_mood.to_pickle('./data/sema_mood_place_hourly.pkl')
else:
    sema_mood = pd.read_pickle('./data/sema_mood_place_hourly.pkl')

infile = open('./data/sema_mood_place_hourly.pkl','rb')
sema = pickle.load(infile)
infile.close()

# replace not common moods
sema['data.MOOD'] = sema['data.MOOD'].apply(lambda mood: 'SAD' if mood == 'SADNESS' else ('HAPPY' if mood == 'JOY' else mood))
sema = sema[(sema['data.MOOD'] != 'FEAR') & (sema['data.MOOD'] != 'SURPRISE') & (sema['data.MOOD'] != 'ANGER') & (sema['data.MOOD'] != '<no-response>')]
print(sema['data.MOOD'].value_counts())
print(sema['data.PLACE'].value_counts())

sema_moods = pd.get_dummies(sema['data.MOOD'])
sema_places = pd.get_dummies(sema['data.PLACE'])

# combine one-hot encoding with actual df
sema = pd.concat([sema, sema_moods, sema_places], axis=1)
sema.drop(['data.MOOD', 'data.PLACE'], axis=1, inplace=True)

sema_grouped = sema.groupby(['date', 'hour', 'user_id']).max()
sema_grouped.reset_index(drop=False, inplace=True)
sema_grouped['id'] = sema_grouped['user_id'].copy()
sema_grouped.drop(['user_id'], axis=1, inplace=True)
sema_grouped.reset_index(drop=True, inplace=True)
sema_grouped.id = sema_grouped.id.apply(lambda id: ObjectId(id))

# merge
df = df.merge(sema_grouped, how='outer', on=['id', 'date', 'hour'])
df.to_pickle('./data/hourly_fitbit_sema_df_unprocessed.pkl')
df

RESTED/RELAXED    1179
TIRED             1126
NEUTRAL            822
HAPPY              790
TENSE/ANXIOUS      620
ALERT              345
SAD                149
Name: data.MOOD, dtype: int64
HOME             2960
WORK/SCHOOL       869
OUTDOORS          381
HOME_OFFICE       309
TRANSIT           224
ENTERTAINMENT     209
OTHER              60
GYM                18
Name: data.PLACE, dtype: int64


Unnamed: 0,id,date,hour,temperature,badgeType,calories,distance,activityType,bpm,mindfulness_session,...,TENSE/ANXIOUS,TIRED,ENTERTAINMENT,GYM,HOME,HOME_OFFICE,OTHER,OUTDOORS,TRANSIT,WORK/SCHOOL
0,621e2e8e67b776a24055b564,2021-05-24,0.0,-1.466659,,89.04,98.3,,66.874763,False,...,,,,,,,,,,
1,621e2e8e67b776a24055b564,2021-05-24,1.0,0.210289,,65.62,0.0,,58.711364,False,...,,,,,,,,,,
2,621e2e8e67b776a24055b564,2021-05-24,2.0,-1.359378,,65.40,,,55.133495,False,...,,,,,,,,,,
3,621e2e8e67b776a24055b564,2021-05-24,3.0,-0.316295,,69.01,11.0,,60.488613,False,...,,,,,,,,,,
4,621e2e8e67b776a24055b564,2021-05-24,4.0,0.416455,,65.73,0.0,,59.110638,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159503,621e36f967b776a240e5e7c9,2021-05-22,18.0,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
159504,621e362467b776a2404ad513,2021-05-23,18.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
159505,621e339967b776a240e502de,2021-11-19,16.0,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
159506,621e339967b776a240e502de,2021-11-19,22.0,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [43]:
df.to_csv('./data/hourly_fitbit_sema_df_unprocessed.csv')