# Creating the daily dataframe

In [12]:
import json
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import preprocessing_functions
from pymongo import MongoClient
warnings.filterwarnings("ignore")
from dataprep.eda import create_report
import feature_engineering_functions

In [13]:
# Load the unprocessed daily Fitbit dataframe from a project-local pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
fitbit = pd.read_pickle('../data/daily_fitbit_df_unprocessed.pkl')

---------------------------------------------------------
Fitbit Basic Preprocessing
---------------------------------------------------------
1. Select the experiment days
2. Drop duplicates
3. Convert columns whose data type was incorrectly set to categorical

In [14]:
# Run the project's basic Fitbit preprocessing helper (defined in preprocessing_functions,
# not shown in this notebook), then list the resulting columns as a sanity check.
fitbit = preprocessing_functions.fitbit_basic_preprocessing(fitbit)
fitbit.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'badgeType', 'calories',
       'filteredDemographicVO2Max', 'distance', 'activityType', 'bpm',
       'lightly_active_minutes', 'moderately_active_minutes',
       'very_active_minutes', 'sedentary_minutes', 'mindfulness_session',
       'scl_avg', 'resting_hr', 'sleep_duration', 'minutesToFallAsleep',
       'minutesAsleep', 'minutesAwake', 'minutesAfterWakeup',
       'sleep_efficiency', 'sleep_deep_ratio', 'sleep_wake_ratio',
       'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert'],
      dtype='object')

In [15]:
# Truncate every timestamp to its calendar day (stored again as a datetime) and
# order the rows chronologically before previewing them.
fitbit = (
    fitbit
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
)
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,sleep_rem_ratio,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,1.341772,8833.0,83.0,1349.0,0.0,0.0,<30,MALE,<19,
24,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,8550.0,278.0,766.0,29.0,1.0,>=30,MALE,>=30,
25,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,16992.0,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,
26,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,21284.0,175.0,1130.0,9.0,0.0,<30,MALE,21.0,
27,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,<30,MALE,21.0,


In [16]:
# Read intra-day data from Mongo.
# The helper lives in preprocessing_functions (not shown here); judging by the output below it
# appends the sleep `startTime` / `endTime` columns to the daily frame.
# 'rais' is presumably the MongoDB database name — TODO confirm.
fitbit = preprocessing_functions.fitbit_intraday_sleep(fitbit, 'rais')
fitbit.head(200)

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,83.0,1349.0,0.0,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,278.0,766.0,29.0,1.0,>=30,MALE,>=30,,,
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,175.0,1130.0,9.0,0.0,<30,MALE,21.0,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,<30,MALE,21.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,621e351a67b776a240f6204b,2021-05-28,33.532748,72.999,24.946,,13.2,0.0,0.000000,0.000,...,19.0,847.0,0.0,0.0,<30,MALE,22.0,,2021-05-28T03:40:00.000,2021-05-28T11:12:30.000
196,621e36dd67b776a240ce9a45,2021-05-28,,,,,,,,,...,68.0,876.0,46.0,2.0,>=30,MALE,24.0,,,
197,621e346f67b776a24081744f,2021-05-28,,,,,,,,,...,121.0,903.0,0.0,0.0,,,,,,
198,621e341067b776a24037b105,2021-05-28,32.943088,75.273,22.769,,13.6,,,,...,283.0,1156.0,0.0,0.0,>=30,MALE,26.0,,2021-05-28T00:11:30.000,2021-05-28T06:38:30.000


-----------------------
# Merge all self-reported data with the Fitbit data to construct a unified dataframe


# ttm

In [17]:
# Build the per-participant TTM stage table (columns: id, label_ttm_stage).
ttm = pd.read_pickle('../data/surveys/ttm_classification.pkl')
ttm = ttm.rename(columns={"stage": "label_ttm_stage"})
ttm["date"] = pd.to_datetime(pd.to_datetime(ttm["date"]).dt.date)
ttm = ttm.sort_values(by='date', ascending=True)
# This table is merged into the daily frame on 'id' only, so it must hold at most one row per
# participant; otherwise every daily row of that participant would be duplicated by the join.
# Keep the most recent classification, mirroring how the breq and personality tables are reduced.
ttm = ttm.drop_duplicates(subset='id', keep='last')
# The date was only needed to pick the latest record.
ttm = ttm.drop(columns=['date'])
ttm.head(100)

Unnamed: 0,id,label_ttm_stage
0,621e314867b776a24029ebf9,Preparation
1,621e36c267b776a240ba2756,Action
2,621e328667b776a240281372,Maintenance
3,621e2f3967b776a240c654db,Preparation
4,621e32d967b776a240627414,Maintenance
5,621e34ec67b776a240d60873,Maintenance
6,621e332267b776a24092a584,Maintenance
7,621e326767b776a24012e179,Maintenance
8,621e375b67b776a240290cdc,Action
9,621e34db67b776a240c9c2be,Preparation


In [18]:
# Attach the TTM stage to every daily Fitbit row; the left join keeps all Fitbit rows,
# leaving the label empty for participants without a TTM record.
data = pd.merge(fitbit, ttm, how='left', on='id')
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime,label_ttm_stage
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,1349.0,0.0,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,Maintenance
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,766.0,29.0,1.0,>=30,MALE,>=30,,,,Maintenance
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,1225.0,0.0,0.0,<30,FEMALE,<19,,,,Maintenance
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,1130.0,9.0,0.0,<30,MALE,21.0,,,,Maintenance
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,<30,MALE,21.0,,,,Contemplation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5022,621e30b267b776a240c5e13f,2022-01-17,,,,,,,,,...,126.0,0.0,0.0,<30,FEMALE,21.0,,,,
5023,621e312a67b776a240164d59,2022-01-17,34.237581,,,,,,,,...,563.0,0.0,0.0,>=30,MALE,25.0,,2022-01-17T00:45:00.000,2022-01-17T07:55:00.000,Contemplation
5024,621e309b67b776a240b532b0,2022-01-17,31.815133,,,,,,,,...,1208.0,6.0,0.0,>=30,MALE,23.0,,2022-01-17T00:56:30.000,2022-01-17T05:57:30.000,
5025,621e333567b776a240a0c217,2022-01-17,,,,,,,,,...,,,,<30,MALE,21.0,,,,Contemplation


# breq

In [19]:
# Build the per-participant BREQ table: one row per user carrying only the
# self-determination label.
breq = (
    pd.read_pickle('../data/surveys/breq.pkl')
    .rename(columns={
        "user_id": "id",
        "submitdate": "date",
        "breq_self_determination": "label_breq_self_determination",
    })
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
    # keep only one record per user (the one with the latest submission date)
    .drop_duplicates(subset='id', keep='last')
    .drop(columns=['date', 'type', 'breq_amotivation', 'breq_external_regulation',
                   'breq_introjected_regulation', 'breq_identified_regulation',
                   'breq_intrinsic_regulation'])
)
breq.head(100)

Unnamed: 0,id,label_breq_self_determination
0,621e314867b776a24029ebf9,identified_regulation
0,621e328667b776a240281372,intrinsic_regulation
0,621e32d967b776a240627414,identified_regulation
0,621e36c267b776a240ba2756,intrinsic_regulation
0,621e2f3967b776a240c654db,intrinsic_regulation
0,621e332267b776a24092a584,identified_regulation
0,621e34ec67b776a240d60873,identified_regulation
1,621e375b67b776a240290cdc,identified_regulation
1,621e34db67b776a240c9c2be,intrinsic_regulation
1,621e36f967b776a240e5e7c9,introjected_regulation


In [20]:
# Add the BREQ self-determination label to every daily row of the matching participant.
data = pd.merge(data, breq, how='left', on='id')
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime,label_ttm_stage,label_breq_self_determination
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,0.0,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,Maintenance,intrinsic_regulation
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,29.0,1.0,>=30,MALE,>=30,,,,Maintenance,intrinsic_regulation
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,0.0,0.0,<30,FEMALE,<19,,,,Maintenance,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,9.0,0.0,<30,MALE,21.0,,,,Maintenance,identified_regulation
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,<30,MALE,21.0,,,,Contemplation,intrinsic_regulation


# sema

In [21]:
# Build the SEMA label table keyed by (id, date): run the project's basic SEMA preprocessing,
# truncate timestamps to calendar days, and keep only the negative-feelings label.
sema = (
    pd.read_pickle('../data/semas/semas_read_from_the_base_experiment_dates.pkl')
    .pipe(preprocessing_functions.sema_basic_preprocessing)
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
    .drop(columns='positive_feelings')
    .rename(columns={"negative_feelings": "label_sema_negative_feelings"})
)
sema.head()

Unnamed: 0,id,date,label_sema_negative_feelings
0,621e2f3967b776a240c654db,2021-05-24,0
11,621e362467b776a2404ad513,2021-05-24,0
10,621e301e67b776a240608a72,2021-05-24,0
9,621e30e467b776a240e817c7,2021-05-24,1
8,621e36f967b776a240e5e7c9,2021-05-24,1


In [22]:
# Join the SEMA label onto the matching participant-day.
# NOTE(review): later outputs show identical consecutive rows for the same id/date and the row
# count grows during the merges, which suggests a right-hand table holds several rows per
# (id, date) — confirm whether sema needs to be aggregated to one row per participant-day first.
data = pd.merge(data, sema, how='left', on=['id', 'date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,Maintenance,intrinsic_regulation,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,1.0,>=30,MALE,>=30,,,,Maintenance,intrinsic_regulation,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,0.0,<30,FEMALE,<19,,,,Maintenance,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,0.0,<30,MALE,21.0,,,,Maintenance,identified_regulation,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,<30,MALE,21.0,,,,Contemplation,intrinsic_regulation,


# personality

In [23]:
# Build the per-participant personality table: one row per user carrying only the five
# categorical IPIP trait labels.
big5 = (
    pd.read_pickle('../data/surveys/personality.pkl')
    .rename(columns={
        "user_id": "id",
        "submitdate": "date",
        "ipip_extraversion_category": "label_ipip_extraversion_category",
        "ipip_agreeableness_category": "label_ipip_agreeableness_category",
        "ipip_conscientiousness_category": "label_ipip_conscientiousness_category",
        "ipip_stability_category": "label_ipip_stability_category",
        "ipip_intellect_category": "label_ipip_intellect_category",
    })
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
    # one record per user: the one with the latest submission date
    .drop_duplicates(subset='id', keep='last')
    .drop(columns=['date', 'type', 'extraversion', 'agreeableness', 'conscientiousness',
                   'stability', 'intellect', 'gender'])
)
big5.head(100)

Unnamed: 0,id,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category
0,621e2e8e67b776a24055b564,LOW,LOW,HIGH,HIGH,AVERAGE
48,621e36c267b776a240ba2756,HIGH,AVERAGE,LOW,HIGH,LOW
39,621e34db67b776a240c9c2be,AVERAGE,HIGH,AVERAGE,HIGH,HIGH
3,621e2f3967b776a240c654db,AVERAGE,AVERAGE,LOW,HIGH,AVERAGE
37,621e341067b776a24037b105,LOW,LOW,LOW,LOW,LOW
5,621e2f7a67b776a240f14425,AVERAGE,AVERAGE,HIGH,HIGH,AVERAGE
30,621e335a67b776a240bb12ff,LOW,LOW,HIGH,AVERAGE,LOW
7,621e2fb367b776a24015accd,HIGH,AVERAGE,LOW,LOW,HIGH
8,621e2fce67b776a240279baa,HIGH,HIGH,AVERAGE,LOW,AVERAGE
25,621e32d967b776a240627414,HIGH,AVERAGE,HIGH,AVERAGE,HIGH


In [24]:
# Add the personality labels to every daily row of the matching participant.
data = pd.merge(data, big5, how='left', on='id')
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,startTime,endTime,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,Maintenance,intrinsic_regulation,,LOW,LOW,HIGH,HIGH,AVERAGE
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,Maintenance,intrinsic_regulation,,HIGH,AVERAGE,HIGH,LOW,LOW
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,Maintenance,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,Maintenance,identified_regulation,,LOW,LOW,LOW,AVERAGE,HIGH
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW


# stai

In [25]:
# Build the STAI label table keyed by (id, date), keeping only the stress category.
stai = (
    pd.read_pickle('../data/surveys/stai.pkl')
    .rename(columns={
        "user_id": "id",
        "submitdate": "date",
        "stai_stress_category": "label_stai_stress_category",
    })
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
    .drop(columns=['type', 'stai_stress'])
)
stai.head()

Unnamed: 0,id,date,label_stai_stress_category
0,621e2e8e67b776a24055b564,2021-05-31,Below average
0,621e328667b776a240281372,2021-05-31,Average
0,621e329067b776a2402ffad2,2021-05-31,Above average
3,621e30e467b776a240e817c7,2021-05-31,Above average
4,621e32af67b776a24045b4cf,2021-05-31,Above average


In [26]:
# Join the STAI stress category onto the participant-day on which the survey was submitted.
data = pd.merge(data, stai, how='left', on=['id', 'date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,endTime,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category,label_stai_stress_category
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,2021-05-24T09:21:00.000,Maintenance,intrinsic_regulation,,LOW,LOW,HIGH,HIGH,AVERAGE,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,Maintenance,intrinsic_regulation,,HIGH,AVERAGE,HIGH,LOW,LOW,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,Maintenance,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,Maintenance,identified_regulation,,LOW,LOW,LOW,AVERAGE,HIGH,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW,


# panas

In [27]:
# Build the PANAS label table keyed by (id, date).
panas = (
    pd.read_pickle('../data/surveys/panas_classification.pkl')
    .rename(columns={
        "user_id": "id",
        "submitdate": "date",
        "negative_affect_category": "label_panas_negative_affect",
    })
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
    .sort_values(by='date', ascending=True)
)
panas.head()

Unnamed: 0,id,date,label_panas_negative_affect
0,621e329067b776a2402ffad2,2021-05-31,Average
0,621e328667b776a240281372,2021-05-31,Above average
4,621e32af67b776a24045b4cf,2021-05-31,Above average
6,621e301e67b776a240608a72,2021-05-31,Average
2,621e32d967b776a240627414,2021-05-31,Average


In [28]:
# Join the PANAS negative-affect category onto the participant-day on which the survey was submitted.
data = pd.merge(data, panas, how='left', on=['id', 'date'])
data.head(1000)

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category,label_stai_stress_category,label_panas_negative_affect
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,Maintenance,intrinsic_regulation,,LOW,LOW,HIGH,HIGH,AVERAGE,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,Maintenance,intrinsic_regulation,,HIGH,AVERAGE,HIGH,LOW,LOW,,
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,Maintenance,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,Maintenance,identified_regulation,,LOW,LOW,LOW,AVERAGE,HIGH,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,621e2efa67b776a2409dd1c3,2021-06-06,,,,,,,,,...,,,,,,,,,,
996,621e33ed67b776a2401cf5f7,2021-06-06,33.996583,,,,,,,,...,Precontemplation,intrinsic_regulation,0.0,HIGH,HIGH,AVERAGE,HIGH,HIGH,,
997,621e351a67b776a240f6204b,2021-06-06,33.651786,81.510,17.497,,17.4,0.0,0.000000,0.000,...,Action,intrinsic_regulation,0.0,,,,,,,
998,621e351a67b776a240f6204b,2021-06-06,33.651786,81.510,17.497,,17.4,0.0,0.000000,0.000,...,Action,intrinsic_regulation,0.0,,,,,,,


In [29]:
# Checkpoint: persist the merged (still unprocessed) dataframe so the following steps can restart from disk.
data.to_pickle("../data/unified_dataframe/data_unprocessed.pkl")

----------------------------------------------
# Checking preprocessing steps 

In [30]:
# Reload the merged dataframe from the checkpoint written above and display it.
data = pd.read_pickle('../data/unified_dataframe/data_unprocessed.pkl')
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category,label_stai_stress_category,label_panas_negative_affect
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,Maintenance,intrinsic_regulation,,LOW,LOW,HIGH,HIGH,AVERAGE,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,Maintenance,intrinsic_regulation,,HIGH,AVERAGE,HIGH,LOW,LOW,,
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,Maintenance,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,Maintenance,identified_regulation,,LOW,LOW,LOW,AVERAGE,HIGH,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,621e30b267b776a240c5e13f,2022-01-17,,,,,,,,,...,,,,,,,,,,
7800,621e312a67b776a240164d59,2022-01-17,34.237581,,,,,,,,...,Contemplation,intrinsic_regulation,,AVERAGE,LOW,HIGH,AVERAGE,LOW,,
7801,621e309b67b776a240b532b0,2022-01-17,31.815133,,,,,,,,...,,,,,,,,,,
7802,621e333567b776a240a0c217,2022-01-17,,,,,,,,,...,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW,,


# date engineering

In [31]:
# Derive date features with the project helper (not shown here); the output below shows the added
# month/weekday/week/day `*_sin` and `*_cos` columns.
data = preprocessing_functions.date_engineering(data)
# We do not process the startTime and endTime columns as we only use them for feature engineering later and then drop them completely
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,label_stai_stress_category,label_panas_negative_affect,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,-0.866025,0.0,0.309017,-0.988468,0.5,1.0,0.951057,0.151428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,621e30b267b776a240c5e13f,2022-01-17,,,,,,,,,...,,,0.866025,0.0,0.809017,-0.299363,0.5,1.0,0.587785,-0.954139
7800,621e312a67b776a240164d59,2022-01-17,34.237581,,,,,,,,...,,,0.866025,0.0,0.809017,-0.299363,0.5,1.0,0.587785,-0.954139
7801,621e309b67b776a240b532b0,2022-01-17,31.815133,,,,,,,,...,,,0.866025,0.0,0.809017,-0.299363,0.5,1.0,0.587785,-0.954139
7802,621e333567b776a240a0c217,2022-01-17,,,,,,,,,...,,,0.866025,0.0,0.809017,-0.299363,0.5,1.0,0.587785,-0.954139


# One-hot encoding
CATEGORICAL VARIABLES
1. badgeType
2. activityType
3. mindfulness_session
4. age
5. gender
6. bmi
7. heart_rate_alert
-----------------------------
8. ttm_stage
9. breq_self_determination
10. sema_negative_feelings
11. ipip_extraversion_category
12. ipip_agreeableness_category
13. ipip_conscientiousness_category
14. ipip_stability_category
15. ipip_intellect_category
16. stai_stress_category
17. panas_negative_affect

In [32]:
# Encode the categorical variables with the project helper (not shown here); the output below
# shows per-activity indicator columns such as Run, Swim and Walk.
data = preprocessing_functions.one_hot_encoding(data)
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Martial Arts,Run,Spinning,Sport,Swim,Treadmill,Walk,Weights,Workout,Yoga/Pilates
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,621e30b267b776a240c5e13f,2022-01-17,,,,,,,,,...,,,,,,,,,,
7800,621e312a67b776a240164d59,2022-01-17,34.237581,,,,,,,,...,,,,,,,,,,
7801,621e309b67b776a240b532b0,2022-01-17,31.815133,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7802,621e333567b776a240a0c217,2022-01-17,,,,,,,,,...,,,,,,,,,,


In [33]:
# Checkpoint: store the encoded dataframe in a temporary pickle.
data.to_pickle("../data/unified_dataframe/temp.pkl")

In [34]:
# Resume from the temporary checkpoint so the post-preprocessing can be re-run without repeating the steps above.
data = pd.read_pickle("../data/unified_dataframe/temp.pkl")

------------------------------
# Post-preprocessing

1. Create two new columns: one for early adaptive features and one for Fitbit use while sleeping
2. Replace outliers with NaNs
3. Replace NaN values with the column's median for continuous features
4. Replace NaN values with the column's most frequent value for categorical features

In [35]:
# Run the project's post-preprocessing helper at daily frequency (helper not shown here).
# The output below shows the additional columns wear_day, early_features and used_during_night,
# and previously missing sensor values filled in.
data = preprocessing_functions.post_preprocessing(data, frequency='daily')
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Sport,Swim,Treadmill,Walk,Weights,Workout,Yoga/Pilates,wear_day,early_features,used_during_night
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.873563
1,621e328667b776a240281372,2021-05-24,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.100000
2,621e326767b776a24012e179,2021-05-24,33.973120,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.991379
3,621e332267b776a24092a584,2021-05-24,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.405405
4,621e333567b776a240a0c217,2021-05-24,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,621e30b267b776a240c5e13f,2022-01-17,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0.876923
7800,621e312a67b776a240164d59,2022-01-17,34.237581,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1,0.490741
7801,621e309b67b776a240b532b0,2022-01-17,31.815133,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.500000
7802,621e333567b776a240a0c217,2022-01-17,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0,0.000000


In [36]:
# Score columns in which a value of 0 is treated as a placeholder rather than a real measurement.
special_zeros = ['stress_score', 'sleep_points_percentage', 'exertion_points_percentage',
                 'responsiveness_points_percentage']

for col in special_zeros:
    zero_mask = data[col] == 0
    # Use the median of the non-zero scores only: including the placeholder zeros in the median
    # would pull the replacement value towards the very values being replaced.
    replacement = data.loc[~zero_mask, col].median()
    # If the column holds nothing but zeros/NaNs there is no valid median; leave it untouched
    # instead of overwriting the zeros with NaN.
    if pd.notna(replacement):
        data.loc[zero_mask, col] = replacement

In [37]:
# Drop heart_rate_alert: per the original note this column holds only 0 values
# (presumably after the imputation above — confirm), so it carries no information.
data = data.drop(columns=['heart_rate_alert'])

In [38]:
# Unify the different missing-value representations: every cell pandas reports as missing
# is overwritten with np.nan.
data = data.mask(data.isna(), np.nan)

In [39]:
#create_report(data).show_browser()

In [40]:
# Order the rows per participant and chronologically so that a back-fill propagates each survey
# answer to the days leading up to that survey.
data = data.sort_values(by=['id', 'date'])
# Back-fill within each participant only. A plain column-wide bfill() would copy the first label
# of the *next* participant into the trailing unlabeled days of the previous one, because the
# participants are stacked one after another after the sort above.
data['label_panas_negative_affect'] = data.groupby('id')['label_panas_negative_affect'].bfill()
data['label_stai_stress_category'] = data.groupby('id')['label_stai_stress_category'].bfill()

In [41]:
#create_report(data).show_browser()

# Add all the new features


In [42]:
# Add the engineered features at daily granularity via the project helper (not shown here); the
# outputs below show, among others, stress_quantile, user_stress_quantile and Steps_hour* columns.
data = feature_engineering_functions.add_features(data, 'daily')

In [43]:
# Sanity check: class balance of the stress_quantile feature.
data['stress_quantile'].value_counts()

1    3902
0    1951
2    1951
Name: stress_quantile, dtype: int64

In [44]:
# Sanity check: class balance of the user_stress_quantile feature.
data['user_stress_quantile'].value_counts()

1    3867
0    1978
2    1959
Name: user_stress_quantile, dtype: int64

In [45]:
# Display the fully engineered dataframe (truncated by pandas) for a visual check.
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Steps_hour14,Steps_hour15,Steps_hour16,Steps_hour17,Steps_hour18,Steps_hour19,Steps_hour20,Steps_hour21,Steps_hour22,Steps_hour23
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,...,191.0,33.0,342.0,1712.0,1838.0,160.0,155.0,37.0,31.0,53.0
1,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,95.9,15.8,80.0,0.833333,0.725,...,120.0,24.0,446.0,377.0,3165.0,332.0,97.0,9.0,24.0,58.0
2,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,95.9,15.8,80.0,0.833333,0.725,...,120.0,24.0,446.0,377.0,3165.0,332.0,97.0,9.0,24.0,58.0
3,621e2e8e67b776a24055b564,2021-05-26,34.611011,57.481,35.216,95.9,14.6,84.0,0.966667,0.725,...,85.0,477.0,390.0,2821.0,293.0,158.0,66.0,0.0,70.0,74.0
4,621e2e8e67b776a24055b564,2021-05-27,34.408304,57.493,35.216,95.9,14.8,82.0,0.933333,0.725,...,58.0,296.0,401.0,21.0,3054.0,503.0,411.0,96.0,63.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799,621e375b67b776a240290cdc,2021-07-22,33.973120,62.388,35.216,95.9,14.6,76.0,0.766667,0.725,...,7.0,60.0,7.0,163.0,92.0,106.0,2034.0,3453.0,49.0,170.0
7800,621e375b67b776a240290cdc,2021-07-23,34.011607,55.542,30.813,95.9,16.6,81.0,0.700000,0.800,...,1013.0,810.0,371.0,833.0,1411.0,93.0,424.0,4029.0,1083.0,42.0
7801,621e375b67b776a240290cdc,2021-07-24,33.687826,69.579,19.407,95.9,17.0,84.0,0.833333,0.900,...,1656.0,1276.0,1903.0,534.0,491.0,583.0,84.0,1103.0,621.0,737.0
7802,621e375b67b776a240290cdc,2021-07-25,34.112386,65.899,22.892,95.9,18.0,84.0,0.833333,0.900,...,42.0,1235.0,716.0,1130.0,1163.0,888.0,245.0,1401.0,54.0,1627.0


In [46]:
# Persist the final preprocessed daily dataframe.
data.to_pickle("../data/unified_dataframe/data_preprocessed.pkl")

In [47]:
# Select the contiguous block of columns from 'DAILY_FLOORS' through 'LIFETIME_WEIGHT_GOAL_SETUP'.
# Label slices with .loc include both endpoints and depend on the current column order.
user_data = data.loc[:, 'DAILY_FLOORS':'LIFETIME_WEIGHT_GOAL_SETUP']
user_data.columns

Index(['DAILY_FLOORS', 'DAILY_STEPS', 'GOAL_BASED_WEIGHT_LOSS',
       'LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP'],
      dtype='object')

In [49]:
# Generate the dataprep EDA report for the final dataframe and open it in a browser.
# NOTE(review): the same call is commented out in earlier cells, and the progress bar below counts
# 40030 steps, so this is a long-running cell — consider guarding it when re-running the notebook.
create_report(data).show_browser()

  0%|                                                                                        | 0/40030 [00:00<…