# Creating the dataframe

In [28]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from dataprep.eda import create_report
import seaborn as sns
import numpy as np
from datetime import datetime

In [29]:
# load fitbit data
fitbit = pd.read_pickle('../data/daily_fitbit_df_unprocessed.pkl')

# Functions

In [30]:
def fitbit_basic_preprocessing(df):
    
    # selecting the experiment days
    df = df.sort_values(by='date', ascending=True)
    df['date'] = pd.to_datetime(df['date'].astype("str"), format='%Y-%m-%d')
    df = df.loc[((df['date'] > '2021-05-23') & (df['date'] < '2021-07-27')) | (
                (df['date'] > '2021-11-14') & (df['date'] < '2022-01-18'))]
    df.reset_index(inplace=True, drop=True)

    # drop duplicates
    df = df.loc[df.astype(str).drop_duplicates().index]

    # convert data types falsely described as categorical
    df[["lightly_active_minutes", "moderately_active_minutes", "very_active_minutes", "sedentary_minutes"]] = df[
        ["lightly_active_minutes", "moderately_active_minutes", "very_active_minutes", "sedentary_minutes"]].apply(
        pd.to_numeric)

    return df

In [31]:
def sin_transform(values):
    """
    Applies SIN transform to a series value.
    Args:
        values (pd.Series): A series to apply SIN transform on.
    Returns
        (pd.Series): The transformed series.
    """

    return np.sin(2 * np.pi * values / len(set(values)))


def cos_transform(values):
    """
    Applies COS transform to a series value.
    Args:
        values (pd.Series): A series to apply SIN transform on.
    Returns
        (pd.Series): The transformed series.
    """
    return np.cos(2 * np.pi * values / len(set(values)))


def date_engineering(data):  # data could be any dataframe that needs date engineering

    data['date'] = pd.to_datetime(data.date, format='%m/%d/%y %H:%M:%S')
    data = data.astype({"date": str})

    # Extract features from date
    data["year"] = data["date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').year)
    data["month"] = data["date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').month)
    data["weekday"] = data["date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').weekday())
    data["week"] = data["date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').isocalendar()[1])
    data["day"] = data["date"].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').day)

    # Sin transformation in date features
    data["month_sin"] = sin_transform(data["month"])
    data["weekday_sin"] = sin_transform(data["weekday"])
    data["week_sin"] = sin_transform(data["week"])
    data["day_sin"] = sin_transform(data["day"])

    # Cosine transformation in date features
    data["month_cos"] = cos_transform(data["month"])
    data["weekday_cos"] = cos_transform(data["weekday"])
    data["week_cos"] = cos_transform(data["week"])
    data["day_cos"] = cos_transform(data["day"])

    data = data.drop(columns=['date', 'year', 'month', 'weekday', 'week', 'day'])

    return data

In [32]:
def sema_basic_preprocessing(df):
    df["negative_feelings"] = np.where(df['TENSE/ANXIOUS']== 1, 1, np.where(df['ALERT']==1,1, np.where(df['SAD']==1,1, np.where(df['TIRED']==1,1, 0))))
    df["positive_feelings"] = np.where(df['HAPPY']== 1, 1, np.where(df['NEUTRAL']==1,1, np.where(df['RESTED/RELAXED']==1,1, 0)))
    df = df.drop(columns=['ALERT', 'HAPPY', 'NEUTRAL', 'RESTED/RELAXED', 'SAD', 'TENSE/ANXIOUS', 'TIRED'])

    return df

In [33]:
def one_hot_encoding(fitbit):
    
    # badgeType encoding 
    s = fitbit['badgeType']
    dum = pd.get_dummies(s.apply(pd.Series).stack()).sum(level=0)
    df = pd.concat([s, dum], axis=1)
    fitbit = pd.concat([fitbit, df], axis=1)
    fitbit = fitbit.drop(columns='badgeType')
    
    # activity type encoding
    s = fitbit['activityType']
    dum = pd.get_dummies(s.apply(pd.Series).stack()).sum(level=0)
    df = pd.concat([s, dum], axis=1)
    fitbit = pd.concat([fitbit, df], axis=1)
    fitbit = fitbit.drop(columns='activityType')
        
    # mindfulness session encoding - highly imbalanced
    fitbit['mindfulness_session'].replace(to_replace=['False', True], value=[0, 1], inplace=True)
    
    # age encoding
    fitbit['age'].replace(to_replace=['<30', '>=30'], value=[0, 1], inplace=True)
    
    # gender encoding
    fitbit['gender'].replace(to_replace=['MALE', 'FEMALE'], value=[0, 1], inplace=True)     
    
    # bmi encoding
    fitbit['bmi'] = fitbit['bmi'].fillna(fitbit['bmi'].mode().iloc[0])
    fitbit["bmi"] = fitbit["bmi"].apply(lambda x: 31.0 if x == '>=30' else x)
    fitbit["bmi"] = fitbit["bmi"].apply(lambda x: 18.0 if x == '<19' else x)
    fitbit["bmi"] = fitbit["bmi"].apply(lambda x: 26.0 if x == '>=25' else x) # it belongs to overweight
    fitbit['bmi'] = fitbit.bmi.apply(lambda bmi: 'Underweight' if bmi < 18.5 else ('Normal' if bmi < 25 else ('Overweight' if bmi < 30 else 'Obese'))) # 0: Underweight, 1: Normal, 2: Overweight, 3: Obese
    
    # ECG alert encoding
    fitbit['heart_rate_alert'].replace(to_replace=['NONE', 'LOW_HR'], value=[0, 1], inplace=True)
    
    return fitbit

---------------------------------------------------------
Fitbit Basic Preprocessing
---------------------------------------------------------
1. Select the experiment days
2. Drop duplicates
3. Convert data types falsely described as categorical

In [34]:
fitbit = fitbit_basic_preprocessing(fitbit)
fitbit.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'badgeType', 'calories',
       'filteredDemographicVO2Max', 'distance', 'activityType', 'bpm',
       'lightly_active_minutes', 'moderately_active_minutes',
       'very_active_minutes', 'sedentary_minutes', 'mindfulness_session',
       'scl_avg', 'resting_hr', 'sleep_duration', 'minutesToFallAsleep',
       'minutesAsleep', 'minutesAwake', 'minutesAfterWakeup',
       'sleep_efficiency', 'sleep_deep_ratio', 'sleep_wake_ratio',
       'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert'],
      dtype='object')

In [35]:
fitbit["date"] = pd.to_datetime(pd.to_datetime(fitbit["date"]).dt.date)
fitbit = fitbit.sort_values(by='date', ascending=True)
fitbit.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,sleep_rem_ratio,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,1.341772,8833.0,83.0,1349.0,0.0,0.0,<30,MALE,<19,
24,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,8550.0,278.0,766.0,29.0,1.0,>=30,MALE,>=30,
25,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,16992.0,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,
26,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,21284.0,175.0,1130.0,9.0,0.0,<30,MALE,21.0,
27,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,<30,MALE,21.0,


-----------------------
# Merge all self-reported data with the fitbit data constructing a unified dataframe


# ttm

In [36]:
ttm = pd.read_pickle('../data/surveys/ttm_classification.pkl')
ttm = ttm.rename(columns = {"stage": "ttm_stage"})
ttm["date"] = pd.to_datetime(pd.to_datetime(ttm["date"]).dt.date)
ttm = ttm.sort_values(by='date', ascending=True)
ttm.head()

Unnamed: 0,id,date,ttm_stage
0,621e314867b776a24029ebf9,2021-05-31,Preparation
1,621e36c267b776a240ba2756,2021-05-31,Action
2,621e328667b776a240281372,2021-05-31,Maintenance
3,621e2f3967b776a240c654db,2021-05-31,Preparation
4,621e32d967b776a240627414,2021-05-31,Maintenance


In [37]:
data = fitbit.merge(ttm, how='left', on=['id','date'])
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,ttm_stage
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,8833.0,83.0,1349.0,0.0,0.0,<30,MALE,<19,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,8550.0,278.0,766.0,29.0,1.0,>=30,MALE,>=30,,
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,16992.0,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,21284.0,175.0,1130.0,9.0,0.0,<30,MALE,21.0,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,<30,MALE,21.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4902,621e30b267b776a240c5e13f,2022-01-17,,,,,,,,,...,287.0,19.0,126.0,0.0,0.0,<30,FEMALE,21.0,,
4903,621e312a67b776a240164d59,2022-01-17,34.237581,,,,,,,,...,4966.0,114.0,563.0,0.0,0.0,>=30,MALE,25.0,,Contemplation
4904,621e309b67b776a240b532b0,2022-01-17,31.815133,,,,,,,,...,12096.0,129.0,1208.0,6.0,0.0,>=30,MALE,23.0,,
4905,621e333567b776a240a0c217,2022-01-17,,,,,,,,,...,,,,,,<30,MALE,21.0,,


# breq

In [38]:
breq = pd.read_pickle('../data/surveys/breq.pkl')
breq = breq.rename(columns = {"user_id": "id", "submitdate":"date"})
breq["date"] = pd.to_datetime(pd.to_datetime(breq["date"]).dt.date)
breq = breq.sort_values(by='date', ascending=True)
breq = breq.drop(columns=['type','breq_amotivation', 'breq_external_regulation', 'breq_introjected_regulation', 'breq_identified_regulation', 'breq_intrinsic_regulation'])
breq.head()

Unnamed: 0,id,date,breq_self_determination
0,621e2e8e67b776a24055b564,2021-05-31,intrinsic_regulation
3,621e301e67b776a240608a72,2021-05-31,intrinsic_regulation
0,621e314867b776a24029ebf9,2021-05-31,identified_regulation
0,621e328667b776a240281372,2021-05-31,intrinsic_regulation
0,621e329067b776a2402ffad2,2021-05-31,intrinsic_regulation


In [39]:
data = data.merge(breq, how='left', on=['id','date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,ttm_stage,breq_self_determination
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,83.0,1349.0,0.0,0.0,<30,MALE,<19,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,278.0,766.0,29.0,1.0,>=30,MALE,>=30,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,175.0,1130.0,9.0,0.0,<30,MALE,21.0,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,<30,MALE,21.0,,,


# sema

In [40]:
sema = pd.read_pickle('../data/semas/semas_read_from_the_base_experiment_dates.pkl')
sema = sema_basic_preprocessing(sema)
sema["date"] = pd.to_datetime(pd.to_datetime(sema["date"]).dt.date)
sema = sema.sort_values(by='date', ascending=True)
sema = sema.drop(columns='positive_feelings')
sema = sema.rename(columns = {"negative_feelings": "sema_negative_feelings"})
sema.head()

Unnamed: 0,id,date,sema_negative_feelings
0,621e2f3967b776a240c654db,2021-05-24,0
11,621e362467b776a2404ad513,2021-05-24,0
10,621e301e67b776a240608a72,2021-05-24,0
9,621e30e467b776a240e817c7,2021-05-24,1
8,621e36f967b776a240e5e7c9,2021-05-24,1


In [41]:
data = data.merge(sema, how='left', on=['id','date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,ttm_stage,breq_self_determination,sema_negative_feelings
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,1349.0,0.0,0.0,<30,MALE,<19,,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,766.0,29.0,1.0,>=30,MALE,>=30,,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,1225.0,0.0,0.0,<30,FEMALE,<19,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,1130.0,9.0,0.0,<30,MALE,21.0,,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,<30,MALE,21.0,,,,


# personality

In [42]:
big5 = pd.read_pickle('../data/surveys/personality.pkl')
big5 = big5.rename(columns={"user_id": "id", "submitdate": "date"})
big5["date"] = pd.to_datetime(pd.to_datetime(big5["date"]).dt.date)
big5 = big5.sort_values(by='date', ascending=True)
big5 = big5.drop(columns=['type','extraversion', 'agreeableness', 'conscientiousness', 'stability', 'intellect',  'gender'])
big5.head()

Unnamed: 0,id,date,ipip_extraversion_category,ipip_agreeableness_category,ipip_conscientiousness_category,ipip_stability_category,ipip_intellect_category
0,621e2e8e67b776a24055b564,2021-05-31,LOW,LOW,HIGH,HIGH,AVERAGE
48,621e36c267b776a240ba2756,2021-05-31,HIGH,AVERAGE,LOW,HIGH,LOW
39,621e34db67b776a240c9c2be,2021-05-31,AVERAGE,HIGH,AVERAGE,HIGH,HIGH
3,621e2f3967b776a240c654db,2021-05-31,AVERAGE,AVERAGE,LOW,HIGH,AVERAGE
37,621e341067b776a24037b105,2021-05-31,LOW,LOW,LOW,LOW,LOW


In [43]:
data = data.merge(big5, how='left', on=['id','date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,bmi,heart_rate_alert,ttm_stage,breq_self_determination,sema_negative_feelings,ipip_extraversion_category,ipip_agreeableness_category,ipip_conscientiousness_category,ipip_stability_category,ipip_intellect_category
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,<19,,,,,,,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,>=30,,,,,,,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,<19,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,21.0,,,,,,,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,21.0,,,,,,,,,


# stai

In [44]:
stai = pd.read_pickle('../data/surveys/stai.pkl')
stai = stai.rename(columns={"user_id": "id", "submitdate": "date"})
stai["date"] = pd.to_datetime(pd.to_datetime(stai["date"]).dt.date)
stai = stai.sort_values(by='date', ascending=True)
stai = stai.drop(columns=['type', 'stai_stress'])
stai.head()

Unnamed: 0,id,date,stai_stress_category
0,621e2e8e67b776a24055b564,2021-05-31,Below average
0,621e328667b776a240281372,2021-05-31,Average
0,621e329067b776a2402ffad2,2021-05-31,Above average
3,621e30e467b776a240e817c7,2021-05-31,Above average
4,621e32af67b776a24045b4cf,2021-05-31,Above average


In [45]:
data = data.merge(stai, how='left', on=['id','date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,heart_rate_alert,ttm_stage,breq_self_determination,sema_negative_feelings,ipip_extraversion_category,ipip_agreeableness_category,ipip_conscientiousness_category,ipip_stability_category,ipip_intellect_category,stai_stress_category
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,,,,,,,,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,,,,,,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,,,,,,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,,,,


# panas

In [46]:
panas = pd.read_pickle('../data/surveys/panas_classification.pkl')
panas = panas.rename(columns={"user_id": "id", "submitdate": "date", "negative_affect_category":"panas_negative_affect"})
panas["date"] = pd.to_datetime(pd.to_datetime(panas["date"]).dt.date)
panas = panas.sort_values(by='date', ascending=True)
panas.head()

Unnamed: 0,id,date,panas_negative_affect
0,621e329067b776a2402ffad2,2021-05-31,Average
0,621e328667b776a240281372,2021-05-31,Above average
4,621e32af67b776a24045b4cf,2021-05-31,Above average
6,621e301e67b776a240608a72,2021-05-31,Average
2,621e32d967b776a240627414,2021-05-31,Average


In [47]:
data = data.merge(panas, how='left', on=['id','date'])
data.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,ttm_stage,breq_self_determination,sema_negative_feelings,ipip_extraversion_category,ipip_agreeableness_category,ipip_conscientiousness_category,ipip_stability_category,ipip_intellect_category,stai_stress_category,panas_negative_affect
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,,,,,,,,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,,,,,,,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,,,,,,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,,,,


In [48]:
data.to_pickle("../data_unprocessed.pkl")

----------------------------------------------
# Checking preprocessing steps 

In [22]:
data = pd.read_pickle('data/data_unprocessed.pkl')
data

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,ttm_stage,breq_self_determination,sema_negative_feelings,ipip_extraversion_category,ipip_agreeableness_category,ipip_conscientiousness_category,ipip_stability_category,ipip_intellect_category,stai_stress_category,panas_negative_affect
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,...,,,,,,,,,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,...,,,,,,,,,,
2,621e326767b776a24012e179,2021-05-24,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,...,,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,...,,,,,,,,,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7628,621e339967b776a240e502de,2021-11-20,,,,,,,,,...,,,0.0,,,,,,,
7629,621e30e467b776a240e817c7,2021-07-31,,,,,,,,,...,,,,,,,,,Average,Average
7630,621e351a67b776a240f6204b,2021-08-02,,,,,,,,,...,,,,,,,,,Below average,Below average
7631,621e328667b776a240281372,2021-09-19,,,,,,,,,...,,,,,,,,,Above average,Above average


In [23]:
#create_report(data).show_browser()

# date engineering

In [24]:
data = date_engineering(data)
data

Unnamed: 0,id,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,responsiveness_points_percentage,...,stai_stress_category,panas_negative_affect,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos
0,621e2e8e67b776a24055b564,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,0.866667,...,,,-7.071068e-01,0.000000,-0.844328,-9.884683e-01,-0.707107,1.000000,0.535827,0.151428
1,621e328667b776a240281372,,,,,,,,,,...,,,-7.071068e-01,0.000000,-0.844328,-9.884683e-01,-0.707107,1.000000,0.535827,0.151428
2,621e326767b776a24012e179,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,0.766667,...,,,-7.071068e-01,0.000000,-0.844328,-9.884683e-01,-0.707107,1.000000,0.535827,0.151428
3,621e332267b776a24092a584,,,,,,,,,,...,,,-7.071068e-01,0.000000,-0.844328,-9.884683e-01,-0.707107,1.000000,0.535827,0.151428
4,621e333567b776a240a0c217,,,,,,,,,,...,,,-7.071068e-01,0.000000,-0.844328,-9.884683e-01,-0.707107,1.000000,0.535827,0.151428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7628,621e339967b776a240e502de,,,,,,,,,,...,,,7.071068e-01,-0.974928,-0.844328,-7.907757e-01,-0.707107,-0.222521,0.535827,-0.612106
7629,621e30e467b776a240e817c7,,,,,,,,,,...,Average,Average,-7.071068e-01,-0.974928,0.951057,-2.449294e-16,0.707107,-0.222521,0.309017,1.000000
7630,621e351a67b776a240f6204b,,,,,,,,,,...,Below average,Below average,-2.449294e-16,0.000000,0.998027,3.943559e-01,1.000000,1.000000,0.062791,0.918958
7631,621e328667b776a240281372,,,,,,,,,,...,Above average,Above average,7.071068e-01,-0.781831,0.125333,-6.513725e-01,0.707107,0.623490,-0.992115,-0.758758


In [25]:
#create_report(data).show_browser()

# Οne-hot-encoding
CATEGORICAL VARIABLES
1. badgeType
2. activityType
3. mindfulness_session
4. age
5. gender
6. bmi
7. heart_rate_alert
8. ttm_stage
9. breq_self_determination
10. sema_negative_feelings
11. ipip_extraversion_category
12. ipip_agreeableness_category
13. ipip_conscientiousness_category
14. ipip_stability_category
15. ipip_intellect_category
16. stai_stress_category
17. panas_negative_affect

In [26]:
data = one_hot_encoding(data)
data

Unnamed: 0,id,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,responsiveness_points_percentage,...,Martial Arts,Run,Spinning,Sport,Swim,Treadmill,Walk,Weights,Workout,Yoga/Pilates
0,621e2e8e67b776a24055b564,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,0.866667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,621e328667b776a240281372,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,621e326767b776a24012e179,,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,0.766667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,621e332267b776a24092a584,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,621e333567b776a240a0c217,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7628,621e339967b776a240e502de,,,,,,,,,,...,,,,,,,,,,
7629,621e30e467b776a240e817c7,,,,,,,,,,...,,,,,,,,,,
7630,621e351a67b776a240f6204b,,,,,,,,,,...,,,,,,,,,,
7631,621e328667b776a240281372,,,,,,,,,,...,,,,,,,,,,


In [27]:
#create_report(data).show_browser()

------------------------------
# Post-preprocessing

1. Creates the 3 new columns about spo2, scl_avg, heart_rate_alert 
2. Replace outliers with NaNs
3. Replace NaN values with column's median for continuous features
4. Replace NaN values with column's more frequent occurrence for categorical features

In [28]:
# Creates 3 columns that represent if a user has tracked at least once its spo2 or eda or ecg
def use_EDA_SpO2_ECG(df):
    df['spo2_tracking'] = ""
    df['EDA_tracking'] = ""
    df['ECG_tracking'] = ""
    users = list(df['id'].unique())

    for user in users:
        user_df = df.loc[df['id'] == user]
        #spo2
        if user_df['spo2'].isnull().sum() == len(user_df):
            df.loc[df['id'] == user, 'spo2_tracking'] = 0
        else:
            df.loc[df['id'] == user, 'spo2_tracking'] = 1
        #EDA
        if user_df['scl_avg'].isnull().sum() == len(user_df):
            df.loc[df['id'] == user, 'EDA_tracking'] = 0
        else:
            df.loc[df['id'] == user, 'EDA_tracking'] = 1
        #ECG
        if user_df['heart_rate_alert'].isnull().sum() == len(user_df):
            df.loc[df['id'] == user, 'ECG_tracking'] = 0
        else:
            df.loc[df['id'] == user, 'ECG_tracking'] = 1
    return df

In [29]:
def post_preprocessing(df):

    df = use_EDA_SpO2_ECG(df)

    categorical = ['mindfulness_session','age','gender', 'bmi', 'heart_rate_alert', 'ttm_stage','breq_self_determination', 
               'sema_negative_feelings', 'ipip_extraversion_category', 'ipip_agreeableness_category',
               'ipip_conscientiousness_category', 'ipip_stability_category','ipip_intellect_category', 
               'stai_stress_category', 'panas_negative_affect', 'DAILY_FLOORS', 'DAILY_STEPS', 
               'GOAL_BASED_WEIGHT_LOSS', 'LIFETIME_DISTANCE', 'LIFETIME_FLOORS', 'LIFETIME_WEIGHT_GOAL_SETUP',
               'Aerobic Workout', 'Bike', 'Bootcamp', 'Circuit Training', 'Elliptical','Hike', 'Interval Workout', 
               'Martial Arts', 'Run', 'Spinning', 'Sport', 'Swim', 'Treadmill', 'Walk', 'Weights', 'Workout', 
               'Yoga/Pilates']

    # Replace outliers with NaNs
    
    # separately for each column in the dataframe
    columns = list(df.iloc[:, 1:].columns)  # excludes id
    # exclude categorical features
    for x in categorical:
        columns.remove(x)
    for col in columns:
        df[col] = df[col].mask(df[col].sub(df[col].mean()).div(df[col].std()).abs().gt(3))

    # Replace NaN values with column's median for continuous features
    columns = list(df.iloc[:, 1:].columns)  # excludes id 
    # exclude categorical features
    for x in categorical:
        columns.remove(x)
    for col in columns:
        df[col] = df[col].apply(pd.to_numeric, errors='coerce')
        df[col] = df[col].fillna(df[col].median())

    # Replace NaN values with column's more frequent occurrence for categorical features
    for col in categorical:
        df[col] = df[col].fillna(df[col].mode().iloc[0])
    
    return df

In [30]:
data = post_preprocessing(data)
data

Unnamed: 0,id,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,responsiveness_points_percentage,...,Sport,Swim,Treadmill,Walk,Weights,Workout,Yoga/Pilates,spo2_tracking,EDA_tracking,ECG_tracking
0,621e2e8e67b776a24055b564,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,0.866667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0
1,621e328667b776a240281372,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0
2,621e326767b776a24012e179,33.971195,46.120,53.968,95.3,14.4,80.0,0.666667,0.925,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1
3,621e332267b776a24092a584,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0
4,621e333567b776a240a0c217,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7628,621e339967b776a240e502de,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1
7629,621e30e467b776a240e817c7,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,0
7630,621e351a67b776a240f6204b,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1
7631,621e328667b776a240281372,33.971195,62.262,35.216,95.9,14.6,76.0,0.766667,0.725,0.766667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0


In [31]:
special_zeros = ['stress_score', 'sleep_points_percentage', 'exertion_points_percentage', 
                 'responsiveness_points_percentage']

for col in special_zeros:
    data.loc[data[col] == 0, col] = data[col].median()

In [32]:
data['EDA_tracking'].value_counts()

0.0    7633
Name: EDA_tracking, dtype: int64

In [33]:
data['heart_rate_alert'].value_counts()

0.0    7633
Name: heart_rate_alert, dtype: int64

In [34]:
# these columns have only 0 values
data = data.drop(columns=['heart_rate_alert', 'EDA_tracking'])

In [35]:
#create_report(data).show_browser()

In [36]:
data.to_pickle("data/data_preprocessed.pkl")