Connect securely to the database

In [None]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient

# Read the MongoDB credentials from the local env file so they never appear
# in the notebook itself.
load_dotenv("config.env")
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")

# Fail early with a readable message instead of the `str + None` TypeError
# that string concatenation would raise when a variable is not set.
if not MONGO_USER or not MONGO_PASSWORD:
    raise RuntimeError(
        "MONGO_USER and MONGO_PASSWORD must be defined (e.g. in config.env)"
    )

# Credentials are passed as keyword arguments rather than spliced into the
# URI: characters such as '@', ':' or '/' in a password would otherwise need
# percent-escaping and make the URI invalid.
# NOTE(review): the values in config.env must therefore be stored raw
# (not percent-encoded) -- confirm.
client = MongoClient(
    host="localhost",
    port=27017,
    username=MONGO_USER,
    password=MONGO_PASSWORD,
)
database = client.rais

Find all the users

In [None]:
# Every distinct value of the "id" field in the fitbit collection.
users = database["fitbit"].distinct("id")

Create the stress score dataframe

In [None]:
import warnings
import statistics
import pandas as pd
from datetime import timedelta
from pandas.core.common import SettingWithCopyWarning
# Chained-assignment warnings stay silenced for the rest of the kernel session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Experiment rounds as (start, end) date bounds; both ends are exclusive.
EXPERIMENT_WINDOWS = (
    ("2021-05-23", "2021-07-26"),
    ("2021-11-14", "2022-01-17"),
)
# Weekly frequency: a user needs at least one week's worth of distinct days.
MIN_DAYS_WITH_DATA = 7
# Column layout of the resulting dataframe.
STRESS_COLUMNS = [
    "id",
    "date",
    "stress_score",
    "average_stress_score",
    "median_stress_score",
    "mean_variation",
    "median_variation",
    "passive_median_stress_score",
    "passive_median_variation",
]


def _within_experiment(dates):
    """Return a boolean mask that is True where a date lies strictly inside an experiment window."""
    mask = pd.Series(False, index=dates.index, dtype=bool)
    for start, end in EXPERIMENT_WINDOWS:
        mask |= (dates > start) & (dates < end)
    return mask


def _daily_scores(raw):
    """Turn the raw query result into one (date, stress_score) row per day inside the experiment."""
    daily = pd.DataFrame({
        "date": raw["data"].apply(lambda d: d["DATE"]),
        "stress_score": raw["data"].apply(lambda d: d["STRESS_SCORE"]),
    })
    # drop the time of day so that all readings of one day share the same key
    daily["date"] = pd.to_datetime(pd.to_datetime(daily["date"]).dt.date)
    daily = daily.loc[_within_experiment(daily["date"])]
    # remove duplicates: keep the first reading of each day (result is sorted by date)
    return daily.groupby("date", as_index=False).first()


def _weekly_scores(daily, user):
    """Aggregate one user's daily scores into weekly medians plus baseline statistics.

    `daily` must be sorted by date with one row per day. The returned frame has
    the columns listed in STRESS_COLUMNS, or is empty when no week survives.
    """
    # End on the last Sunday so that only whole weeks are aggregated. The days
    # after it are dropped; no placeholder row is added for a missing Sunday,
    # because a 0 score would take part in (and bias) that week's median.
    last_day = daily["date"].iloc[-1]
    if last_day.dayofweek != 6:  # 6 == Sunday
        last_sunday = last_day - timedelta(days=last_day.isoweekday() % 7)
        daily = daily.loc[daily["date"] <= last_sunday]

    # weekly median over Monday..Sunday bins, labelled with the closing Sunday
    weekly = daily.set_index("date").resample("W-SUN").median().reset_index()
    weekly.insert(0, "id", user)

    # shift the label one day later (the Monday after the week) for better representation
    weekly["date"] = weekly["date"] + timedelta(days=1)

    # keep only weeks whose label is inside the experiment, then drop NaNs & 0s
    weekly = weekly.loc[_within_experiment(weekly["date"])].dropna()
    weekly = weekly.loc[weekly["stress_score"] != 0].reset_index(drop=True)
    if weekly.empty:
        return weekly

    # scores with 2 decimals, as floats
    scores = weekly["stress_score"].round(2)
    weekly["stress_score"] = scores

    # per-user baselines and the deviation of each week from them
    weekly["average_stress_score"] = scores.mean()
    weekly["median_stress_score"] = scores.median()
    weekly["mean_variation"] = scores - weekly["average_stress_score"]
    weekly["median_variation"] = scores - weekly["median_stress_score"]

    # "Passive" baseline: the median of all weeks seen so far (current week
    # included). Computed in one vectorised step instead of element-wise
    # chained assignment, which is a silent no-op under pandas Copy-on-Write.
    # NOTE(review): the previous loop used statistics.mean although the column
    # is named (and reported as) a median -- confirm median is what is wanted.
    weekly["passive_median_stress_score"] = scores.expanding().median()
    weekly["passive_median_variation"] = scores - weekly["passive_median_stress_score"]
    return weekly


weekly_frames = []
for user in users:
    # obtain the stress data
    user_data = pd.DataFrame(list(
        database.fitbit.find(
            {"$and": [
                {"type": "Stress Score"},
                {"id": user},
                {"data.STRESS_SCORE": {"$gt": 0}},  # this removes the calculation fail data (0s)
            ]},
            {"id": 1, "data.DATE": 1, "data.STRESS_SCORE": 1, "_id": 0},
        )
    ))

    # maintain only the users that have stress data / avoid the others
    if "data" not in user_data.columns:
        continue

    user_daily = _daily_scores(user_data)
    if len(user_daily) < MIN_DAYS_WITH_DATA:
        continue

    user_weekly = _weekly_scores(user_daily, user)
    if not user_weekly.empty:
        weekly_frames.append(user_weekly)

# Concatenate once at the end instead of growing the frame inside the loop.
if weekly_frames:
    stress_df = pd.concat(weekly_frames, axis=0)
else:
    stress_df = pd.DataFrame(columns=STRESS_COLUMNS)
stress_df["id"] = stress_df["id"].astype(str)
stress_df

Create the STAI (State-Trait Anxiety Inventory) dataframe

In [None]:
# Load the STAI survey export: drop the first and the last column and the
# "type" column, then align the key names with stress_df ("id", "date").
stai_df = (
    pd.read_csv('stai.csv')
    .iloc[:, 1:-1]
    .drop(columns=["type"])
    .rename(columns={"user_id": "id", "submitdate": "date"})
    # keep only the calendar day of the submission timestamp
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
)
stai_df

Create the PANAS (Positive and Negative Affect Schedule) dataframe

In [None]:
# Load the PANAS survey export: drop the first column and the "type" column,
# then align the key names with stress_df ("id", "date").
panas_df = (
    pd.read_csv('panas.csv')
    .iloc[:, 1:]
    .drop(columns=["type"])
    .rename(columns={"user_id": "id", "submitdate": "date"})
    # keep only the calendar day of the submission timestamp
    .assign(date=lambda frame: pd.to_datetime(pd.to_datetime(frame["date"]).dt.date))
)
panas_df

Merge stress scores and stai dataframe

In [None]:
# Attach the STAI answers to the weekly stress rows of the same user and date.
# Rows left with any NaN (no stress score or no survey response) are removed.
df_fitbit_stai = (
    pd.merge(stress_df, stai_df, how="left", on=["id", "date"])
    .dropna()
    .reset_index(drop=True)
)
df_fitbit_stai

Merge stress scores and panas dataframe

In [None]:
# Attach the PANAS answers to the weekly stress rows of the same user and date.
# Rows left with any NaN (no stress score or no survey response) are removed.
df_fitbit_panas = (
    pd.merge(stress_df, panas_df, how="left", on=["id", "date"])
    .dropna()
    .reset_index(drop=True)
)
df_fitbit_panas

Scatter plot with stress score and stai score

In [None]:
# `import scipy` on its own does not guarantee that the `scipy.stats`
# submodule is loaded. Importing the submodule explicitly still binds the
# name `scipy`, so every `scipy.stats.*` call in the following cells works
# regardless of the installed SciPy version.
import scipy.stats
import matplotlib.pyplot as plt

# Paired observations: one weekly stress score and one STAI score per row.
stress_scores = list(df_fitbit_stai['stress_score'])
stai_scores = list(df_fitbit_stai['stai_stress'])

plt.scatter(stress_scores, stai_scores)
plt.xlabel("Stress scores")
plt.ylabel("Stai scores")
plt.title("Scatter plot")
plt.show()

Find the correlation between Fitbit stress score and STAI score

In [None]:
# Pearson is handled on its own: pearsonr's result unpacks into the
# coefficient and a p-value, and only this p-value is kept (in `p`).
print("Pearson correlation")
corr, p = scipy.stats.pearsonr(stress_scores, stai_scores)
print("Stress scores", corr, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr = rank_test(stress_scores, stai_scores).correlation
    print("Stress scores", corr, "\n")

Find the correlation between median variation from user baseline and stai score

In [None]:
# Deviation of each weekly score from the user's overall median.
median_variations = list(df_fitbit_stai['median_variation'])

# Pearson is handled on its own: only its p-value is kept (in `p_median`).
print("Pearson correlation")
corr_median, p_median = scipy.stats.pearsonr(median_variations, stai_scores)
print("Median variation", corr_median, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr_median = rank_test(median_variations, stai_scores).correlation
    print("Median variation", corr_median, "\n")

Find the correlation between mean variation from user baseline and stai score

In [None]:
# Deviation of each weekly score from the user's overall mean.
mean_variations = list(df_fitbit_stai['mean_variation'])

# Pearson is handled on its own: only its p-value is kept (in `p_mean`).
print("Pearson correlation")
corr_mean, p_mean = scipy.stats.pearsonr(mean_variations, stai_scores)
print("Mean variation", corr_mean, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr_mean = rank_test(mean_variations, stai_scores).correlation
    print("Mean variation", corr_mean, "\n")

Find the correlation between passive median variation from user baseline and stai score

In [None]:
# Deviation of each weekly score from the user's running ("passive") baseline.
passive_variations = list(df_fitbit_stai['passive_median_variation'])

# Pearson is handled on its own: only its p-value is kept (in `p_passive`).
print("Pearson correlation")
corr_passive, p_passive = scipy.stats.pearsonr(passive_variations, stai_scores)
print("Passive median variation", corr_passive, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr_passive = rank_test(passive_variations, stai_scores).correlation
    print("Passive median variation", corr_passive, "\n")

Scatter plot with stress score and panas positive score

In [None]:
# Paired observations taken from the PANAS merge. Note that `stress_scores`
# is rebound here: from this cell on it refers to the rows of df_fitbit_panas,
# no longer to those of df_fitbit_stai.
panas_positive_scores = list(df_fitbit_panas['positive_affect_score'])
stress_scores = list(df_fitbit_panas['stress_score'])

plt.scatter(x=stress_scores, y=panas_positive_scores)
plt.title("Scatter plot")
plt.xlabel("Stress scores")
plt.ylabel("Panas positive scores")
plt.show()

Find the correlation between stress score and panas positive score

In [None]:
# Pearson is handled on its own: pearsonr's result unpacks into the
# coefficient and a p-value, and only this p-value is kept (in `p`).
print("Pearson correlation")
corr, p = scipy.stats.pearsonr(stress_scores, panas_positive_scores)
print("Stress scores", corr, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr = rank_test(stress_scores, panas_positive_scores).correlation
    print("Stress scores", corr, "\n")

Scatter plot with stress score and panas negative score

In [None]:
# Negative-affect scores, row-aligned with the `stress_scores` list that was
# built from df_fitbit_panas in the positive-affect cell.
panas_negative_scores = list(df_fitbit_panas['negative_affect_score'])

plt.scatter(x=stress_scores, y=panas_negative_scores)
plt.title("Scatter plot")
plt.xlabel("Stress scores")
plt.ylabel("Panas negative scores")
plt.show()

Find the correlation between stress score and panas negative score

In [None]:
# Pearson is handled on its own: pearsonr's result unpacks into the
# coefficient and a p-value, and only this p-value is kept (in `p`).
print("Pearson correlation")
corr, p = scipy.stats.pearsonr(stress_scores, panas_negative_scores)
print("Stress scores", corr, "\n")

# The two rank-based coefficients expose their value as `.correlation`.
for heading, rank_test in (
    ("Spearman correlation", scipy.stats.spearmanr),
    ("Kendall correlation", scipy.stats.kendalltau),
):
    print(heading)
    corr = rank_test(stress_scores, panas_negative_scores).correlation
    print("Stress scores", corr, "\n")

Extra analysis/visualizations

In [None]:
from dataprep.eda import create_report
# Build the dataprep EDA report for the merged Fitbit/STAI frame and open it
# in a browser.
eda_report = create_report(df_fitbit_stai)
eda_report.show_browser()

In [None]:
from pandas_profiling import ProfileReport
# Profile the merged Fitbit/STAI frame and embed the report in the notebook.
profile = ProfileReport(
    df_fitbit_stai,
    title="Pandas Profiling Report",
)
profile.to_notebook_iframe()