In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

import sys
import os.path as osp

# Make the sibling source directory importable from this notebook.
SRC_SUBDIR = osp.abspath('../src/')
if SRC_SUBDIR not in sys.path:
    # Slot the absolute path in at position 1, just after the first entry.
    sys.path[1:1] = [SRC_SUBDIR]


In [2]:
# Load the processed feature table and put it in chronological order.
# Every train/test split further down is positional (first 80% of rows vs.
# the last 20%), so the row order has to follow Record_Date for the hold-out
# rows to be the most recent days. Record_Date appears as YYYY-MM-DD text in
# the preview, which sorts correctly as a string.
# NOTE(review): assumes every Record_Date uses that format - confirm.
# mergesort is stable, so rows sharing a date keep their file order.
df = (
    pd.read_csv("../data/processed/features_dataset.csv")
    .sort_values('Record_Date', kind='mergesort')
    .reset_index(drop=True)
)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,...,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1,Yoga_Streak,Mindfulness Walk_Streak,Mindfulness
0,2022-10-27,2.833333,,,,1.0,,,,,...,,,,,,,,,,0.0
1,2022-10-28,3.125,,,,0.0,,,,,...,,1.0,,,,,,,,0.0
2,2022-10-29,3.375,,,,0.0,,,,,...,,0.0,,,,,,,,0.0
3,2022-10-30,3.571429,,,,0.0,,,,,...,,0.0,,,,,,,,0.0
4,2022-10-31,3.2,,,,0.0,,,,,...,,0.0,,,,,,,,0.0


In [4]:
# Collect, in column order, every column whose name contains 'lag1'.
lagged_1day_features = [name for name in df.columns if 'lag1' in name]


In [5]:
# Activity columns used as predictors in the first model.
# The order here is the column order of the design matrix.
original_features = [
    # the three 'Eat *' columns
    'Eat Dinner', 'Eat Breakfast', 'Eat Lunch',
    # remaining activity columns
    'Gratitude Jar', 'Meditation', 'Mindfulness Walk',
    'Read For Fun', 'Take A Shower', 'Yoga',
]


### Linear Regression Analysis

In [6]:
# Start with the original activity features.
# 'value' (the target) has to be in the list for the column selection below;
# a later cell removes it again. The membership check keeps a re-run of this
# cell from appending a second 'value', which would select the target column
# twice.
if 'value' not in original_features:
    original_features.append('value')
df_original = df[original_features]
df_original.head()

Unnamed: 0,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga,value
0,,,,1.0,,,,,,2.833333
1,,,,0.0,,,,,,3.125
2,,,,0.0,,,,,,3.375
3,,,,0.0,,,,,,3.571429
4,,,,0.0,,,,,,3.2


In [7]:
# Keep only the rows where the target and every selected feature are present.
df_original = df_original.dropna()
# Restore original_features to predictor names only. Guarded so that running
# this cell a second time does not raise ValueError from list.remove.
if 'value' in original_features:
    original_features.remove('value')

In [8]:
# Positional hold-out: the first 80% of rows train the model and the last 20%
# test it. Nothing is shuffled or sorted in this cell, so the split is only
# chronological if df_original's rows already follow Record_Date.
# NOTE(review): confirm the frame is ordered by Record_Date upstream.
train_size = int(len(df_original) * 0.8)
train_df, test_df = df_original.iloc[:train_size], df_original.iloc[train_size:]

# Separate the predictors from the target column 'value'.
X_train, y_train = train_df.drop(columns=['value']), train_df['value']
X_test, y_test = test_df.drop(columns=['value']), test_df['value']

In [9]:
# Fit a scikit-learn linear regression on the training split.
model1 = LinearRegression()
model1.fit(X=X_train, y=y_train)

In [10]:
# Score the fitted model on the held-out rows.
y_pred = model1.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)

R² Score: -0.21258289151315624
MSE: 0.2900115112082485


In [11]:
# Refit with statsmodels OLS to inspect coefficient significance.
# add_constant supplies the intercept term, i.e. baseline mood when I didn't
# do any of the activities that day.
X_with_const = sm.add_constant(X_train)
# NOTE: this rebinds model1 from the scikit-learn estimator fitted earlier to
# the statsmodels results object.
model1 = sm.OLS(endog=y_train, exog=X_with_const).fit()
model1.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.104
Model:,OLS,Adj. R-squared:,0.069
Method:,Least Squares,F-statistic:,3.019
Date:,"Wed, 16 Jul 2025",Prob (F-statistic):,0.00197
Time:,14:39:23,Log-Likelihood:,-174.16
No. Observations:,245,AIC:,368.3
Df Residuals:,235,BIC:,403.3
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0538,0.261,11.721,0.000,2.541,3.567
Eat Dinner,0.1743,0.116,1.499,0.135,-0.055,0.403
Eat Breakfast,0.1326,0.231,0.573,0.567,-0.323,0.588
Eat Lunch,0.1799,0.077,2.329,0.021,0.028,0.332
Gratitude Jar,0.1348,0.165,0.817,0.415,-0.190,0.460
Meditation,0.0015,0.087,0.017,0.986,-0.169,0.172
Mindfulness Walk,0.2922,0.077,3.816,0.000,0.141,0.443
Read For Fun,-0.0018,0.077,-0.024,0.981,-0.153,0.149
Take A Shower,-0.1189,0.111,-1.075,0.284,-0.337,0.099

0,1,2,3
Omnibus:,48.059,Durbin-Watson:,1.739
Prob(Omnibus):,0.0,Jarque-Bera (JB):,81.979
Skew:,-1.06,Prob(JB):,1.58e-18
Kurtosis:,4.88,Cond. No.,24.6


### Linear Regression Analysis: Original Model with Yoga Streak

In [12]:
# Columns for the second model: predictors first, then the target 'value'.
features2 = [
    'Eat Breakfast', 'Eat Lunch', 'Eat Dinner_lag1',
    'Yoga_Streak', 'Mindfulness Walk',
    'value',
]

In [13]:
# Select the columns for the second model. 'value' is in features2 when the
# list is first defined, but a later cell removes it; put it back if it is
# missing so a re-run of this cell still selects the target column.
if 'value' not in features2:
    features2.append('value')
df_original2 = df[features2]
df_original2.head()

Unnamed: 0,Eat Breakfast,Eat Lunch,Eat Dinner_lag1,Yoga_Streak,Mindfulness Walk,value
0,,,,,,2.833333
1,,,,,,3.125
2,,,,,,3.375
3,,,,,,3.571429
4,,,,,,3.2


In [14]:
# Keep only the rows where the target and every selected feature are present.
df_original2 = df_original2.dropna()
# Leave features2 holding predictor names only. Guarded so that running this
# cell a second time does not raise ValueError from list.remove.
if 'value' in features2:
    features2.remove('value')

In [15]:
# Positional hold-out: the first 80% of rows train the model and the last 20%
# test it. Nothing is shuffled or sorted in this cell, so the split is only
# chronological if df_original2's rows already follow Record_Date.
# NOTE(review): confirm the frame is ordered by Record_Date upstream.
train_size = int(len(df_original2) * 0.8)
train_df, test_df = df_original2.iloc[:train_size], df_original2.iloc[train_size:]

# Separate the predictors from the target column 'value'.
X_train, y_train = train_df.drop(columns=['value']), train_df['value']
X_test, y_test = test_df.drop(columns=['value']), test_df['value']

In [16]:
# Fit a scikit-learn linear regression on the training split.
model2 = LinearRegression()
model2.fit(X=X_train, y=y_train)

In [17]:
# Score the fitted model on the held-out rows.
y_pred = model2.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)

R² Score: -0.12735303252925223
MSE: 0.2511735523235287


In [18]:
# Refit with statsmodels OLS to inspect coefficient significance.
# add_constant supplies the intercept term, i.e. baseline mood when I didn't
# do any of the activities that day.
X_with_const = sm.add_constant(X_train)
# NOTE: this rebinds model2 from the scikit-learn estimator fitted earlier to
# the statsmodels results object; a later cell reads model2.pvalues.
model2 = sm.OLS(endog=y_train, exog=X_with_const).fit()
model2.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.079
Model:,OLS,Adj. R-squared:,0.069
Method:,Least Squares,F-statistic:,7.577
Date:,"Wed, 16 Jul 2025",Prob (F-statistic):,7.66e-07
Time:,14:39:24,Log-Likelihood:,-328.21
No. Observations:,446,AIC:,668.4
Df Residuals:,440,BIC:,693.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.1838,0.178,17.862,0.000,2.833,3.534
Eat Breakfast,0.1695,0.163,1.040,0.299,-0.151,0.490
Eat Lunch,0.1592,0.060,2.635,0.009,0.040,0.278
Eat Dinner_lag1,0.0012,0.070,0.018,0.986,-0.136,0.138
Yoga_Streak,0.0429,0.028,1.544,0.123,-0.012,0.098
Mindfulness Walk,0.2387,0.058,4.090,0.000,0.124,0.353

0,1,2,3
Omnibus:,111.155,Durbin-Watson:,1.73
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.625
Skew:,-1.28,Prob(JB):,2.29e-55
Kurtosis:,5.644,Cond. No.,20.4


In [19]:
# Pairwise correlation matrix of the feature columns currently in X_train.
X_train.corr()

Unnamed: 0,Eat Breakfast,Eat Lunch,Eat Dinner_lag1,Yoga_Streak,Mindfulness Walk
Eat Breakfast,1.0,-0.045105,0.025544,0.004989,0.021311
Eat Lunch,-0.045105,1.0,0.090676,0.181772,0.239312
Eat Dinner_lag1,0.025544,0.090676,1.0,0.07521,0.073654
Yoga_Streak,0.004989,0.181772,0.07521,1.0,0.122363
Mindfulness Walk,0.021311,0.239312,0.073654,0.122363,1.0


In [20]:
# Report the OLS p-value of the 'Mindfulness Walk' coefficient.
walk_pvalue = model2.pvalues['Mindfulness Walk']
print("Mindfulness Walk p-value: " + str(walk_pvalue))

Mindfulness Walk p-value: 5.126109463448978e-05


### Linear Regression Analysis: 1 day lagged features

In [21]:
# Now look at the 1-day lagged features.
# 'value' (the target) has to be in the list for the column selection below;
# a later cell removes it again. The membership check keeps a re-run of this
# cell from appending a second 'value', which would select the target column
# twice.
if 'value' not in lagged_1day_features:
    lagged_1day_features.append('value')
df_lag = df[lagged_1day_features]
df_lag.head()

Unnamed: 0,Eat Dinner_lag1,Eat Breakfast_lag1,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1,value
0,,,,,,,,,,2.833333
1,,,,1.0,,,,,,3.125
2,,,,0.0,,,,,,3.375
3,,,,0.0,,,,,,3.571429
4,,,,0.0,,,,,,3.2


In [22]:
# Drop every row that has a missing value in any of the selected columns.
df_lag = df_lag.dropna()

In [23]:
# Leave lagged_1day_features holding predictor names only. Guarded so that
# running this cell a second time does not raise ValueError from list.remove.
if 'value' in lagged_1day_features:
    lagged_1day_features.remove('value')

In [24]:
# Positional hold-out: the first 80% of rows train the model and the last 20%
# test it. Nothing is shuffled or sorted in this cell, so the split is only
# chronological if df_lag's rows already follow Record_Date.
# NOTE(review): confirm the frame is ordered by Record_Date upstream.
train_size = int(len(df_lag) * 0.8)
train_df, test_df = df_lag.iloc[:train_size], df_lag.iloc[train_size:]

# Separate the predictors from the target column 'value'.
X_train, y_train = train_df.drop(columns=['value']), train_df['value']
X_test, y_test = test_df.drop(columns=['value']), test_df['value']

In [25]:
# Fit a scikit-learn linear regression on the training split.
model_lag = LinearRegression()
model_lag.fit(X=X_train, y=y_train)

In [26]:
# Score the fitted model on the held-out rows.
y_pred = model_lag.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)

R² Score: -0.13682274029686958
MSE: 0.27189207698451207


In [27]:
# Refit with statsmodels OLS to inspect coefficient significance.
# add_constant supplies the intercept term, i.e. baseline mood when I didn't
# do any of the activities that day.
X_with_const = sm.add_constant(X_train)
# NOTE: this rebinds model_lag from the scikit-learn estimator fitted earlier
# to the statsmodels results object.
model_lag = sm.OLS(endog=y_train, exog=X_with_const).fit()
model_lag.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.052
Model:,OLS,Adj. R-squared:,0.016
Method:,Least Squares,F-statistic:,1.438
Date:,"Wed, 16 Jul 2025",Prob (F-statistic):,0.173
Time:,14:39:24,Log-Likelihood:,-180.45
No. Observations:,244,AIC:,380.9
Df Residuals:,234,BIC:,415.9
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9660,0.268,11.056,0.000,2.437,3.495
Eat Dinner_lag1,0.0762,0.120,0.637,0.525,-0.160,0.312
Eat Breakfast_lag1,0.6147,0.238,2.582,0.010,0.146,1.084
Eat Lunch_lag1,-0.0886,0.080,-1.112,0.267,-0.245,0.068
Gratitude Jar_lag1,-0.1041,0.170,-0.613,0.540,-0.439,0.230
Meditation_lag1,0.0313,0.089,0.352,0.725,-0.144,0.207
Mindfulness Walk_lag1,0.1489,0.079,1.888,0.060,-0.006,0.304
Read For Fun_lag1,-0.0510,0.079,-0.644,0.520,-0.207,0.105
Take A Shower_lag1,-0.0139,0.114,-0.122,0.903,-0.238,0.211

0,1,2,3
Omnibus:,64.787,Durbin-Watson:,1.8
Prob(Omnibus):,0.0,Jarque-Bera (JB):,135.228
Skew:,-1.299,Prob(JB):,4.3199999999999995e-30
Kurtosis:,5.56,Cond. No.,24.5


### Linear Regression Analysis: 'Mindfulness' Model

In [28]:
# 'Mindfulness' model: the composite 'Mindfulness' column together with the
# three 'Eat *' columns and 'Take A Shower'. The target 'value' is listed
# first so it is selected alongside the predictors.
mindful_features = [
    'value',
    'Mindfulness',
    'Eat Breakfast', 'Eat Dinner', 'Eat Lunch',
    'Take A Shower',
]
df_mindful = df[mindful_features]
df_mindful.head()

Unnamed: 0,value,Mindfulness,Eat Breakfast,Eat Dinner,Eat Lunch,Take A Shower
0,2.833333,0.0,,,,
1,3.125,0.0,,,,
2,3.375,0.0,,,,
3,3.571429,0.0,,,,
4,3.2,0.0,,,,


In [29]:
# Drop every row that has a missing value in any of the selected columns.
df_mindful = df_mindful.dropna()

In [30]:
# Leave mindful_features holding predictor names only. Guarded so that
# running this cell a second time does not raise ValueError from list.remove.
if 'value' in mindful_features:
    mindful_features.remove('value')

In [31]:
# Preview the first rows that remain after dropping incomplete rows.
df_mindful.head()

Unnamed: 0,value,Mindfulness,Eat Breakfast,Eat Dinner,Eat Lunch,Take A Shower
528,4.0,0.0,0.0,1.0,0.0,1.0
529,4.0,1.0,1.0,1.0,0.0,1.0
530,4.0,1.0,1.0,1.0,1.0,1.0
531,3.75,1.0,1.0,1.0,1.0,1.0
532,3.5,1.0,1.0,1.0,1.0,1.0


In [32]:
# Positional hold-out: the first 80% of rows train the model and the last 20%
# test it. Nothing is shuffled or sorted in this cell, so the split is only
# chronological if df_mindful's rows already follow Record_Date.
# NOTE(review): confirm the frame is ordered by Record_Date upstream.
train_size = int(len(df_mindful) * 0.8)
train_df, test_df = df_mindful.iloc[:train_size], df_mindful.iloc[train_size:]

# Separate the predictors from the target column 'value'.
X_train, y_train = train_df.drop(columns=['value']), train_df['value']
X_test, y_test = test_df.drop(columns=['value']), test_df['value']

In [33]:
# Fit a scikit-learn linear regression on the training split.
model_mindful = LinearRegression()
model_mindful.fit(X=X_train, y=y_train)

In [34]:
# Score the fitted model on the held-out rows.
y_pred = model_mindful.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)

R² Score: -0.20801737862695968
MSE: 0.28891958479164337


In [35]:
# Refit with statsmodels OLS to inspect coefficient significance.
# add_constant supplies the intercept term, i.e. baseline mood when I didn't
# do any of the activities that day.
X_with_const = sm.add_constant(X_train)
# NOTE: this rebinds model_mindful from the scikit-learn estimator fitted
# earlier to the statsmodels results object.
model_mindful = sm.OLS(endog=y_train, exog=X_with_const).fit()
model_mindful.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.067
Model:,OLS,Adj. R-squared:,0.047
Method:,Least Squares,F-statistic:,3.416
Date:,"Wed, 16 Jul 2025",Prob (F-statistic):,0.00532
Time:,14:39:24,Log-Likelihood:,-179.11
No. Observations:,245,AIC:,370.2
Df Residuals:,239,BIC:,391.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.1184,0.261,11.964,0.000,2.605,3.632
Mindfulness,0.2477,0.099,2.490,0.013,0.052,0.444
Eat Breakfast,0.0852,0.232,0.367,0.714,-0.372,0.543
Eat Dinner,0.1419,0.117,1.216,0.225,-0.088,0.372
Eat Lunch,0.2090,0.076,2.741,0.007,0.059,0.359
Take A Shower,-0.0951,0.110,-0.861,0.390,-0.313,0.122

0,1,2,3
Omnibus:,56.04,Durbin-Watson:,1.704
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103.726
Skew:,-1.182,Prob(JB):,2.99e-23
Kurtosis:,5.139,Cond. No.,22.8


### Linear Regression Analysis: 'Streaks' Model

In [36]:
# 'Streaks' model: the target 'value' plus the two *_Streak columns only.
# The 'Eat Breakfast', 'Eat Dinner' and 'Eat Lunch' columns are not part of
# this feature set.
streak_features = ['value', 'Mindfulness Walk_Streak', 'Yoga_Streak']
df_streak = df[streak_features]
df_streak.head()

Unnamed: 0,value,Mindfulness Walk_Streak,Yoga_Streak
0,2.833333,,
1,3.125,,
2,3.375,,
3,3.571429,,
4,3.2,,


In [37]:
# Keep only the rows where the target and both streak columns are present.
df_streak = df_streak.dropna()
# Leave streak_features holding predictor names only. Guarded so that running
# this cell a second time does not raise ValueError from list.remove.
if 'value' in streak_features:
    streak_features.remove('value')

In [38]:
# Preview the first rows that remain after dropping incomplete rows.
df_streak.head()

Unnamed: 0,value,Mindfulness Walk_Streak,Yoga_Streak
274,3.5,0.0,0.0
275,3.25,1.0,0.0
276,3.5,2.0,1.0
277,3.0,0.0,0.0
278,3.5,1.0,0.0


In [39]:
# Positional hold-out: the first 80% of rows train the model and the last 20%
# test it. Nothing is shuffled or sorted in this cell, so the split is only
# chronological if df_streak's rows already follow Record_Date.
# NOTE(review): confirm the frame is ordered by Record_Date upstream.
train_size = int(len(df_streak) * 0.8)
train_df, test_df = df_streak.iloc[:train_size], df_streak.iloc[train_size:]

# Separate the predictors from the target column 'value'.
X_train, y_train = train_df.drop(columns=['value']), train_df['value']
X_test, y_test = test_df.drop(columns=['value']), test_df['value']

In [40]:
# Fit a scikit-learn linear regression on the training split.
model_streak = LinearRegression()
model_streak.fit(X=X_train, y=y_train)

In [41]:
# Score the fitted model on the held-out rows.
y_pred = model_streak.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² Score:", test_r2)
test_mse = mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)

R² Score: -0.1387036662514476
MSE: 0.2541826364993293


In [42]:
# Refit with statsmodels OLS to inspect coefficient significance.
# add_constant supplies the intercept term, i.e. baseline mood when I didn't
# do any of the activities that day.
X_with_const = sm.add_constant(X_train)
# NOTE: this rebinds model_streak from the scikit-learn estimator fitted
# earlier to the statsmodels results object.
model_streak = sm.OLS(endog=y_train, exog=X_with_const).fit()
model_streak.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.035
Model:,OLS,Adj. R-squared:,0.03
Method:,Least Squares,F-statistic:,8.012
Date:,"Wed, 16 Jul 2025",Prob (F-statistic):,0.000382
Time:,14:39:24,Log-Likelihood:,-339.51
No. Observations:,448,AIC:,685.0
Df Residuals:,445,BIC:,697.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.5925,0.032,111.599,0.000,3.529,3.656
Mindfulness Walk_Streak,0.0159,0.005,3.123,0.002,0.006,0.026
Yoga_Streak,0.0536,0.029,1.868,0.062,-0.003,0.110

0,1,2,3
Omnibus:,130.315,Durbin-Watson:,1.78
Prob(Omnibus):,0.0,Jarque-Bera (JB):,343.423
Skew:,-1.42,Prob(JB):,2.67e-75
Kurtosis:,6.214,Cond. No.,8.87
