In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

import sys
import os.path as osp

# Resolve the project's source directory to an absolute path and put it on
# sys.path so modules located there can be imported from this notebook.
SRC_SUBDIR = osp.abspath('../src/')
if SRC_SUBDIR not in sys.path:
    # Index 1 leaves the existing first sys.path entry ahead of this directory.
    sys.path.insert(1, SRC_SUBDIR)


In [2]:
# Load the processed feature dataset; the path is relative to the notebook's
# working directory.
features_csv = "../data/processed/features_dataset.csv"
df = pd.read_csv(features_csv)

In [3]:
# Preview the first five rows as a quick check of the loaded frame.
df.head(n=5)

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,...,Eat Dinner_lag1,Eat Breakfast_lag1,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1,Mindfulness
0,2022-10-27,2.833333,,,,1.0,,,,,...,,,,,,,,,,0
1,2022-10-28,3.125,,,,0.0,,,,,...,,,,1.0,,,,,,0
2,2022-10-29,3.375,,,,0.0,,0.0,,,...,,,,0.0,,,,,,0
3,2022-10-30,3.571429,,,,0.0,,0.0,,,...,,,,0.0,,0.0,,,,0
4,2022-10-31,3.2,,,,0.0,,0.0,,,...,,,,0.0,,0.0,,,,0


In [4]:
# Partition the columns by name: anything containing 'lag1' is a 1-day lagged
# feature; every other column goes into the original list.
lagged_1day_features = [name for name in df.columns if 'lag1' in name]
original_features = [name for name in df.columns if 'lag1' not in name]

# 'Mindfulness' is excluded from the original feature list.
original_features.remove('Mindfulness')
# 'Record_Date' and 'value' are intentionally not removed here.


### Linear Regression Analysis: original features

In [5]:
# Begin with the non-lagged columns only.
df_original = df.loc[:, original_features]
df_original.head(n=5)

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,2.833333,,,,1.0,,,,,
1,2022-10-28,3.125,,,,0.0,,,,,
2,2022-10-29,3.375,,,,0.0,,0.0,,,
3,2022-10-30,3.571429,,,,0.0,,0.0,,,
4,2022-10-31,3.2,,,,0.0,,0.0,,,


In [6]:
# Keep only the rows that have a value in every selected column.
df_original = df_original.dropna(axis=0, how='any')

In [7]:
# Drop the date column and the target ('value') from the predictor list.
# Rebuilding the list (instead of calling .remove) keeps this cell safe to
# re-run: list.remove raises ValueError once the names are already gone.
original_features = [
    name for name in original_features
    if name not in ('Record_Date', 'value')
]

In [8]:
# Predictors and target, followed by a seeded 80/20 train/test split.
X, y = df_original[original_features], df_original['value']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the trailing expression displays the same fitted object.
model = LinearRegression().fit(X_train, y_train)
model

In [10]:
# Score the model on the held-out test split.
y_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print("R² Score:", test_r2)
print("MSE:", test_mse)

R² Score: 0.04038118716572192
MSE: 0.23342791419259068


In [11]:
# Tabulate each feature's fitted coefficient, largest first.
coeff_df = pd.DataFrame(
    {'Feature': original_features, 'Coefficient': model.coef_}
).sort_values(by='Coefficient', ascending=False)
coeff_df

Unnamed: 0,Feature,Coefficient
5,Mindfulness Walk,0.218634
2,Eat Lunch,0.164181
0,Eat Dinner,0.13731
3,Gratitude Jar,0.120716
1,Eat Breakfast,0.117367
8,Yoga,0.008428
6,Read For Fun,-0.051393
4,Meditation,-0.072471
7,Take A Shower,-0.136292


In [12]:
# Refit with statsmodels (on all rows of X, y) to inspect coefficient significance.
# The added constant is the intercept, i.e. the baseline mood when none of
# the activities were done that day.
# NOTE(review): this rebinds `model`, replacing the sklearn estimator fitted earlier.
X_with_const = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_with_const).fit()
model.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.077
Model:,OLS,Adj. R-squared:,0.049
Method:,Least Squares,F-statistic:,2.744
Date:,"Tue, 24 Jun 2025",Prob (F-statistic):,0.0043
Time:,15:33:11,Log-Likelihood:,-222.67
No. Observations:,307,AIC:,465.3
Df Residuals:,297,BIC:,502.6
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.1911,0.256,12.483,0.000,2.688,3.694
Eat Dinner,0.1251,0.109,1.143,0.254,-0.090,0.340
Eat Breakfast,0.1213,0.233,0.520,0.603,-0.338,0.580
Eat Lunch,0.1499,0.069,2.188,0.029,0.015,0.285
Gratitude Jar,0.1502,0.159,0.946,0.345,-0.162,0.463
Meditation,-0.0070,0.083,-0.085,0.932,-0.169,0.155
Mindfulness Walk,0.2348,0.067,3.507,0.001,0.103,0.367
Read For Fun,-0.0348,0.076,-0.461,0.645,-0.184,0.114
Take A Shower,-0.1291,0.105,-1.233,0.219,-0.335,0.077

0,1,2,3
Omnibus:,50.677,Durbin-Watson:,1.728
Prob(Omnibus):,0.0,Jarque-Bera (JB):,80.44
Skew:,-0.974,Prob(JB):,3.41e-18
Kurtosis:,4.579,Cond. No.,27.1


### Linear Regression Analysis: 1-day lagged features

In [13]:
# Now look at the 1-day lagged features, together with the target column.
# Guard the append so re-running this cell cannot add 'value' twice, which
# would otherwise duplicate the 'value' column in df_lag.
if 'value' not in lagged_1day_features:
    lagged_1day_features.append('value')
df_lag = df[lagged_1day_features]
df_lag.head()

Unnamed: 0,Eat Dinner_lag1,Eat Breakfast_lag1,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1,value
0,,,,,,,,,,2.833333
1,,,,1.0,,,,,,3.125
2,,,,0.0,,,,,,3.375
3,,,,0.0,,0.0,,,,3.571429
4,,,,0.0,,0.0,,,,3.2


In [14]:
# Keep only the rows that have a value in every selected column.
df_lag = df_lag.dropna(axis=0, how='any')

In [15]:
# Take the target back out of the predictor list.
# Filtering (instead of .remove) makes the cell re-runnable: list.remove raises
# ValueError when 'value' is no longer present, and drops only one occurrence.
lagged_1day_features = [name for name in lagged_1day_features if name != 'value']

In [16]:
# Lagged predictors and target, followed by a seeded 80/20 train/test split.
X, y = df_lag[lagged_1day_features], df_lag['value']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Fit a linear regression on the lagged training split. fit() returns the
# estimator itself, so the trailing expression displays the same fitted object.
model_lag = LinearRegression().fit(X_train, y_train)
model_lag

In [18]:
# Score the lagged-feature model on the held-out test split.
y_pred = model_lag.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print("R² Score:", test_r2)
print("MSE:", test_mse)

R² Score: -0.08930716978674647
MSE: 0.3184611362506882


In [19]:
# Tabulate each lagged feature's fitted coefficient, largest first.
coeff_df = pd.DataFrame(
    {'Feature': lagged_1day_features, 'Coefficient': model_lag.coef_}
).sort_values(by='Coefficient', ascending=False)
coeff_df

Unnamed: 0,Feature,Coefficient
1,Eat Breakfast_lag1,0.630881
5,Mindfulness Walk_lag1,0.201793
0,Eat Dinner_lag1,0.087914
4,Meditation_lag1,0.087044
7,Take A Shower_lag1,0.077051
3,Gratitude Jar_lag1,0.063693
2,Eat Lunch_lag1,-0.02882
6,Read For Fun_lag1,-0.075417
8,Yoga_lag1,-0.13104


In [20]:
# Refit with statsmodels (on all rows of X, y) to inspect coefficient significance.
# The added constant is the intercept, i.e. the baseline mood when none of
# the activities were done on the previous day (the predictors are 1-day lags).
# NOTE(review): this rebinds `model_lag`, replacing the sklearn estimator fitted earlier.
X_with_const = sm.add_constant(X)
model_lag = sm.OLS(endog=y, exog=X_with_const).fit()
model_lag.summary()

0,1,2,3
Dep. Variable:,value,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,1.898
Date:,"Tue, 24 Jun 2025",Prob (F-statistic):,0.0519
Time:,15:33:12,Log-Likelihood:,-225.75
No. Observations:,306,AIC:,471.5
Df Residuals:,296,BIC:,508.7
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8823,0.259,11.129,0.000,2.373,3.392
Eat Dinner_lag1,0.0715,0.111,0.644,0.520,-0.147,0.290
Eat Breakfast_lag1,0.6056,0.236,2.565,0.011,0.141,1.070
Eat Lunch_lag1,0.0098,0.069,0.141,0.888,-0.127,0.146
Gratitude Jar_lag1,-0.0760,0.161,-0.473,0.637,-0.392,0.240
Meditation_lag1,0.0450,0.084,0.538,0.591,-0.119,0.209
Mindfulness Walk_lag1,0.1724,0.068,2.541,0.012,0.039,0.306
Read For Fun_lag1,-0.0901,0.077,-1.176,0.240,-0.241,0.061
Take A Shower_lag1,0.0161,0.106,0.151,0.880,-0.193,0.225

0,1,2,3
Omnibus:,69.061,Durbin-Watson:,1.77
Prob(Omnibus):,0.0,Jarque-Bera (JB):,129.011
Skew:,-1.204,Prob(JB):,9.67e-29
Kurtosis:,5.079,Cond. No.,27.1
