In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

import sys
import os.path as osp

# Resolve the sibling src/ directory to an absolute path and register it on
# sys.path (only once) so the project-local modules imported below can be found.
SRC_SUBDIR = osp.abspath('../src/')
if SRC_SUBDIR not in sys.path:
    sys.path.insert(1, SRC_SUBDIR)
    
from load_data import LoadData
from clean_data import CleanData

from project_config import CONFIG

# Config variables
# Raw-data location taken from the project config.
# NOTE(review): not referenced by any active code in the visible cells — its
# only use (in the loader cell) is commented out.
RAW_DATA_PATH = CONFIG["raw_data_path"]


In [2]:
# NOTE(review): the loader is pointed at the hard-coded '/raw/' while
# RAW_DATA_PATH (read from CONFIG in the first cell) is left commented out —
# confirm which location is intended and remove the other.
data_location = '/raw/'
#data_location = RAW_DATA_PATH
# Date stamp handed to LoadData; presumably selects which export to read — TODO confirm.
export_date = '2025-03-01'
load_data = LoadData(data_location, export_date)

### Response variables

In [3]:
# Load the raw mood log and preview it; the preview shows dt, mood_type, value
# and updated_time columns.
mood_data = load_data.load_mood_data()
mood_data.head()

Unnamed: 0,dt,mood_type,value,updated_time
0,"Sat, 6 May 2023 14:00:58",feeling,4,"Sat, 6 May 2023 14:00:58"
1,"Sun, 11 Feb 2024 04:06:37",satisfaction,4,"Sun, 11 Feb 2024 04:06:37"
2,"Wed, 2 Nov 2022 09:04:11",motivation,3,"Wed, 2 Nov 2022 09:04:11"
3,"Wed, 12 Jul 2023 19:02:37",feeling,3,"Wed, 12 Jul 2023 19:02:37"
4,"Sat, 1 Mar 2025 13:39:47",motivation,2,"Sat, 1 Mar 2025 13:39:47"


In [4]:
# Clean the raw mood log. In the preview the result appears to hold one row per
# Record_Date with average_feeling, satisfaction and motivation columns.
clean_data = CleanData()
mood_df = clean_data.clean_mood_data(mood_data)
mood_df.head()

Unnamed: 0,Record_Date,average_feeling,satisfaction,motivation
0,2022-10-26,3.0,3.0,3.0
1,2022-10-27,2.833333,2.0,4.0
2,2022-10-28,3.125,4.0,2.0
3,2022-10-29,3.375,3.0,3.0
4,2022-10-30,3.571429,4.0,4.0


### Explanatory variables: self-care activities

In [5]:
# Load and clean the raw activity records. The preview shows Record_Date, text
# (the activity name) and bullet_status columns; rows with bullet_status == 1.0
# are the ones kept as "completed" by the filtering cell further down.
exvar_data = load_data.load_activity_data()
exvar_df = clean_data.clean_activity_data(exvar_data)

exvar_df.head()

Unnamed: 0,Record_Date,text,bullet_status
0,2024-04-28,Meditation,0.0
1,2024-04-11,Think about a positive moment with yoga,0.0
2,2024-05-17,Yoga,0.0
3,2024-01-28,Think about a positive moment with yoga,0.0
4,2023-09-05,Read for fun,1.0


In [6]:
# Inspect column dtypes: Record_Date comes back as object, not a datetime dtype.
exvar_df.dtypes

Record_Date       object
text              object
bullet_status    float64
dtype: object

In [7]:
# TODO: Record_Date has dtype object instead of a datetime dtype; the conversion should happen inside CleanData.clean_activity_data.

In [8]:
# Earliest Record_Date seen for each activity text, used as a stand-in for the
# date on which that activity goal was created.
creation_dates_df = (
    exvar_df
    .groupby('text')['Record_Date']
    .min()
    .reset_index()
)

In [9]:
# Keep only the rows flagged as completed (bullet_status == 1.0); the flag is
# then redundant, so drop it and call the activity text column 'variable'.
is_completed = exvar_df['bullet_status'] == 1.0
completed_variables_df = (
    exvar_df
    .loc[is_completed]
    .drop(columns=['bullet_status'])
    .rename(columns={'text': 'variable'})
)
completed_variables_df.head()

Unnamed: 0,Record_Date,variable
4,2023-09-05,Read for fun
5,2024-07-16,Yoga
6,2024-02-11,Start a load of laundry
10,2024-12-25,Just be
12,2024-05-15,Start a load of laundry


In [10]:
# Activities to keep as explanatory variables. Entries are matched against the
# activity text with isin() in a later cell, so spelling and capitalisation must
# match the data exactly (note that 'Eat Dinner' has a capital D).
ex_variables = [
    'Eat breakfast',
    'Eat lunch',
    'Eat Dinner',
    'Meditation',
    'Mindfulness Walk',
    'Read for fun',
    'Take a shower',
    'Yoga',
    'Gratitude Jar',
]

In [11]:
# Map each tracked activity to its display name, with every word capitalised
# (e.g. 'Read for fun' -> 'Read For Fun'); used later to rename the dummy columns.
ex_variable_names = {
    activity: string.capwords(activity)
    for activity in ex_variables
}

In [12]:
# Restrict the completed records to the tracked activities.
is_tracked = completed_variables_df['variable'].isin(ex_variables)
completed_var_limit_df = completed_variables_df[is_tracked]

# One-hot encode the activity name, re-attach the remaining columns by row
# index, drop the now-redundant 'variable' column and apply the display names.
activity_dummies = pd.get_dummies(completed_var_limit_df['variable'])
completed_encoded_exvar_df = (
    activity_dummies
    .join(completed_var_limit_df)
    .drop(columns='variable')
    .rename(columns=ex_variable_names)
)
completed_encoded_exvar_df.head()

Unnamed: 0,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga,Record_Date
4,False,False,False,False,False,False,True,False,False,2023-09-05
5,False,False,False,False,False,False,False,False,True,2024-07-16
13,False,False,True,False,False,False,False,False,False,2024-10-17
15,False,True,False,False,False,False,False,False,False,2023-08-19
16,True,False,False,False,False,False,False,False,False,2024-11-05


In [13]:
# Collapse to one row per Record_Date: an activity's flag is True when at least
# one completed record of that activity exists for the date.
exvar_encoded_df = completed_encoded_exvar_df.groupby('Record_Date').any()
exvar_encoded_df.head()

Unnamed: 0_level_0,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
Record_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-27,False,False,False,True,False,False,False,False,False
2022-11-01,False,False,False,False,True,False,False,False,False
2022-11-02,False,False,False,False,True,False,False,False,False
2022-11-03,False,False,False,False,True,False,False,False,False
2022-11-04,False,False,False,False,True,False,False,False,False


In [14]:
# Convert the True/False activity flags to 1/0 integers.
# The whole frame is cast at once: after the groupby every remaining column is
# an activity flag, so this matches the per-column cast while no longer raising
# KeyError when an activity listed in ex_variables has no completed record
# (get_dummies creates no column for it in that case).
exvar_encoded_df = exvar_encoded_df.astype(int)
exvar_encoded_df.head()

Unnamed: 0_level_0,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
Record_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-27,0,0,0,1,0,0,0,0,0
2022-11-01,0,0,0,0,1,0,0,0,0
2022-11-02,0,0,0,0,1,0,0,0,0
2022-11-03,0,0,0,0,1,0,0,0,0
2022-11-04,0,0,0,0,1,0,0,0,0


In [15]:
# Expand the frame to one row per calendar day between the first and last
# recorded dates. Days with no completed activity become all-NaN rows, which
# also turns the integer flags into floats.
# NOTE(review): confirm NaN (rather than 0) is what downstream analysis expects.
first_day = exvar_encoded_df.index.min()
last_day = exvar_encoded_df.index.max()
all_days = pd.date_range(start=first_day, end=last_day, freq='D')
exvar_encoded_df = exvar_encoded_df.reindex(all_days)
exvar_encoded_df.head()

Unnamed: 0,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
2022-10-27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022-10-28,,,,,,,,,
2022-10-29,,,,,,,,,
2022-10-30,,,,,,,,,
2022-10-31,,,,,,,,,


In [16]:
# Name the daily index 'Record_Date' and move it back into an ordinary column.
exvar_encoded_df = (
    exvar_encoded_df
    .rename_axis('Record_Date')
    .reset_index()
)


In [17]:
# Preview more rows to see the NaN-filled gap days next to days that have records.
exvar_encoded_df.head(10)

Unnamed: 0,Record_Date,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2022-10-28,,,,,,,,,
2,2022-10-29,,,,,,,,,
3,2022-10-30,,,,,,,,,
4,2022-10-31,,,,,,,,,
5,2022-11-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,2022-11-02,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,2022-11-03,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,2022-11-04,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,2022-11-05,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Write cleaned DataFrames to CSV

In [18]:
# Write the cleaned tables to the processed-data directory. Paths are built
# with os.path so the output directory is named in exactly one place.
processed_dir = osp.join("..", "data", "processed")
processed_outputs = {
    "cleaned_activities.csv": exvar_encoded_df,
    "creation_dates.csv": creation_dates_df,
    "cleaned_moods.csv": mood_df,
}
for filename, frame in processed_outputs.items():
    # index=False keeps the row index out of the written files.
    frame.to_csv(osp.join(processed_dir, filename), index=False)