In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm

import sys
import os.path as osp

SRC_SUBDIR = '../src/'
SRC_SUBDIR = osp.abspath(SRC_SUBDIR)
if SRC_SUBDIR not in sys.path:
    sys.path.insert(1, SRC_SUBDIR)
    

In [2]:
activities_df = pd.read_csv("../data/processed/cleaned_activities.csv")
ex_variable_names = activities_df.columns.to_list()
ex_variable_names.remove('Record_Date')
activities_df.head()

Unnamed: 0,Record_Date,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2022-10-28,,,,,,,,,
2,2022-10-29,,,,,,,,,
3,2022-10-30,,,,,,,,,
4,2022-10-31,,,,,,,,,


In [3]:
mood_df = pd.read_csv("../data/processed/cleaned_moods.csv")
mood_df.head()

Unnamed: 0,Record_Date,average_feeling,satisfaction,motivation
0,2022-10-26,3.0,3.0,3.0
1,2022-10-27,2.833333,2.0,4.0
2,2022-10-28,3.125,4.0,2.0
3,2022-10-29,3.375,3.0,3.0
4,2022-10-30,3.571429,4.0,4.0


In [4]:
avg_feelings_df = mood_df[['Record_Date','average_feeling']]
avg_feelings_df = avg_feelings_df.rename(columns={'average_feeling':'value'})

### Combine datasets

In [5]:
df = pd.merge(avg_feelings_df, activities_df, on='Record_Date', how='inner')
df['Record_Date'] = pd.to_datetime(df['Record_Date'])
df.head()

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,2.833333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2022-10-28,3.125,,,,,,,,,
2,2022-10-29,3.375,,,,,,,,,
3,2022-10-30,3.571429,,,,,,,,,
4,2022-10-31,3.2,,,,,,,,,


In [6]:
fill_values = {}
for col in ex_variable_names:
    fill_values[col] = 0
    
df = df.fillna(value=fill_values)
df = df.dropna(subset=['value'])
# assuming that if I'm missing records then I just didn't do that activity

In [7]:
df.head()

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,2.833333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2022-10-28,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-10-29,3.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-10-30,3.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-10-31,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creation Dates

In [8]:
# only include activities after the goal was created, ie after I started tracking the activity
# so 0's and 1's are meaningful, NaN implies the tracking hadn't started for that activity
creation_dates = pd.read_csv("../data/processed/creation_dates.csv")
creation_dates = creation_dates.set_index('text')

In [9]:
activities = df.columns.to_list()
activities.remove('Record_Date')
activities.remove('value')

In [10]:
for col in activities:
    start_date = pd.to_datetime(creation_dates.loc[col].Record_Date)
    if start_date:
        df.loc[df.Record_Date < start_date, col] = pd.NA

In [11]:
df.head(10)

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga
0,2022-10-27,2.833333,,,,1.0,,,,,
1,2022-10-28,3.125,,,,0.0,,,,,
2,2022-10-29,3.375,,,,0.0,,0.0,,,
3,2022-10-30,3.571429,,,,0.0,,0.0,,,
4,2022-10-31,3.2,,,,0.0,,0.0,,,
5,2022-11-01,3.428571,,,,0.0,1.0,0.0,,,
6,2022-11-02,2.75,,,,0.0,1.0,0.0,,,
7,2022-11-03,3.0,,,,0.0,1.0,0.0,,,
8,2022-11-04,3.333333,,,,0.0,1.0,0.0,,,
9,2022-11-05,2.8,,,,0.0,1.0,0.0,,,


### 1 day lag

In [12]:
lagged_1day_features = []
for col in ex_variable_names:
    col_name = f'{col}_lag1'
    df[col_name] = df[col].shift(1)
    lagged_1day_features.append(col_name)

In [13]:
#df = pd.merge(avg_feelings_df, activities_df, on='Record_Date', how='inner')

In [14]:
df.head()

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,Yoga,Eat Dinner_lag1,Eat Breakfast_lag1,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1
0,2022-10-27,2.833333,,,,1.0,,,,,,,,,,,,,,
1,2022-10-28,3.125,,,,0.0,,,,,,,,,1.0,,,,,
2,2022-10-29,3.375,,,,0.0,,0.0,,,,,,,0.0,,,,,
3,2022-10-30,3.571429,,,,0.0,,0.0,,,,,,,0.0,,0.0,,,
4,2022-10-31,3.2,,,,0.0,,0.0,,,,,,,0.0,,0.0,,,


### Mindfulness

In [15]:
#TODO: mindfulness NaN before any mindful activity had started being tracked
mindful_activities = ['Meditation', 'Mindfulness Walk', 'Yoga']
df['Mindfulness'] = df[mindful_activities].any(axis=1).astype(int)
df.head()

Unnamed: 0,Record_Date,value,Eat Dinner,Eat Breakfast,Eat Lunch,Gratitude Jar,Meditation,Mindfulness Walk,Read For Fun,Take A Shower,...,Eat Dinner_lag1,Eat Breakfast_lag1,Eat Lunch_lag1,Gratitude Jar_lag1,Meditation_lag1,Mindfulness Walk_lag1,Read For Fun_lag1,Take A Shower_lag1,Yoga_lag1,Mindfulness
0,2022-10-27,2.833333,,,,1.0,,,,,...,,,,,,,,,,0
1,2022-10-28,3.125,,,,0.0,,,,,...,,,,1.0,,,,,,0
2,2022-10-29,3.375,,,,0.0,,0.0,,,...,,,,0.0,,,,,,0
3,2022-10-30,3.571429,,,,0.0,,0.0,,,...,,,,0.0,,0.0,,,,0
4,2022-10-31,3.2,,,,0.0,,0.0,,,...,,,,0.0,,0.0,,,,0


In [16]:
#TODO: add variabilty metrics, eg std, range

### write cleaned dataframes to csv

In [17]:
# TODO: os path to data/processed
df.to_csv("../data/processed/features_dataset.csv",index=False)