In [1]:
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn import linear_model, model_selection, metrics

In [2]:
data_location = '../Sample Data/'
mood_data_location = data_location+'FinchExport_2025-03-01/Mood.json'
exvar_data_location = data_location+'FinchExport_2025-03-01/Bullet.json'

### response

In [3]:
mood_series = pd.read_json(mood_data_location, typ='series')
mood_data = pd.json_normalize(mood_series.data)
mood_data['value'] = pd.to_numeric(mood_data.value)

In [4]:
# note: a pandas update didn't like the non-zero-padded 'day' of the Finch data
# so I'm using datetime.strptime instead of pd.to_datetime for now
mood_data['Record_Date'] = mood_data.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S")).dt.date
mood_data = mood_data.drop(['dt','updated_time'],axis=1)

In [5]:
feeling_data = mood_data.query("mood_type == 'feeling'")
feeling_data = feeling_data.drop('mood_type',axis=1)
feeling_data.head()

Unnamed: 0,value,Record_Date
0,4,2023-05-06
3,3,2023-07-12
6,4,2024-12-10
8,3,2023-10-29
10,3,2022-10-27


In [6]:
avg_feelings_df = feeling_data[['value','Record_Date']].groupby('Record_Date').mean().reset_index()
avg_feelings_df.head()

Unnamed: 0,Record_Date,value
0,2022-10-26,3.0
1,2022-10-27,3.0
2,2022-10-28,2.875
3,2022-10-29,3.5
4,2022-10-30,3.5


### explanatory

In [7]:
exvar_series = pd.read_json(exvar_data_location, typ='series')
exvar_data = pd.json_normalize(exvar_series.data)

In [8]:
exvar_df = exvar_data[['dt','bullet_type','text','bullet_status']]
exvar_df = exvar_df.query("bullet_type == 1")
exvar_df = exvar_df.drop('bullet_type',axis=1)

In [9]:
# convert to datetime
exvar_df['Record_Date'] = exvar_df.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S")).dt.date
exvar_df = exvar_df.drop(['dt'],axis=1)

In [10]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
0,#Meditation Timer,0.0,2024-04-28
1,Think about a positive moment with #yoga,0.0,2024-04-11
2,#Yoga,0.0,2024-05-17
3,Think about a positive moment with #yoga,0.0,2024-01-28
4,#Read for #fun,1.0,2023-09-05


In [11]:
# replace {old_name: new name}
duplicates_dict = {
    "Laundry":"Start a load of laundry",
    "Create a Finch_Backup_File":"Create a Finch Backup File",
    "Walk around the neighborhood":"Mindfulness Walk"
}

exvar_df['text'] = exvar_df['text'].str.replace("#","")
exvar_df['text'] = exvar_df['text'].replace(duplicates_dict)

In [12]:
#March 1,2025 is incomplete date, the date I downloaded the data
cut_off_date = datetime(2025,3,1).date()
exvar_df = exvar_df[exvar_df['Record_Date'] < cut_off_date]

In [13]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
0,Meditation Timer,0.0,2024-04-28
1,Think about a positive moment with yoga,0.0,2024-04-11
2,Yoga,0.0,2024-05-17
3,Think about a positive moment with yoga,0.0,2024-01-28
4,Read for fun,1.0,2023-09-05


### narrow explanatory variables to Yoga

In [14]:
yoga_df = exvar_df[exvar_df['text'] == 'Yoga']
yoga_df = yoga_df.drop('text',axis=1)
yoga_df.head()

Unnamed: 0,bullet_status,Record_Date
2,0.0,2024-05-17
5,1.0,2024-07-16
26,0.0,2024-06-24
31,0.0,2024-03-17
113,0.0,2025-01-26


### cut response and explan data to same dates

In [15]:
x_min_date = yoga_df['Record_Date'].min()
x_max_date = yoga_df['Record_Date'].max()

print("X min date: " + str(x_min_date))
print("X max date: " + str(x_max_date))

X min date: 2023-08-07
X max date: 2025-02-28


In [16]:
y_min_date = avg_feelings_df['Record_Date'].min()
y_max_date = avg_feelings_df['Record_Date'].max()

print("Y min date: " + str(y_min_date))
print("Y max date: " + str(y_max_date))

Y min date: 2022-10-26
Y max date: 2025-02-28


In [17]:
min_date = max(x_min_date,y_min_date)
max_date = min(x_max_date,y_max_date)

print("min date: " + str(min_date))
print("max date: " + str(max_date))

min date: 2023-08-07
max date: 2025-02-28


In [18]:
print("avg_feelings_df length before is " + str(len(avg_feelings_df)))
avg_feelings_df = avg_feelings_df[(avg_feelings_df['Record_Date'] > min_date) & (avg_feelings_df['Record_Date'] < max_date)]
print("avg_feelings_df length after is " + str(len(avg_feelings_df)))

avg_feelings_df length before is 832
avg_feelings_df length after is 557


In [19]:
print("yoga_df length before is " + str(len(yoga_df)))
yoga_df = yoga_df[(yoga_df['Record_Date'] > min_date) & (yoga_df['Record_Date'] < max_date)]
print("yoga_df length after is " + str(len(yoga_df)))

yoga_df length before is 552
yoga_df length after is 550


In [23]:
y_missing_dates = pd.date_range(start=min_date, end=max_date).difference(avg_feelings_df['Record_Date'])
x_missing_dates = pd.date_range(start=min_date, end=max_date).difference(yoga_df['Record_Date'])
print("missing yoga days: " + str(len(x_missing_dates)))
print("missing feelings days: " + str(len(y_missing_dates)))

missing_dates = set(())
for i in y_missing_dates:
    missing_dates.add(i)
for i in x_missing_dates:
    missing_dates.add(i)
    
print(len(missing_dates))

missing yoga days: 36
missing feelings days: 15
36


In [24]:
print("feelings records before reducing dates: " + str(len(avg_feelings_df)))
avg_feelings_df = avg_feelings_df[~avg_feelings_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("feelings records after reducing dates: " + str(len(avg_feelings_df)))

feelings records before reducing dates: 557
feelings records after reducing dates: 557


In [25]:
print("yoga records before reducing dates: " + str(len(yoga_df)))
yoga_df = yoga_df[~yoga_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("yoga records after reducing dates: " + str(len(yoga_df)))

yoga records before reducing dates: 537
yoga records after reducing dates: 537
