In [1]:
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn import linear_model, model_selection, metrics

In [4]:
data_location = '../Sample Data/'
mood_data_location = data_location+'FinchExport_2025-03-01/Mood.json'
exvar_data_location = data_location+'FinchExport_2025-03-01/Bullet.json'

### Response variables

In [5]:
mood_series = pd.read_json(mood_data_location, typ='series')
mood_data = pd.json_normalize(mood_series.data)
mood_data['value'] = pd.to_numeric(mood_data.value)

In [6]:
mood_data.head()

Unnamed: 0,dt,mood_type,value,updated_time
0,"Sat, 6 May 2023 14:00:58",feeling,4,"Sat, 6 May 2023 14:00:58"
1,"Sun, 11 Feb 2024 04:06:37",satisfaction,4,"Sun, 11 Feb 2024 04:06:37"
2,"Wed, 2 Nov 2022 09:04:11",motivation,3,"Wed, 2 Nov 2022 09:04:11"
3,"Wed, 12 Jul 2023 19:02:37",feeling,3,"Wed, 12 Jul 2023 19:02:37"
4,"Sat, 1 Mar 2025 13:39:47",motivation,2,"Sat, 1 Mar 2025 13:39:47"


In [7]:
# note: a pandas update didn't like the non-zero-padded 'day' of the Finch data
# so I'm using datetime.strptime instead of pd.to_datetime for now
mood_data['Record_Datetime'] = mood_data.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S"))
#mood_data['Updated_Datetime'] = mood_data.updated_time.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S"))
mood_data = mood_data.drop(['dt','updated_time'],axis=1)

In [8]:
mood_data.head()

Unnamed: 0,mood_type,value,Record_Datetime
0,feeling,4,2023-05-06 14:00:58
1,satisfaction,4,2024-02-11 04:06:37
2,motivation,3,2022-11-02 09:04:11
3,feeling,3,2023-07-12 19:02:37
4,motivation,2,2025-03-01 13:39:47


### Satisfaction

In [9]:
satisfaction_data = mood_data.query("mood_type == 'satisfaction'")
satisfaction_data.head()

Unnamed: 0,mood_type,value,Record_Datetime
1,satisfaction,4,2024-02-11 04:06:37
7,satisfaction,1,2022-11-02 22:06:19
11,satisfaction,3,2024-11-03 01:17:46
14,satisfaction,4,2022-12-14 03:26:55
16,satisfaction,4,2023-05-20 00:25:57


In [10]:
# subtract 7 hrs from datetime
# 7 is good because at 7am the next day starts ( according to my Finch app settings ),
# so the satisfaction score of the previous day must be submitted by then

In [11]:
seven_hrs = timedelta(hours=7)

In [12]:
satisfaction_data['Adjusted_Datetime'] = satisfaction_data['Record_Datetime'] - seven_hrs
satisfaction_data['Record_Date'] = satisfaction_data['Adjusted_Datetime'].dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  satisfaction_data['Adjusted_Datetime'] = satisfaction_data['Record_Datetime'] - seven_hrs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  satisfaction_data['Record_Date'] = satisfaction_data['Adjusted_Datetime'].dt.date


In [13]:
satisfaction_data.head()

Unnamed: 0,mood_type,value,Record_Datetime,Adjusted_Datetime,Record_Date
1,satisfaction,4,2024-02-11 04:06:37,2024-02-10 21:06:37,2024-02-10
7,satisfaction,1,2022-11-02 22:06:19,2022-11-02 15:06:19,2022-11-02
11,satisfaction,3,2024-11-03 01:17:46,2024-11-02 18:17:46,2024-11-02
14,satisfaction,4,2022-12-14 03:26:55,2022-12-13 20:26:55,2022-12-13
16,satisfaction,4,2023-05-20 00:25:57,2023-05-19 17:25:57,2023-05-19


In [14]:
# check for duplicate dates
satisfaction_data.groupby('Record_Date').count().sort_values('value', ascending=False).head()

Unnamed: 0_level_0,mood_type,value,Record_Datetime,Adjusted_Datetime
Record_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-12-02,2,2,2,2
2024-12-29,2,2,2,2
2022-10-26,1,1,1,1
2024-05-02,1,1,1,1
2024-05-03,1,1,1,1


In [15]:
#hmm
bad_day1 = date(2022, 12, 2)
satisfaction_data[satisfaction_data['Record_Date'] == bad_day1]

Unnamed: 0,mood_type,value,Record_Datetime,Adjusted_Datetime,Record_Date
23,satisfaction,2,2022-12-03 01:05:45,2022-12-02 18:05:45,2022-12-02
2636,satisfaction,4,2022-12-02 10:40:24,2022-12-02 03:40:24,2022-12-02


In [16]:
bad_day2 = date(2024,12,29)
satisfaction_data[satisfaction_data['Record_Date'] == bad_day2]

Unnamed: 0,mood_type,value,Record_Datetime,Adjusted_Datetime,Record_Date
1501,satisfaction,3,2024-12-30 01:33:36,2024-12-29 18:33:36,2024-12-29
2622,satisfaction,3,2024-12-29 09:54:59,2024-12-29 02:54:59,2024-12-29


In [17]:
# ooh I think these 'bad days' are days that I was in BC, so + 3hrs to time for days I was in BC ?

In [18]:
# conversely, I can remove these days as outliers, there are only a few out of 3000~ data records
# ooor manually adjust the record date 

# I believe the only reason these 2 days are off, despite the 3 hr shift and the 7 hr shift, 
#     is because I went and added the satisfaction score much later by going and changing it in 'history'

In [19]:
# doing this manually, will need another solution if I try to automate this process at some point
bad_indexes = [2636,2622]
one_day = timedelta(days=1)
for i in bad_indexes:
    satisfaction_data.at[i,'Record_Date'] = satisfaction_data.at[i,'Record_Date'] - one_day

In [20]:
# check for duplicate dates again
satisfaction_data.groupby('Record_Date').count().sort_values('value', ascending=False).head()

Unnamed: 0_level_0,mood_type,value,Record_Datetime,Adjusted_Datetime
Record_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-26,1,1,1,1
2024-05-10,1,1,1,1
2024-05-02,1,1,1,1
2024-05-03,1,1,1,1
2024-05-04,1,1,1,1


In [21]:
satisfaction_df = satisfaction_data.drop(['mood_type','Record_Datetime','Adjusted_Datetime'],axis=1)
satisfaction_df.head()

Unnamed: 0,value,Record_Date
1,4,2024-02-10
7,1,2022-11-02
11,3,2024-11-02
14,4,2022-12-13
16,4,2023-05-19


### Feelings

In [22]:
feeling_data = mood_data.query("mood_type == 'feeling'")
feeling_data = feeling_data.drop('mood_type',axis=1)
feeling_data['Record_Date'] = feeling_data['Record_Datetime'].dt.date
feeling_data.head()

Unnamed: 0,value,Record_Datetime,Record_Date
0,4,2023-05-06 14:00:58,2023-05-06
3,3,2023-07-12 19:02:37,2023-07-12
6,4,2024-12-10 16:48:10,2024-12-10
8,3,2023-10-29 20:50:48,2023-10-29
10,3,2022-10-27 00:48:53,2022-10-27


In [23]:
avg_feelings_df = feeling_data[['value','Record_Date']].groupby('Record_Date').mean()
avg_feelings_df.head()

Unnamed: 0_level_0,value
Record_Date,Unnamed: 1_level_1
2022-10-26,3.0
2022-10-27,3.0
2022-10-28,2.875
2022-10-29,3.5
2022-10-30,3.5


### Explanatory variables

In [24]:
exvar_series = pd.read_json(exvar_data_location, typ='series')
exvar_data = pd.json_normalize(exvar_series.data)
exvar_data.head()

Unnamed: 0,dt,bullet_type,text,score,is_processed,bullet_status,preferred_emoji_char,completed_time,creation_time,position,...,linked_action_config.bullet.targeting_power_up_uuids,linked_action_config.bullet.associated_power_up_uuids,linked_action_config.bullet.linked_action_config,preferred_area_names,action_name,story_id,is_persistent,preferred_sentiment,gift_ids,linked_community_habit_id
0,"Sun, 28 Apr 2024 01:00:00",1,#Meditation Timer,0.0,True,0.0,🧘🏾,,"Sun, 28 Apr 2024 08:50:29",7.0,...,,,,,,,,,,
1,"Thu, 11 Apr 2024 01:00:00",1,Think about a positive moment with #yoga,6.666667,True,0.0,,,"Thu, 11 Apr 2024 09:01:23",0.0,...,[],[],,,,,,,,
2,"Fri, 17 May 2024 01:00:00",1,#Yoga,0.0,True,0.0,,,"Fri, 17 May 2024 10:26:16",3.0,...,,,,,,,,,,
3,"Sun, 28 Jan 2024 01:00:00",1,Think about a positive moment with #yoga,6.666667,True,0.0,,,"Sun, 28 Jan 2024 07:45:41",0.0,...,[],[],,,,,,,,
4,"Tue, 5 Sep 2023 01:00:00",1,#Read for #fun,6.666667,True,1.0,,"Tue, 5 Sep 2023 14:56:34","Tue, 5 Sep 2023 10:16:36",8.0,...,,,,[],,,,,,


In [25]:
exvar_df = exvar_data[['dt','bullet_type','text','bullet_status']]
exvar_df = exvar_df.query("bullet_type == 1")
exvar_df = exvar_df.drop('bullet_type',axis=1)

In [26]:
exvar_df.head()

Unnamed: 0,dt,text,bullet_status
0,"Sun, 28 Apr 2024 01:00:00",#Meditation Timer,0.0
1,"Thu, 11 Apr 2024 01:00:00",Think about a positive moment with #yoga,0.0
2,"Fri, 17 May 2024 01:00:00",#Yoga,0.0
3,"Sun, 28 Jan 2024 01:00:00",Think about a positive moment with #yoga,0.0
4,"Tue, 5 Sep 2023 01:00:00",#Read for #fun,1.0


In [27]:
# convert to datetime
exvar_df['Record_Datetime'] = exvar_df.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S"))
exvar_df['Record_Date']= exvar_df['Record_Datetime'].dt.date
exvar_df = exvar_df.drop(['dt'],axis=1)

In [28]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Datetime,Record_Date
0,#Meditation Timer,0.0,2024-04-28 01:00:00,2024-04-28
1,Think about a positive moment with #yoga,0.0,2024-04-11 01:00:00,2024-04-11
2,#Yoga,0.0,2024-05-17 01:00:00,2024-05-17
3,Think about a positive moment with #yoga,0.0,2024-01-28 01:00:00,2024-01-28
4,#Read for #fun,1.0,2023-09-05 01:00:00,2023-09-05


In [29]:
exvar_df['text'] = exvar_df['text'].str.replace("#","")
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Datetime,Record_Date
0,Meditation Timer,0.0,2024-04-28 01:00:00,2024-04-28
1,Think about a positive moment with yoga,0.0,2024-04-11 01:00:00,2024-04-11
2,Yoga,0.0,2024-05-17 01:00:00,2024-05-17
3,Think about a positive moment with yoga,0.0,2024-01-28 01:00:00,2024-01-28
4,Read for fun,1.0,2023-09-05 01:00:00,2023-09-05


In [30]:
# replace {old_name: new name}
duplicates_dict = {
    "Laundry":"Start a load of laundry",
    "Create a Finch_Backup_File":"Create a Finch Backup File",
    "Walk around the neighborhood":"Mindfulness Walk"
}

In [31]:
exvar_df['text'] = exvar_df['text'].replace(duplicates_dict)

In [32]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Datetime,Record_Date
0,Meditation Timer,0.0,2024-04-28 01:00:00,2024-04-28
1,Think about a positive moment with yoga,0.0,2024-04-11 01:00:00,2024-04-11
2,Yoga,0.0,2024-05-17 01:00:00,2024-05-17
3,Think about a positive moment with yoga,0.0,2024-01-28 01:00:00,2024-01-28
4,Read for fun,1.0,2023-09-05 01:00:00,2023-09-05


In [33]:
exvar_df = exvar_df.sort_values('Record_Date',ascending=False)
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Datetime,Record_Date
10684,Apply to 1 Job,0.0,2025-03-01 01:00:00,2025-03-01
5820,Think about a positive moment with yoga,0.0,2025-03-01 01:00:00,2025-03-01
6729,Remind myself of the challenges I've overcome ...,0.0,2025-03-01 01:00:00,2025-03-01
13418,Create a dedicated workspace to draw better bo...,0.0,2025-03-01 01:00:00,2025-03-01
11989,Just be,1.0,2025-03-01 01:00:00,2025-03-01


In [34]:
#March 1,2025 is incomplete date, the date I downloaded the data
cut_off_date = datetime(2025,3,1).date()
exvar_df = exvar_df[exvar_df['Record_Date'] < cut_off_date]

In [35]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Datetime,Record_Date
12610,Start a load of laundry,0.0,2025-02-28 01:00:00,2025-02-28
2582,Eat breakfast,1.0,2025-02-28 01:00:00,2025-02-28
2265,Make art 🎨,0.0,2025-02-28 01:00:00,2025-02-28
3309,Take a shower,0.0,2025-02-28 01:00:00,2025-02-28
12651,Step outside once,1.0,2025-02-28 01:00:00,2025-02-28


In [36]:
completed_variables_df = exvar_df.query("bullet_status == 1.0").drop(['bullet_status','Record_Datetime'],axis=1)
completed_variables_df = completed_variables_df.rename(columns={'text':'variable'})
completed_variables_df.head()

Unnamed: 0,variable,Record_Date
2582,Eat breakfast,2025-02-28
12651,Step outside once,2025-02-28
3128,Do 1 Job Hunt Related Task,2025-02-28
13348,Take Vitamin_D,2025-02-28
5500,Just be,2025-02-28


In [37]:
completed_variables_df.shape

(6811, 2)

In [38]:
# try smaller amount of variables for now, using the most commonly completed which could be an issue
num_variables = 15
ex_variables = completed_variables_df.groupby('variable').count().sort_values('Record_Date', ascending=False).head(num_variables).index.to_list()

In [39]:
ex_variables

['Drink water',
 'Just be',
 'Put away one item that is not in its place',
 'Eat breakfast',
 'Eat lunch',
 'Eat Dinner',
 'Meditation Timer',
 'Mindfulness Walk',
 'Read for fun',
 '7000 steps',
 'Take a shower',
 'Step outside once',
 'Yoga',
 'Take meds',
 'Start a load of laundry']

In [40]:
completed_var_limit_df = completed_variables_df[completed_variables_df['variable'].isin(ex_variables)]
completed_encoded_exvar_df = pd.get_dummies(completed_var_limit_df.variable).join(completed_var_limit_df).drop('variable',axis=1)
completed_encoded_exvar_df.head()

Unnamed: 0,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga,Record_Date
2582,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,2025-02-28
12651,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,2025-02-28
5500,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,2025-02-28
5454,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,2025-02-28
5492,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,2025-02-28


In [41]:
exvar_encoded_df = completed_encoded_exvar_df.groupby('Record_Date').any().reset_index()
exvar_encoded_df.head()

Unnamed: 0,Record_Date,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga
0,2022-10-26,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2022-10-27,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
2,2022-10-28,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2022-10-29,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2022-10-30,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False


In [42]:
#check some days in my Finch app:
exvar_encoded_df.sort_values('Record_Date', ascending=False).head()

Unnamed: 0,Record_Date,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga
849,2025-02-28,False,True,False,True,True,True,False,True,True,True,False,True,False,True,False
848,2025-02-27,False,True,True,True,True,True,False,True,True,True,True,True,True,True,False
847,2025-02-26,False,True,True,True,True,True,False,True,True,True,False,True,True,True,True
846,2025-02-25,False,True,False,True,True,True,False,True,True,True,False,True,False,True,True
845,2025-02-24,False,True,True,True,True,True,False,True,True,True,False,True,True,True,True


### Regression

In [43]:
x_min_date = exvar_encoded_df['Record_Date'].min()
x_max_date = exvar_encoded_df['Record_Date'].max()

print("X min date: " + str(x_min_date))
print("X max date: " + str(x_max_date))

X min date: 2022-10-26
X max date: 2025-02-28


In [44]:
y_min_date = satisfaction_df['Record_Date'].min()
y_max_date = satisfaction_df['Record_Date'].max()

print("Y min date: " + str(y_min_date))
print("Y max date: " + str(y_max_date))

Y min date: 2022-10-26
Y max date: 2025-02-27


In [45]:
min_date = max(x_min_date,y_min_date)
max_date = min(x_max_date,y_max_date)

print("min date: " + str(min_date))
print("max date: " + str(max_date))

min date: 2022-10-26
max date: 2025-02-27


In [46]:
print("satisfaction_df length before is " + str(len(satisfaction_df)))
satisfaction_df = satisfaction_df[(satisfaction_df['Record_Date'] > min_date) & (satisfaction_df['Record_Date'] < max_date)]
print("satisfaction_df length after is " + str(len(satisfaction_df)))

satisfaction_df length before is 679
satisfaction_df length after is 677


In [47]:
print("exvar_encoded_df length before is " + str(len(exvar_encoded_df)))
exvar_encoded_df = exvar_encoded_df[(exvar_encoded_df['Record_Date'] > min_date) & (exvar_encoded_df['Record_Date'] < max_date)]
print("exvar_encoded_df length after is " + str(len(exvar_encoded_df)))

exvar_encoded_df length before is 850
exvar_encoded_df length after is 847


In [48]:
y_missing_dates = pd.date_range(start=min_date, end=max_date).difference(satisfaction_df['Record_Date'])
x_missing_dates = pd.date_range(start=min_date, end=max_date).difference(exvar_encoded_df['Record_Date'])
missing_dates = set(())
for i in y_missing_dates:
    missing_dates.add(i.date())
for i in x_missing_dates:
        missing_dates.add(i)
print(len(missing_dates))

188


In [49]:
print("satisfaction records before reducing dates: " + str(len(satisfaction_df)))
satisfaction_trunc_df = satisfaction_df[~satisfaction_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("satisfaction records after reducing dates: " + str(len(satisfaction_trunc_df)))

satisfaction records before reducing dates: 677
satisfaction records after reducing dates: 677


In [50]:
print("satisfaction records before reducing dates: " + str(len(exvar_encoded_df)))
exvar_trunc_df = exvar_encoded_df[~exvar_encoded_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("satisfaction records after reducing dates: " + str(len(exvar_trunc_df)))

satisfaction records before reducing dates: 847
satisfaction records after reducing dates: 677


In [51]:
feature_cols = exvar_trunc_df.columns.to_list()
feature_cols.remove('Record_Date')
feature_cols

['7000 steps',
 'Drink water',
 'Eat Dinner',
 'Eat breakfast',
 'Eat lunch',
 'Just be',
 'Meditation Timer',
 'Mindfulness Walk',
 'Put away one item that is not in its place',
 'Read for fun',
 'Start a load of laundry',
 'Step outside once',
 'Take a shower',
 'Take meds',
 'Yoga']

In [52]:
logr = linear_model.LogisticRegression(random_state=16)
X = exvar_trunc_df[feature_cols]
y = satisfaction_trunc_df['value']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=16)
logr.fit(X_train,y_train)

In [53]:
y_pred = logr.predict(X_test)

In [54]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[ 0,  0,  2,  1,  0],
       [ 0,  0,  7,  4,  0],
       [ 0,  0, 18, 40,  2],
       [ 0,  0,  8, 61,  6],
       [ 0,  0,  2, 17,  2]], dtype=int64)

In [55]:
target_names = ['bad','meh','average','good','great']
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         bad       0.00      0.00      0.00         3
         meh       0.00      0.00      0.00        11
     average       0.49      0.30      0.37        60
        good       0.50      0.81      0.62        75
       great       0.20      0.10      0.13        21

    accuracy                           0.48       170
   macro avg       0.24      0.24      0.22       170
weighted avg       0.42      0.48      0.42       170



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [56]:
logr2 = linear_model.LogisticRegression(random_state=16)
X = exvar_trunc_df[feature_cols]
y = satisfaction_trunc_df['value']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=16)
logr2.fit(X_train,y_train)

In [57]:
y_pred = logr2.predict(X_test)

In [58]:
target_names = ['bad','meh','average','good','great']
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         bad       0.00      0.00      0.00         1
         meh       0.00      0.00      0.00         6
     average       0.57      0.32      0.41        53
        good       0.52      0.88      0.65        59
       great       0.25      0.06      0.10        17

    accuracy                           0.51       136
   macro avg       0.27      0.25      0.23       136
weighted avg       0.48      0.51      0.46       136



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
