In [1]:
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn import linear_model, model_selection, metrics
import matplotlib.pyplot as plt  

from load_data import LoadData

In [2]:
data_location = '../Sample Data/'
export_date = '2025-03-01'
load_data = LoadData(data_location, export_date)

### Response variables

In [3]:
mood_data = load_data.load_mood_data()
mood_data.head()

Unnamed: 0,dt,mood_type,value,updated_time
0,"Sat, 6 May 2023 14:00:58",feeling,4,"Sat, 6 May 2023 14:00:58"
1,"Sun, 11 Feb 2024 04:06:37",satisfaction,4,"Sun, 11 Feb 2024 04:06:37"
2,"Wed, 2 Nov 2022 09:04:11",motivation,3,"Wed, 2 Nov 2022 09:04:11"
3,"Wed, 12 Jul 2023 19:02:37",feeling,3,"Wed, 12 Jul 2023 19:02:37"
4,"Sat, 1 Mar 2025 13:39:47",motivation,2,"Sat, 1 Mar 2025 13:39:47"


In [4]:
mood_data['value'] = pd.to_numeric(mood_data.value)

In [5]:
# note: a pandas update didn't like the non-zero-padded 'day' of the Finch data
# so I'm using datetime.strptime instead of pd.to_datetime for now
mood_data['Record_Datetime'] = mood_data.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S"))
mood_data = mood_data.drop(['dt','updated_time'],axis=1)

In [6]:
mood_data.head()

Unnamed: 0,mood_type,value,Record_Datetime
0,feeling,4,2023-05-06 14:00:58
1,satisfaction,4,2024-02-11 04:06:37
2,motivation,3,2022-11-02 09:04:11
3,feeling,3,2023-07-12 19:02:37
4,motivation,2,2025-03-01 13:39:47


In [7]:
# subtract 7 hrs from datetime
# 7 is good because at 7am the next day starts ( according to my Finch app settings ),
# so the satisfaction score of the previous day must be submitted by then
MORNING_CUTOFF_HOUR = 7
time_shift = timedelta(hours=MORNING_CUTOFF_HOUR)

In [8]:
mood_data['Adjusted_Datetime'] = mood_data['Record_Datetime'] - time_shift
mood_data['Record_Date'] = mood_data['Adjusted_Datetime'].dt.date

In [9]:
#find date range to use
# here I cut off the last day because usually the export date isn't a completed day
start_date = mood_data['Record_Date'].min()
end_date = mood_data['Record_Date'].max() - timedelta(days=1)

In [10]:
all_days = pd.date_range(start=start_date, end=end_date, freq='D')

In [11]:
dfs = {}
for mood_type in ('feeling','satisfaction','motivation'):
    type_df = mood_data[mood_data['mood_type'] == mood_type].copy()
    
    if mood_type == 'feeling':
        type_df = type_df.groupby('Record_Date')['value'].mean()
    else:
        # ugh these should only have 1 value a day, shouldn't do first!
        # TODO
        type_df = type_df.groupby('Record_Date')['value'].first() 
    
    # Reindex to full range
    type_df = type_df.reindex(all_days)
    type_df.name = mood_type
    dfs[mood_type] = type_df

In [12]:
# Combine into one DataFrame
mood_df = pd.concat(dfs.values(), axis=1)
mood_df.index.name = 'Record_Date'
mood_df = mood_df.reset_index()
mood_df = mood_df.rename(columns={'feeling':'average_feeling'})

In [13]:
mood_df.head(10)

Unnamed: 0,Record_Date,average_feeling,satisfaction,motivation
0,2022-10-26,3.0,3.0,3.0
1,2022-10-27,2.833333,2.0,4.0
2,2022-10-28,3.125,4.0,2.0
3,2022-10-29,3.375,3.0,3.0
4,2022-10-30,3.571429,4.0,4.0
5,2022-10-31,3.2,4.0,3.0
6,2022-11-01,3.428571,3.0,3.0
7,2022-11-02,2.75,1.0,3.0
8,2022-11-03,3.0,3.0,2.0
9,2022-11-04,3.333333,2.0,3.0


### Satisfaction

In [14]:
satisfaction_data = mood_df[['Record_Date','satisfaction']]
satisfaction_data = satisfaction_data.rename(columns={'satisfaction':'value'})
satisfaction_data.head()

Unnamed: 0,Record_Date,value
0,2022-10-26,3.0
1,2022-10-27,2.0
2,2022-10-28,4.0
3,2022-10-29,3.0
4,2022-10-30,4.0


### Feelings

In [15]:
avg_feelings_df = mood_df[['Record_Date','average_feeling']]
avg_feelings_df = avg_feelings_df.rename(columns={'average_feeling':'value'})
avg_feelings_df.head()

Unnamed: 0,Record_Date,value
0,2022-10-26,3.0
1,2022-10-27,2.833333
2,2022-10-28,3.125
3,2022-10-29,3.375
4,2022-10-30,3.571429


### Explanatory variables

In [16]:
exvar_data = load_data.load_activity_data()
exvar_data.head()

Unnamed: 0,dt,bullet_type,text,score,is_processed,bullet_status,preferred_emoji_char,completed_time,creation_time,position,...,linked_action_config.bullet.targeting_power_up_uuids,linked_action_config.bullet.associated_power_up_uuids,linked_action_config.bullet.linked_action_config,preferred_area_names,action_name,story_id,is_persistent,preferred_sentiment,gift_ids,linked_community_habit_id
0,"Sun, 28 Apr 2024 01:00:00",1,#Meditation Timer,0.0,True,0.0,🧘🏾,,"Sun, 28 Apr 2024 08:50:29",7.0,...,,,,,,,,,,
1,"Thu, 11 Apr 2024 01:00:00",1,Think about a positive moment with #yoga,6.666667,True,0.0,,,"Thu, 11 Apr 2024 09:01:23",0.0,...,[],[],,,,,,,,
2,"Fri, 17 May 2024 01:00:00",1,#Yoga,0.0,True,0.0,,,"Fri, 17 May 2024 10:26:16",3.0,...,,,,,,,,,,
3,"Sun, 28 Jan 2024 01:00:00",1,Think about a positive moment with #yoga,6.666667,True,0.0,,,"Sun, 28 Jan 2024 07:45:41",0.0,...,[],[],,,,,,,,
4,"Tue, 5 Sep 2023 01:00:00",1,#Read for #fun,6.666667,True,1.0,,"Tue, 5 Sep 2023 14:56:34","Tue, 5 Sep 2023 10:16:36",8.0,...,,,,[],,,,,,


In [17]:
exvar_df = exvar_data[['dt','bullet_type','text','bullet_status']]
exvar_df = exvar_df.query("bullet_type == 1")
exvar_df = exvar_df.drop('bullet_type',axis=1)

In [18]:
exvar_df.head()

Unnamed: 0,dt,text,bullet_status
0,"Sun, 28 Apr 2024 01:00:00",#Meditation Timer,0.0
1,"Thu, 11 Apr 2024 01:00:00",Think about a positive moment with #yoga,0.0
2,"Fri, 17 May 2024 01:00:00",#Yoga,0.0
3,"Sun, 28 Jan 2024 01:00:00",Think about a positive moment with #yoga,0.0
4,"Tue, 5 Sep 2023 01:00:00",#Read for #fun,1.0


In [19]:
# convert to datetime
exvar_df['Record_Date'] = exvar_df.dt.map(lambda x: datetime.strptime(x, "%a, %d %b %Y %H:%M:%S")).dt.date
exvar_df = exvar_df.drop(['dt'],axis=1)

In [20]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
0,#Meditation Timer,0.0,2024-04-28
1,Think about a positive moment with #yoga,0.0,2024-04-11
2,#Yoga,0.0,2024-05-17
3,Think about a positive moment with #yoga,0.0,2024-01-28
4,#Read for #fun,1.0,2023-09-05


In [21]:
exvar_df['text'] = exvar_df['text'].str.replace("#","")
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
0,Meditation Timer,0.0,2024-04-28
1,Think about a positive moment with yoga,0.0,2024-04-11
2,Yoga,0.0,2024-05-17
3,Think about a positive moment with yoga,0.0,2024-01-28
4,Read for fun,1.0,2023-09-05


In [22]:
# replace {old_name: new name}
duplicates_dict = {
    "Laundry":"Start a load of laundry",
    "Create a Finch_Backup_File":"Create a Finch Backup File",
    "Walk around the neighborhood":"Mindfulness Walk"
}

In [23]:
exvar_df['text'] = exvar_df['text'].replace(duplicates_dict)

In [24]:
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
0,Meditation Timer,0.0,2024-04-28
1,Think about a positive moment with yoga,0.0,2024-04-11
2,Yoga,0.0,2024-05-17
3,Think about a positive moment with yoga,0.0,2024-01-28
4,Read for fun,1.0,2023-09-05


In [25]:
exvar_df = exvar_df.sort_values('Record_Date',ascending=False)
exvar_df.head()

Unnamed: 0,text,bullet_status,Record_Date
10684,Apply to 1 Job,0.0,2025-03-01
5820,Think about a positive moment with yoga,0.0,2025-03-01
6729,Remind myself of the challenges I've overcome ...,0.0,2025-03-01
13418,Create a dedicated workspace to draw better bo...,0.0,2025-03-01
11989,Just be,1.0,2025-03-01


In [26]:
completed_variables_df = exvar_df.query("bullet_status == 1.0").drop(['bullet_status'],axis=1)
completed_variables_df = completed_variables_df.rename(columns={'text':'variable'})
completed_variables_df.head()

Unnamed: 0,variable,Record_Date
11989,Just be,2025-03-01
5652,Eat breakfast,2025-03-01
7609,Read for fun,2025-03-01
2804,Take meds,2025-03-01
10732,Take Vitamin_D,2025-03-01


In [27]:
completed_variables_df.shape

(6817, 2)

In [28]:
# try smaller amount of variables for now, using the most commonly completed which could be an issue
num_variables = 15
ex_variables = completed_variables_df.groupby('variable').count().sort_values('Record_Date', ascending=False).head(num_variables).index.to_list()

In [29]:
ex_variables

['Drink water',
 'Just be',
 'Put away one item that is not in its place',
 'Eat breakfast',
 'Eat lunch',
 'Eat Dinner',
 'Meditation Timer',
 'Mindfulness Walk',
 'Read for fun',
 '7000 steps',
 'Take a shower',
 'Step outside once',
 'Yoga',
 'Take meds',
 'Start a load of laundry']

In [30]:
completed_var_limit_df = completed_variables_df[completed_variables_df['variable'].isin(ex_variables)]
completed_encoded_exvar_df = pd.get_dummies(completed_var_limit_df.variable).join(completed_var_limit_df).drop('variable',axis=1)
completed_encoded_exvar_df.head()

Unnamed: 0,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga,Record_Date
11989,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,2025-03-01
5652,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,2025-03-01
7609,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,2025-03-01
2804,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,2025-03-01
5930,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,2025-03-01


In [31]:
exvar_encoded_df = completed_encoded_exvar_df.groupby('Record_Date').any()
exvar_encoded_df.head()

Unnamed: 0_level_0,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga
Record_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-10-26,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2022-10-27,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
2022-10-28,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2022-10-29,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2022-10-30,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False


In [32]:
# find date range to use
# here I cut off the last day because usually the export date isn't a completed day
start_date_exvar = exvar_encoded_df.index.min() + timedelta(days=1)
end_date_exvar = exvar_encoded_df.index.max() - timedelta(days=1)

exvar_encoded_df = exvar_encoded_df[(exvar_encoded_df.index > start_date_exvar) & (exvar_encoded_df.index < end_date_exvar)]
all_days_exvar = pd.date_range(start=start_date_exvar, end=end_date_exvar, freq='D')

exvar_encoded_df = exvar_encoded_df.reindex(all_days_exvar)
exvar_encoded_df.index.name = 'Record_Date'
exvar_encoded_df = exvar_encoded_df.reset_index()

In [33]:
exvar_encoded_df.head()

Unnamed: 0,Record_Date,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga
0,2022-10-27,,,,,,,,,,,,,,,
1,2022-10-28,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2022-10-29,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2022-10-30,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
4,2022-10-31,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False


In [39]:
#check some days in my Finch app:
exvar_encoded_df.sort_values('Record_Date', ascending=False).head()

#TODO, why first and last row (chronilogically) are all null values?

Unnamed: 0,Record_Date,7000 steps,Drink water,Eat Dinner,Eat breakfast,Eat lunch,Just be,Meditation Timer,Mindfulness Walk,Put away one item that is not in its place,Read for fun,Start a load of laundry,Step outside once,Take a shower,Take meds,Yoga
855,2025-02-28,,,,,,,,,,,,,,,
854,2025-02-27,False,True,True,True,True,True,False,True,True,True,True,True,True,True,False
853,2025-02-26,False,True,True,True,True,True,False,True,True,True,False,True,True,True,True
852,2025-02-25,False,True,False,True,True,True,False,True,True,True,False,True,False,True,True
851,2025-02-24,False,True,True,True,True,True,False,True,True,True,False,True,True,True,True


### Regression

In [35]:
x_min_date = exvar_encoded_df['Record_Date'].min()
x_max_date = exvar_encoded_df['Record_Date'].max()

print("X min date: " + str(x_min_date))
print("X max date: " + str(x_max_date))

X min date: 2022-10-27 00:00:00
X max date: 2025-02-28 00:00:00


In [40]:
y_min_date = satisfaction_data['Record_Date'].min()
y_max_date = satisfaction_data['Record_Date'].max()

print("Y min date: " + str(y_min_date))
print("Y max date: " + str(y_max_date))

Y min date: 2022-10-26 00:00:00
Y max date: 2025-02-28 00:00:00


In [41]:
min_date = max(x_min_date,y_min_date)
max_date = min(x_max_date,y_max_date)

print("min date: " + str(min_date))
print("max date: " + str(max_date))

min date: 2022-10-27 00:00:00
max date: 2025-02-28 00:00:00


In [42]:
print("satisfaction_data length before is " + str(len(satisfaction_data)))
satisfaction_data = satisfaction_data[(satisfaction_data['Record_Date'] > min_date) & (satisfaction_data['Record_Date'] < max_date)]
print("satisfaction_data length after is " + str(len(satisfaction_data)))

satisfaction_data length before is 857
satisfaction_data length after is 854


In [43]:
print("exvar_encoded_df length before is " + str(len(exvar_encoded_df)))
exvar_encoded_df = exvar_encoded_df[(exvar_encoded_df['Record_Date'] > min_date) & (exvar_encoded_df['Record_Date'] < max_date)]
print("exvar_encoded_df length after is " + str(len(exvar_encoded_df)))

exvar_encoded_df length before is 856
exvar_encoded_df length after is 854


In [44]:
y_missing_dates = pd.date_range(start=min_date, end=max_date).difference(satisfaction_data['Record_Date'])
x_missing_dates = pd.date_range(start=min_date, end=max_date).difference(exvar_encoded_df['Record_Date'])
missing_dates = set(())
for i in y_missing_dates:
    missing_dates.add(i.date())
for i in x_missing_dates:
        missing_dates.add(i)
print(len(missing_dates))

4


In [45]:
print("satisfaction records before reducing dates: " + str(len(satisfaction_data)))
satisfaction_trunc_df = satisfaction_data[~satisfaction_data['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("satisfaction records after reducing dates: " + str(len(satisfaction_trunc_df)))

satisfaction records before reducing dates: 854
satisfaction records after reducing dates: 854


  satisfaction_trunc_df = satisfaction_data[~satisfaction_data['Record_Date'].isin(missing_dates)].sort_values('Record_Date')


In [46]:
print("exp variables records before reducing dates: " + str(len(exvar_encoded_df)))
exvar_trunc_df = exvar_encoded_df[~exvar_encoded_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')
print("exp variables records after reducing dates: " + str(len(exvar_trunc_df)))

exp variables records before reducing dates: 854
exp variables records after reducing dates: 854


  exvar_trunc_df = exvar_encoded_df[~exvar_encoded_df['Record_Date'].isin(missing_dates)].sort_values('Record_Date')


In [47]:
feature_cols = exvar_trunc_df.columns.to_list()
feature_cols.remove('Record_Date')
feature_cols

['7000 steps',
 'Drink water',
 'Eat Dinner',
 'Eat breakfast',
 'Eat lunch',
 'Just be',
 'Meditation Timer',
 'Mindfulness Walk',
 'Put away one item that is not in its place',
 'Read for fun',
 'Start a load of laundry',
 'Step outside once',
 'Take a shower',
 'Take meds',
 'Yoga']

In [48]:
logr = linear_model.LogisticRegression(random_state=16)
X = exvar_trunc_df[feature_cols]
y = satisfaction_trunc_df['value']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=16)
logr.fit(X_train,y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
y_pred = logr.predict(X_test)

In [49]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

NameError: name 'y_pred' is not defined

In [None]:
target_names = ['bad','meh','average','good','great']
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

In [None]:
logr2 = linear_model.LogisticRegression(random_state=16)
X = exvar_trunc_df[feature_cols]
y = satisfaction_trunc_df['value']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=16)
logr2.fit(X_train,y_train)

In [None]:
y_pred = logr2.predict(X_test)

In [None]:
target_names = ['bad','meh','average','good','great']
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

### feeling frequency

In [None]:
avg_daily_feelings_df = avg_feelings_df.reset_index()

In [None]:
avg_daily_feelings_df.head()

In [None]:
max_feel = max(avg_daily_feelings_df.value)
min_feel = min(avg_daily_feelings_df.value)
num_bins = 4
jumps = (max_feel - min_feel)//num_bins

plt.hist(avg_daily_feelings_df.value, bins=num_bins)
plt.grid(True)
#plt.yticks(range(0,10,2))
plt.xticks(range(1,6,int(jumps)))
plt.xlabel('Daily Feels')
plt.ylabel('Number of Days')
plt.title('Feeling Ranges')
plt.show()

In [None]:
month_feels = feeling_data[['value','Record_Datetime']]
month_feels.head()

In [None]:
month_feels['Record_Month'] = month_feels['Record_Datetime'].dt.month

In [None]:
month_feels.head()

In [None]:
month_feels_2024 = month_feels[month_feels['Record_Datetime'].dt.year == 2024]

In [None]:
avg_month_feels_2024 = month_feels_2024.drop('Record_Datetime',axis=1).groupby('Record_Month').mean().reset_index()

In [None]:
plt.bar(avg_month_feels_2024.Record_Month, avg_month_feels_2024.value)
plt.xticks(range(1,13))

plt.show()

In [None]:
month_feels['Record_Year'] = month_feels['Record_Datetime'].dt.year
month_feels.head()

In [None]:
mon_year_avg_feels = month_feels.groupby(['Record_Month','Record_Year']).mean().reset_index().drop('Record_Datetime',axis=1)
mon_2024_avg_feels = mon_year_avg_feels[mon_year_avg_feels['Record_Year'] == 2024]
mon_2023_avg_feels = mon_year_avg_feels[mon_year_avg_feels['Record_Year'] == 2023]

In [None]:
monthList = range(1,13)

plt.bar([a - 0.25 for a in monthList], mon_2023_avg_feels.value, width= 0.25, label = '2023', align='edge')
plt.bar([a + 0.25 for a in monthList], mon_2024_avg_feels.value, width= -0.25, label = '2024', align='edge')
plt.xticks(monthList)

plt.xlabel('Month')
plt.ylabel('average feeling')
plt.legend(loc='upper left')
plt.grid(True, linewidth= 1, linestyle="--")

plt.show()