In [8]:
import pandas as pd
import numpy as np
import datetime
from sklearn import metrics

In [9]:
# Read the raw records from disk; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='dataset_mood_smartphone.csv',
    index_col=0,
)
# Show the loaded frame as the cell output (columns: id, time, variable, value).
df

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.000
2,AS14.01,2014-02-26 15:00:00.000,mood,6.000
3,AS14.01,2014-02-26 18:00:00.000,mood,6.000
4,AS14.01,2014-02-26 21:00:00.000,mood,7.000
5,AS14.01,2014-02-27 09:00:00.000,mood,6.000
...,...,...,...,...
2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033


# Get Y_true-values

In [23]:
# Derive a calendar-date column from the timestamp strings; later cells rely on it.
df['date'] = pd.to_datetime(df['time']).dt.date

# Count the entries recorded on each date and pick the date with the
# second-highest count (despite its name, `max_entries` holds that date, not a count).
# The day after it is the day to predict for.
# NOTE(review): the original comment says the busiest date was skipped because
# not all users are present around it -- confirm against the data.
entries_per_date = df.groupby('date')['time'].count()
max_entries = entries_per_date.sort_values().index[-2]

# All records that fall on the target (prediction) day.
ydate = max_entries + datetime.timedelta(days=1)
df_ydate = df[df['date'] == ydate]

# y_true: for every user, the last mood value reported on the target day.
mood_on_ydate = df_ydate[df_ydate['variable'] == 'mood']
y_true = mood_on_ydate.sort_values('time').groupby('id').last()['value']
y_true

id
AS14.01    7.0
AS14.02    7.0
AS14.03    8.0
AS14.05    8.0
AS14.06    8.0
AS14.07    6.0
AS14.08    5.0
AS14.09    7.0
AS14.12    7.0
AS14.13    3.0
AS14.14    6.0
AS14.15    6.0
AS14.16    7.0
AS14.17    7.0
AS14.19    6.0
AS14.20    7.0
AS14.23    7.0
AS14.24    8.0
AS14.25    6.0
AS14.26    8.0
AS14.27    9.0
AS14.28    5.0
AS14.29    9.0
AS14.30    8.0
AS14.31    7.0
AS14.32    7.0
AS14.33    9.0
Name: value, dtype: float64

# Get Y_pred-values

In [24]:
# The feature day is the reference date itself, i.e. one day before the target day.
xdate = max_entries
df_xdate = df[df['date'] == xdate]

# y_pred: for every user, the last mood value reported on the feature day.
mood_on_xdate = df_xdate[df_xdate['variable'] == 'mood']
y_pred = mood_on_xdate.sort_values('time').groupby('id').last()['value']
y_pred

id
AS14.01    7.0
AS14.02    8.0
AS14.03    7.0
AS14.05    7.0
AS14.06    8.0
AS14.07    5.0
AS14.08    6.0
AS14.09    8.0
AS14.12    7.0
AS14.13    6.0
AS14.14    6.0
AS14.15    6.0
AS14.16    7.0
AS14.17    7.0
AS14.19    6.0
AS14.20    8.0
AS14.23    7.0
AS14.24    7.0
AS14.25    7.0
AS14.26    7.0
AS14.27    7.0
AS14.28    7.0
AS14.29    8.0
AS14.30    7.0
AS14.31    7.0
AS14.32    9.0
AS14.33    8.0
Name: value, dtype: float64

In [25]:
# Sanity check: both series should hold exactly one value per user in the dataset.
# NOTE(review): only the lengths are compared here; the metrics below pair the
# values by position, so y_true and y_pred are assumed to list the same ids in
# the same order -- confirm if the selected dates ever change.
num_users = len(df["id"].unique())
print(f'Y_true and Y_pred have the same and correct length ({num_users}): {len(y_true) == len(y_pred) == num_users}')

# Regression metrics for predicting the target day's mood from the previous day's mood.
# The import cell binds `metrics` (from sklearn import metrics); the bare name
# `sklearn` is never imported, so `sklearn.metrics...` raises NameError on a fresh kernel.
mse = metrics.mean_squared_error(y_true, y_pred)
# RMSE is taken as the square root of the MSE instead of passing `squared=False`,
# a keyword that newer scikit-learn releases deprecated and then removed.
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_true, y_pred)

print(f'MSE = {mse}, RMSE = {rmse}, R2 = {r2}')

Y_true and Y_pred have the same and correct length (27): True
MSE = 1.2592592592592593, RMSE = 1.1221672153735642, R2 = 0.2760252365930599
