In [2]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import seaborn as sns


# to be able to use .head(100) to see more rows of df
pd.set_option("display.max_rows", 100, "display.max_columns", None)
pd.set_option('use_inf_as_na', True)

In [3]:
# read the data
df = pd.read_csv('dataset_mood_smartphone.csv', index_col=0)

# get dates from timestamp
df['time']= pd.to_datetime(df['time'])
df['date'] = df['time'].dt.date

# Feature Engineering setup

In [4]:
# turn 'variable' into separate columns, taking sum of values per day per user
df_sum = df.groupby(['id', 'date', 'variable'])['value'].sum().unstack()

# turn 'variable' into separate columns, taking mean of values per day per user
df_mean = df.groupby(['id', 'date', 'variable'])['value'].mean().unstack()

In [5]:
# combine mean of score values and sum of time values into one dataframe
df_combi = df_sum
df_combi['mood'] = df_mean['mood']
df_combi['circumplex.arousal'] = df_mean['circumplex.arousal']
df_combi['circumplex.valence'] = df_mean['circumplex.valence']
df_combi['activity'] = df_mean['activity']
df_combi.columns.name = None

In [6]:
df_rolled = df_combi.dropna(subset=['mood','screen'], how='all')
df_rolled = df_rolled.reset_index(level='id')
df_rolled = df_rolled.reset_index(level='date')

In [7]:
df_sub = df_rolled.dropna(subset=['mood'], how='all')
df_sub = df_sub.replace([np.inf, -np.inf],0)
df_sub = df_sub.fillna(0)

In [8]:
df_sub['benchmark'] =  df_sub['mood'].shift().where(df_sub.id.eq(df_sub.id.shift()))

In [9]:
df_sub.loc[0, 'benchmark'] = df_sub.loc[0, 'mood']

# Models

In [30]:
X_train_all = pd.DataFrame()
X_test_all = pd.DataFrame()
X_val_all = pd.DataFrame()
y_train_all = pd.DataFrame()
y_test_all = pd.DataFrame()
y_val_all = pd.DataFrame()

X_test_user = pd.DataFrame()
y_test_user = pd.DataFrame()

for i in df.id.unique():
    dfid = df_sub[df_sub['id'] == i]
    
#     y = dfid['mood']
    y = dfid[['id', 'mood']]
    
#     X = dfid['benchmark']
    X = dfid[['id', 'benchmark']]
    
    len_1 = int(len(X) * 0.8)
    len_11 = len(X) - len_1
    
    X_train_sub = X.head(len_1)
    X_test = X.tail(len_11)['benchmark']
    
    y_train_sub = y.head(len_1)
    y_test = y.tail(len_11)['mood']
    
    len_2 = int(len(X_train_sub)* 0.8)
    len_22 = len(X_train_sub) - len_2 
    
    X_train = X_train_sub.head(len_2)['benchmark']
    X_val = X_train_sub.tail(len_22)['benchmark']
    
    y_train = y_train_sub.head(len_2)['mood']
    y_val = y_train_sub.tail(len_22)['mood']
    
    X_user = X.tail(len_11)
    y_user = y.tail(len_11)


    X_train_all = pd.concat([X_train_all, X_train])
    X_test_all = pd.concat([X_test_all, X_test])
    X_val_all= pd.concat([X_val_all,X_val])
    y_train_all= pd.concat([y_train_all,y_train])
    y_test_all = pd.concat([y_test_all,y_test])
    y_val_all = pd.concat([y_val_all,y_val])
    X_test_user = pd.concat([X_test_user, X_user])
    y_test_user = pd.concat([y_test_user, y_user])

In [31]:
 len(X_train_all), len(X_test_all), len(X_val_all)

(790, 266, 212)

In [32]:
len(y_train_all),len( y_test_all ), len( y_val_all )

(790, 266, 212)

### Benchmark Model

In [35]:


bench_mse = metrics.mean_squared_error(X_test_all, y_test_all, squared=True)
bench_rmse = metrics.mean_squared_error(X_test_all, y_test_all, squared=False)
bench_r2 = metrics.r2_score(X_test_all, y_test_all)
bench_acc = '-'

print(f'MSE = {bench_mse}, RMSE = {bench_rmse}, R2 = {bench_r2}, accuracy = {bench_acc}')
bench_scores = {'model': 'Benchmark', 'MSE':bench_mse, 'RMSE':bench_rmse, 'R2':bench_r2, 'accuracy':bench_acc}

MSE = 0.4863982873851294, RMSE = 0.6974226031504351, R2 = 0.1373223261339751, accuracy = -


In [36]:
scores = [bench_scores]
df_scores = pd.DataFrame(scores).set_index('model')
df_scores.index.name = None
df_scores

Unnamed: 0,MSE,RMSE,R2,accuracy
Benchmark,0.486398,0.697423,0.137322,-


In [42]:
mae = []
rmse = []
r2 = []
for user in df.id.unique():
    X_user = X_test_user[X_test_user['id'] == user]['benchmark']
    y_user = y_test_user[y_test_user['id'] == user]['mood']
    mae.append(metrics.mean_absolute_error(X_user, y_user))
    rmse.append(metrics.mean_squared_error(X_user, y_user, squared=False))
    r2.append(metrics.r2_score(X_user, y_user))
    
scores_b = {'mae':mae, 'rmse':rmse, 'r2':r2}
scores_b

{'mae': [0.33999999999999986,
  0.6666666666666667,
  0.27666666666666667,
  0.3060606060606061,
  0.7055555555555555,
  1.1733333333333333,
  0.3666666666666668,
  0.575,
  0.5888888888888888,
  0.5181818181818184,
  0.40740740740740744,
  0.29090909090909095,
  0.7893939393939394,
  0.4583333333333333,
  0.5783333333333335,
  0.24666666666666667,
  0.5999999999999998,
  0.5272727272727273,
  0.3777777777777776,
  0.6714285714285714,
  0.5703703703703702,
  0.45416666666666683,
  0.5185185185185184,
  0.16999999999999993,
  0.12962962962962957,
  0.7666666666666665,
  0.672222222222222],
 'rmse': [0.41231056256176596,
  0.926296222251837,
  0.3148191720831358,
  0.35795434503762225,
  0.7987837977547839,
  1.4722431864335455,
  0.4781105976547098,
  0.7614131598547532,
  0.6964194138592059,
  0.5946733251427746,
  0.5272804674535411,
  0.45126085985421294,
  0.9917207777444721,
  0.5841660722773961,
  0.7029501325761942,
  0.3160520350968949,
  0.729725975966321,
  0.6746866088800194,