In [16]:
import pandas as pd
from pymongo import MongoClient
import json

In [17]:
# Read the MongoDB login from the local credentials file.
with open('credentials.json') as credentials_file:
    credentials = json.load(credentials_file)

username = credentials['username']
password = credentials['password']

In [18]:
# Connect to the local MongoDB instance.
# The credentials are passed as keyword arguments rather than interpolated into
# the URI: inside a URI, reserved characters in the username/password
# (such as '@', ':', '/' or '%') must be percent-escaped, otherwise the
# connection string is parsed incorrectly.
client = MongoClient('127.0.0.1', username=username, password=password)
db = client.rais

In [19]:
# Handle to the `sema` collection.
# No documents are fetched here: the filtered query that follows builds `df`,
# so an extra unfiltered scan of the whole collection would only be discarded.
collection = db.sema

In [20]:
# Load only the documents whose mood answer is neither "<no-response>" nor null.
mood_filter = {
    '$and': [
        {"data.MOOD": {"$ne": "<no-response>"}},
        {"data.MOOD": {"$ne": None}},
    ]
}
matching_docs = list(collection.find(mood_filter))
df = pd.DataFrame(matching_docs)

In [21]:
# Expand the nested `data` documents into flat `data.*` columns next to user_id.
data_columns = pd.DataFrame(df['data'].tolist(), index=df.index).add_prefix('data.')
df = df[['user_id']].join(data_columns)

In [22]:
# Parse the creation timestamp once and derive both the calendar date and the
# clock time from that single parsed series.
created_ts = pd.to_datetime(df['data.CREATED_TS'])

df = (
    df[['user_id', 'data.MOOD']]
    .assign(
        # Midnight datetime64 values, so the date column can later be compared
        # against 'YYYY-MM-DD' strings.
        date=pd.to_datetime(created_ts.dt.date),
        time=created_ts.dt.time,
    )
    # Keep only the columns used downstream, in a fixed order.
    [['user_id', 'date', 'time', 'data.MOOD']]
    # Ordered by day only; rows within the same day are not ordered by time.
    .sort_values(by='date', ascending=True)
)
df

Unnamed: 0,user_id,date,time,data.MOOD
2898,621e362467b776a2404ad513,2021-04-22,20:46:00,JOY
3670,621e362467b776a2404ad513,2021-04-23,11:06:00,SURPRISE
4076,621e362467b776a2404ad513,2021-04-23,15:06:00,NEUTRAL
3187,621e362467b776a2404ad513,2021-04-23,10:43:00,NEUTRAL
3296,621e362467b776a2404ad513,2021-04-24,18:40:00,ANGER
...,...,...,...,...
138,621e2eaf67b776a2406b14ac,2022-01-16,18:38:00,SAD
649,621e346f67b776a24081744f,2022-01-17,11:00:00,TENSE/ANXIOUS
1113,621e30f467b776a240f22944,2022-01-17,10:19:00,NEUTRAL
4947,621e2f9167b776a240011ccb,2022-01-17,11:09:00,TENSE/ANXIOUS


In [23]:
# Count the rows that share the same key, for progressively looser keys.
# keep=False marks every member of a duplicated group, not just the repeats.
duplicate_checks = [
    ("Duplicates based on id, date, time and answer: {}", ['user_id', 'date', 'time', 'data.MOOD']),
    ("Duplicates based on id, date, time: {}", ['user_id', 'date', 'time']),
    ("Duplicates based on id, date: {}", ['user_id', 'date']),
]
for message, key_columns in duplicate_checks:
    n_duplicated = int(df.duplicated(subset=key_columns, keep=False).sum())
    print(message.format(n_duplicated))

Duplicates based on id, date, time and answer: 0
Duplicates based on id, date, time: 0
Duplicates based on id, date: 4435


The SEMAS-Mood questions were sent three times per day, so duplicates based on id and date alone are expected and acceptable.

In [24]:
# Keep only the rows that fall inside one of the two experiment rounds
# (both bounds inclusive), then stack the rounds with a fresh 0..n-1 index.
in_round1 = df['date'].between("2021-05-24", "2021-07-26")  # Round1
in_round2 = df['date'].between("2021-11-15", "2022-01-17")  # Round2
df1 = df[in_round1]
df2 = df[in_round2]
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,user_id,date,time,data.MOOD
0,621e2f3967b776a240c654db,2021-05-24,20:16:00,RESTED/RELAXED
1,621e362467b776a2404ad513,2021-05-24,10:31:00,HAPPY
2,621e32d967b776a240627414,2021-05-24,20:16:00,RESTED/RELAXED
3,621e2f7a67b776a240f14425,2021-05-24,21:55:00,RESTED/RELAXED
4,621e34ca67b776a240be3b69,2021-05-24,20:36:00,TIRED
...,...,...,...,...
4943,621e2eaf67b776a2406b14ac,2022-01-16,18:38:00,SAD
4944,621e346f67b776a24081744f,2022-01-17,11:00:00,TENSE/ANXIOUS
4945,621e30f467b776a240f22944,2022-01-17,10:19:00,NEUTRAL
4946,621e2f9167b776a240011ccb,2022-01-17,11:09:00,TENSE/ANXIOUS


In [25]:
# One-hot encode the mood answer: one indicator column per distinct mood value.
# The dtype is pinned so the indicators are 0/1 integers regardless of the
# installed pandas version (pandas >= 2.0 returns booleans by default).
sema = pd.get_dummies(df['data.MOOD'], dtype='uint8')
sema

Unnamed: 0,ALERT,HAPPY,NEUTRAL,RESTED/RELAXED,SAD,TENSE/ANXIOUS,TIRED
0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
4943,0,0,0,0,1,0,0
4944,0,0,0,0,0,1,0
4945,0,0,1,0,0,0,0
4946,0,0,0,0,0,1,0


In [26]:
# Attach the one-hot mood columns to the answers, then discard the raw mood
# label and the time of day, leaving user_id, date and the indicator columns.
combined = pd.concat([df, sema], axis=1)
sema_stress = combined.drop(columns=['data.MOOD', 'time'])

In [27]:
# Display the combined frame: user_id, date and the one-hot mood columns.
sema_stress

Unnamed: 0,user_id,date,ALERT,HAPPY,NEUTRAL,RESTED/RELAXED,SAD,TENSE/ANXIOUS,TIRED
0,621e2f3967b776a240c654db,2021-05-24,0,0,0,1,0,0,0
1,621e362467b776a2404ad513,2021-05-24,0,1,0,0,0,0,0
2,621e32d967b776a240627414,2021-05-24,0,0,0,1,0,0,0
3,621e2f7a67b776a240f14425,2021-05-24,0,0,0,1,0,0,0
4,621e34ca67b776a240be3b69,2021-05-24,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
4943,621e2eaf67b776a2406b14ac,2022-01-16,0,0,0,0,1,0,0
4944,621e346f67b776a24081744f,2022-01-17,0,0,0,0,0,1,0
4945,621e30f467b776a240f22944,2022-01-17,0,0,1,0,0,0,0
4946,621e2f9167b776a240011ccb,2022-01-17,0,0,0,0,0,1,0


In [28]:
# Persist the one-hot encoded mood table to disk as a pickle file.
output_path = 'data/sema_stress_read_from_the_base_experiment_dates.pkl'
sema_stress.to_pickle(output_path)

In [29]:
# Column names of the exported table, as a plain Python list.
sema_stress.columns.tolist()

['user_id',
 'date',
 'ALERT',
 'HAPPY',
 'NEUTRAL',
 'RESTED/RELAXED',
 'SAD',
 'TENSE/ANXIOUS',
 'TIRED']