In [2]:
import sys
import os
import pandas as pd
import numpy as np
import biosignalsnotebooks as bsnb
import biosignalsnotebooks.signal_samples as bsnb_ss
import re
import pickle
import scipy as sp
import datetime

In [8]:
# Load the raw survey export into a DataFrame.
raw_survey_csv = '/Users/htr365/Documents/Side_Projects/09_founding_lab/amanda_johanna/quantified_self.nosync/pipeline/survey_data/raw_survey_data.csv'
df = pd.read_csv(filepath_or_buffer=raw_survey_csv)

In [9]:
# Column labels for the survey export, in the order they are applied to the frame.
names = [
    # submission metadata
    'timestamp', 'person',
    # the 20 CIS items
    *(f'cis_{item}' for item in range(1, 21)),
    # Hooper items plus the extra muscle question
    'hooper_muscles_sore', 'extra_muscles_tired', 'hooper_sleep_quality',
    'hooper_fatigue_level', 'hooper_stress_level',
    # daily behaviour
    'sleep_hours', 'coffee_count', 'physical_activity', 'step_count',
    # free-text answers
    'f_illness', 'f_changes', 'f_day_description', 'f_measurement_comments', 'f_other',
    # remaining columns
    'alcohol', 'fatigue_expected', 'X', 'muscles_sore_expected', 'muscles_tired_expected',
]

In [11]:
# Apply the descriptive labels positionally (pandas raises if the number of
# labels does not match the number of columns), then preview the first rows.
df = df.set_axis(names, axis='columns')
df.head()

Unnamed: 0,timestamp,person,cis_1,cis_2,cis_3,cis_4,cis_5,cis_6,cis_7,cis_8,...,f_illness,f_changes,f_day_description,f_measurement_comments,f_other,alcohol,fatigue_expected,X,muscles_sore_expected,muscles_tired_expected
0,24/06/2024 23:40:05,Johanna,Disagree,Strongly agree,Disagree,Strongly agree,Strongly agree,Strongly agree,Somewhat agree,Somewhat agree,...,,,measurement + work from home ~1h\n20min cycle ...,"- first measurement I did without button, prot...",this was a bit of a test day,,,,,
1,25/06/2024 13:52:02,Amanda,Somewhat agree,Neither agree or disagree,Somewhat disagree,Somewhat disagree,Somewhat agree,Somewhat agree,Neither agree or disagree,Somewhat disagree,...,"No, sore muscles and inflamed hand.",No,"Travelled to Brisbane 2 hours, install show 6 ...",Did first measurement twice as I wasn't on the...,Only 29 days to go!!!!,,,,,
2,25/06/2024 22:33:22,Johanna,Disagree,Agree,Disagree,Neither agree or disagree,Somewhat agree,Somewhat agree,Neither agree or disagree,Agree,...,no,,- 20min yoga stretching\n- 20min cycling to wo...,"pedometer showed 10500 steps, phone showed 140...",,no alcohol,,,,
3,26/06/2024 13:51:24,Amanda,Agree,Somewhat disagree,Agree,Agree,Somewhat disagree,Somewhat disagree,Somewhat agree,Disagree,...,"Yes, dizzy, fatigued sore muscles",Not really different from yesterday,Install exhibition,Could not get open signals to work in PM,,no alcohol,,,,
4,27/06/2024 01:17:04,Johanna,Disagree,Somewhat disagree,Disagree,Disagree,Somewhat agree,Somewhat agree,Somewhat agree,Agree,...,,,"cycled to work 20min\nworked 2.5 hours, lunch ...",Wasn't able to conduct a measurement in the ev...,,no alcohol,,,,


In [12]:
# drop unnecessary column
#df = df.drop('X',axis=1)

In [13]:
# Parse the submission time. The export writes day-first dates
# (e.g. '24/06/2024 23:40:05'); stating that explicitly avoids the parser
# warning and prevents ambiguous dates such as 01/07/2024 from being read
# month-first.
df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)
# Calendar day of the submission (plain datetime.date objects).
df['date'] = df['timestamp'].dt.date

  df['timestamp'] = pd.to_datetime(df['timestamp'])


In [14]:
# Adjust the survey date for known entry errors.
is_amanda = df['person'] == 'Amanda'
is_johanna = df['person'] == 'Johanna'

# Amanda, 17th of July: this survey is for the 16.07. (forgot to press SUBMIT)
df.loc[(df['date'] == datetime.date(2024, 7, 17)) & is_amanda, 'date'] = datetime.date(2024, 7, 16)

# Amanda, 18th of July: this entry is for the 17th, not the 18th
df.loc[(df['timestamp'] == datetime.datetime(2024, 7, 18, 12, 20, 48)) & is_amanda, 'date'] = datetime.date(2024, 7, 17)

# Amanda, 21st of July
df.loc[(df['timestamp'] == datetime.datetime(2024, 7, 21, 11, 1, 52)) & is_amanda, 'date'] = datetime.date(2024, 7, 21)

# Johanna often took the survey after midnight: entries submitted before 20:00
# are attributed to the previous day. The corrected date is derived from the
# timestamp rather than from the current 'date' value, so re-running this cell
# does not shift the dates a second time.
# NOTE(review): the cut-off covers every submission before 20:00, not only
# those shortly after midnight - confirm this is intended.
johanna_before_8pm = is_johanna & (df['timestamp'].dt.hour < 20)
df.loc[johanna_before_8pm, 'date'] = (
    df.loc[johanna_before_8pm, 'timestamp'] - datetime.timedelta(days=1)
).dt.date


In [15]:
# Convert the CIS items, the Hooper items and the extra questions from Likert
# labels to a numeric 1-7 scale.

agree_likert_scale = {'strongly disagree': 1,
                'disagree': 2,
                'somewhat disagree': 3,
                'neither agree or disagree': 4,
                'somewhat agree': 5,
                'agree': 6,
                'strongly agree': 7}

quality_likert_scale = {'very poor': 1,
                'poor': 2,
                'below average': 3,
                'average': 4,
                'above average': 5,
                'good': 6,
                'excellent': 7}

intensity_likert_scale = {'very low': 1,
                'low': 2,
                'below average': 3,
                'average': 4,
                'above average': 5,
                'high': 6,
                'very high': 7}


def _likert_to_numeric(frame, scale):
    """Return ``frame`` with Likert labels replaced by their numeric score.

    Parameters
    ----------
    frame : pd.DataFrame
        Columns holding answer labels (any capitalisation, surrounding
        whitespace tolerated), numbers and/or missing values.
    scale : dict
        Maps the lower-case label to its numeric score.

    Returns
    -------
    pd.DataFrame
        Same rows and columns as ``frame``. Labels found in ``scale`` become
        their score, values that are already numeric are kept, and anything
        else becomes NaN.
    """
    def encode(answer):
        # Only strings are labels; numbers and NaN pass through untouched so
        # the conversion can safely be run more than once.
        if isinstance(answer, str):
            label = answer.strip().lower()
            return scale.get(label, label)
        return answer

    # Series.map is used per column instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return frame.apply(lambda column: pd.to_numeric(column.map(encode), errors='coerce'))


# fetch columns to change
items_agree = [col for col in df.columns if col.startswith('cis')] + ['hooper_muscles_sore','extra_muscles_tired','muscles_sore_expected','muscles_tired_expected']
items_quality = ['hooper_sleep_quality']
items_intensity = ['hooper_fatigue_level','hooper_stress_level','fatigue_expected']

# convert to numeric
df[items_agree] = _likert_to_numeric(df[items_agree], agree_likert_scale)
df[items_quality] = _likert_to_numeric(df[items_quality], quality_likert_scale)
df[items_intensity] = _likert_to_numeric(df[items_intensity], intensity_likert_scale)

In [16]:
# CIS subscales: each score is the row-wise mean of the items in the subscale.
# NOTE(review): no item is reverse-scored before averaging - confirm this
# matches the scoring rules of the questionnaire version that was used.
cis_subscale_items = {
    'cis_subjective_fatigue': (1, 4, 6, 9, 12, 14, 16, 20),
    'cis_concentration': (3, 8, 11, 13, 19),
    'cis_motivation': (2, 5, 15, 18),
    'cis_physical_activity': (7, 10, 17),
}

for subscale, item_numbers in cis_subscale_items.items():
    item_columns = [f'cis_{number}' for number in item_numbers]
    df[subscale] = df[item_columns].mean(axis=1)

In [17]:
# Map the alcohol answers to an ordinal 0-3 score.
alcohol_dict = {'no alcohol': 0,
            'one light alcoholic drink (e.g. beer, wine)': 1,
            'two light alcoholic drinks or one strong alcoholic drink (e.g. cocktail, shot)': 2,
             'more than three alcoholic drinks':3
            }
# Only string answers are translated; values that are already numeric are kept.
# Mapping the whole column through the dict would turn those numbers into NaN
# and then 0, wiping the column whenever this cell is executed a second time.
alcohol_scores = df['alcohol'].map(
    lambda answer: alcohol_dict.get(answer, answer) if isinstance(answer, str) else answer
)
# Missing answers, and answers not listed in alcohol_dict, end up as 0.
df['alcohol'] = pd.to_numeric(alcohol_scores, errors='coerce').fillna(0)

In [18]:
# Sleep duration in minutes, built from the hour and minute components of the
# parsed 'sleep_hours' value.
# NOTE(review): the string format of 'sleep_hours' is not visible here; pandas
# has to guess it - confirm and pass an explicit format if possible.
sleep_parsed = pd.to_datetime(df['sleep_hours'])
df['sleep_time'] = sleep_parsed.dt.hour * 60 + sleep_parsed.dt.minute

  df['sleep_time'] = pd.to_datetime(df['sleep_hours']).dt.hour*60 +pd.to_datetime(df['sleep_hours']).dt.minute
  df['sleep_time'] = pd.to_datetime(df['sleep_hours']).dt.hour*60 +pd.to_datetime(df['sleep_hours']).dt.minute


In [19]:
def split_physical_activity(cell):
    """Split a comma-separated activity string into a list of trimmed entries.

    Anything that is not a string (e.g. a missing answer) yields None.
    Empty entries are kept as empty strings.
    """
    if not isinstance(cell, str):
        return None
    return [entry.strip() for entry in cell.split(',')]

# Turn each comma-separated activity answer into a list of activity labels;
# split_physical_activity returns None for cells that are not strings.
df['physical_activity_list'] = df['physical_activity'].map(split_physical_activity)

In [20]:
# Flatten the per-entry activity lists into one long Series of labels.
# Iterating over the column values (instead of df.loc[i] for i in
# range(len(df))) does not rely on the frame having a default 0..n-1 index,
# and the isinstance check skips missing answers whether they are stored as
# None or as NaN.
activities_list = pd.Series(
    [
        activity
        for activities in df['physical_activity_list']
        if isinstance(activities, list)
        for activity in activities
    ]
)


In [21]:
# Fold free-text variants into the canonical activity labels.
canonical_activity = {
    'Spinning Class': 'High Intensive Interval Training',
    'Boxing Class': 'High Intensive Interval Training',
    '8 hours of manual labour (smashing rocks)': 'Manual Labour',
    'N/A manual labour': 'Manual Labour',
    'Biketrip ~40km': 'Cycling for 30min or longer',
    'Hiking ~7h': 'Hiking',
}
for variant, canonical in canonical_activity.items():
    activities_list[activities_list == variant] = canonical

In [22]:
# Tally how often each activity label occurs.
activities_counts = activities_list.value_counts()
# From here on `activities_list` holds only the labels seen more than once.
occurs_repeatedly = activities_counts.gt(1)
activities_list = activities_counts.loc[occurs_repeatedly].index

In [23]:
# Display how often each activity label was reported.
activities_counts

Walking for 30min or longer           23
Cycling for 30min or longer           13
Ball Sports                            7
Weight Lifting                         7
High Intensive Interval Training       6
Running                                6
                                       3
Hiking                                 2
Manual Labour                          2
Yoga                                   2
Nil                                    1
Physio session 1 hour                  1
Garden work + walking in the woods     1
Install manual labour                  1
Dancing with my niece 20 minutes       1
Minimal was at airports all day        1
Bodywork Class                         1
Upper Body Workout                     1
Studio                                 1
not much activity                      1
N/a                                    1
Full Body Workout with Weights         1
Swimming for 30min or longer           1
Name: count, dtype: int64

In [24]:
# Display the labels that occurred more than once (at this point
# `activities_list` is the index selected from `activities_counts`).
activities_list.unique()

Index(['Walking for 30min or longer', 'Cycling for 30min or longer',
       'Ball Sports', 'Weight Lifting', 'High Intensive Interval Training',
       'Running', '', 'Hiking', 'Manual Labour', 'Yoga'],
      dtype='object')

In [25]:
def activity_in_list(row, activity):
    """Return 1 if ``activity`` occurs in ``row``, otherwise 0.

    Parameters
    ----------
    row : list or None
        Activity labels of one survey entry; None (or NaN) when the entry
        has no activity answer.
    activity : str
        Label to look for.

    Returns
    -------
    int
        1 when ``activity`` is contained in ``row``, else 0. The function
        always returns an int; it never falls through and returns None.
    """
    # Missing answers: None, or a float NaN (NaN is the only value != itself).
    if row is None or (isinstance(row, float) and row != row):
        return 0
    return 1 if activity in row else 0
# One indicator column per activity that was listed more than once:
# 1 if the entry lists the activity, 0 otherwise.
for activity in activities_list:
    listed_flags = df['physical_activity_list'].apply(activity_in_list, args=(activity,))
    df[activity] = listed_flags.fillna(0)


In [26]:
# Give the activity indicator columns short snake_case names.
# NOTE(review): 'HITT' looks like a typo for 'HIIT'; it is kept because the
# feature export later in this notebook selects the column under that name.
df = df.rename(columns={
    'Walking for 30min or longer': 'walking',
    'Cycling for 30min or longer': 'cycling',
    'Ball Sports': 'ball_sports',
    'Weight Lifting': 'weight_lifting',
    'Running': 'running',
    'Yoga': 'yoga',
    'High Intensive Interval Training': 'HITT',
    'Manual Labour': 'manual_labour',
    'Hiking': 'hiking',
})
# Empty activity entries produce an indicator column named ''. It only exists
# when such entries occurred more than once (and is gone after the first run
# of this cell), so its absence is ignored instead of raising a KeyError.
df = df.drop(columns='', errors='ignore')

In [27]:
# Next-entry ("lead") fatigue per person: shift(-1) pulls the value from the
# person's following row, so each person's last row gets NaN.
# This relies on the rows being in chronological order within each person.
for fatigue_col in ('cis_subjective_fatigue', 'hooper_fatigue_level'):
    df[f'{fatigue_col}_lead1'] = df.groupby('person')[fatigue_col].shift(-1)

In [48]:
# get max number of activities on a given day
#df[['walking','cycling','ball_sports','weight_lifting','running','yoga','spinning']].sum(axis=1).max()

KeyError: "['spinning'] not in index"

In [28]:
# Write the full pre-processed frame (all columns, index included) to disk.
pre_processed_csv = '/Users/htr365/Documents/Side_Projects/09_founding_lab/amanda_johanna/quantified_self.nosync/pipeline/survey_data/pre_processed_survey.csv'
df.to_csv(path_or_buf=pre_processed_csv)

In [25]:
# Inspect the column labels of the pre-processed frame.
# NOTE(review): the stored output of this cell lists a 'spinning' column that
# no cell shown here creates - it appears to come from an earlier run;
# re-execute the notebook top to bottom to refresh it.
df.columns

Index(['timestamp', 'person', 'cis_1', 'cis_2', 'cis_3', 'cis_4', 'cis_5',
       'cis_6', 'cis_7', 'cis_8', 'cis_9', 'cis_10', 'cis_11', 'cis_12',
       'cis_13', 'cis_14', 'cis_15', 'cis_16', 'cis_17', 'cis_18', 'cis_19',
       'cis_20', 'hooper_muscles_sore', 'extra_muscles_tired',
       'hooper_sleep_quality', 'hooper_fatigue_level', 'hooper_stress_level',
       'sleep_hours', 'coffee_count', 'physical_activity', 'step_count',
       'f_illness', 'f_changes', 'f_day_description', 'f_measurement_comments',
       'f_other', 'alcohol', 'fatigue_expected', 'muscles_sore_expected',
       'muscles_tired_expected', 'date', 'cis_subjective_fatigue',
       'cis_concentration', 'cis_motivation', 'cis_physical_activity',
       'sleep_time', 'physical_activity_list', 'walking', 'cycling',
       'ball_sports', 'weight_lifting', 'running', 'yoga', 'spinning',
       'cis_subjective_fatigue_lead1', 'hooper_fatigue_level_lead1'],
      dtype='object')

In [78]:
# Export the modelling features: identifiers, numeric survey items, derived
# scores, activity indicators and the lead targets. The free-text columns,
# 'timestamp', 'X' and 'physical_activity_list' are left out.
feature_columns = (
    ['person', 'date']
    + [f'cis_{item}' for item in range(1, 21)]
    + ['hooper_muscles_sore', 'extra_muscles_tired',
       'hooper_sleep_quality', 'hooper_fatigue_level', 'hooper_stress_level']
    + ['sleep_hours', 'coffee_count', 'physical_activity', 'step_count']
    + ['alcohol', 'fatigue_expected', 'muscles_sore_expected', 'muscles_tired_expected']
    + ['cis_subjective_fatigue', 'cis_concentration', 'cis_motivation', 'cis_physical_activity']
    + ['sleep_time']
    + ['walking', 'cycling', 'ball_sports', 'weight_lifting', 'running', 'yoga',
       'HITT', 'manual_labour', 'hiking']
    + ['cis_subjective_fatigue_lead1', 'hooper_fatigue_level_lead1']
)
features_csv = '/Users/htr365/Documents/Side_Projects/09_founding_lab/amanda_johanna/quantified_self.nosync/pipeline/survey_data/pre_processed_survey_features.csv'
df[feature_columns].to_csv(features_csv)