In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os
from datetime import datetime, timedelta, date

import numpy as np
import pandas as pd

import parseIntervalFiles as pif
import parseActivityFiles as paf
#pun intended :)
import consolidateFiles as cf
import datacleaning as cl

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, the date range and the verbose level

In [2]:
# Verbosity flag for the pipeline.
verbose = True

# Root folder of the data set. The trailing separator matters: the
# sub-folder names below are appended by plain string concatenation.
DATA_PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
# DATA_PATH = "/home/ju/GDrive/Projects/HeRV/Data/"

# Input folder (raw files) and output folder (pre-processed files).
RAW_PATH, PRE_PATH = (DATA_PATH + folder for folder in ("Raw", "PreProcessed"))

# Time window of the sessions to extract: fixed start date, open end (now).
dt1 = datetime(year=2017, month=9, day=28)
dt2 = datetime.now()

## 1 - Extract sessions from raw data (intervals + annotations files)

### 1.1 - List activities with session start/stop datetime
Parses the activity files and prints all errors found (activities without start or stop, invalid dates, etc.).

In [3]:
# Gather the sessions of every user (ids 0..6) into a single flat list.
sessions = []
for user in range(7):
    # First pass: build the user's sessions from the activity files.
    print("------- parsing activity files for user ", user, " -------")
    user_sess = cf.get_user_sessions(
        user, dt1, dt2, RAW_PATH, verbose=False
    )
    # Second pass: enrich those sessions using the interval files.
    print("------- parsing interval files for user ", user, " -------")
    user_sess = cf.sessions_add_beats(
        user_sess, RAW_PATH, verbose=False
    )
    sessions += user_sess

------- parsing activity files for user  0  -------
------- parsing interval files for user  0  -------
------- parsing activity files for user  1  -------
------- parsing interval files for user  1  -------
------- parsing activity files for user  2  -------
------- parsing interval files for user  2  -------
------- parsing activity files for user  3  -------
------- parsing interval files for user  3  -------
------- parsing activity files for user  4  -------
------- parsing interval files for user  4  -------
------- parsing activity files for user  5  -------
------- parsing interval files for user  5  -------
------- parsing activity files for user  6  -------
------- parsing interval files for user  6  -------


In [4]:
# Tabular view of the collected session records
df = pd.DataFrame(data=sessions)

### Include column with beats count for ease of use 

In [5]:
# Length of each session's 'rr' series, stored as its own column
df['beatscount'] = df['rr'].map(len)

In [6]:
# Peek at two randomly chosen sessions
df.sample(n=2)

Unnamed: 0,activity,duration,notes,posture,rr,start,stop,user,beatscount
497,rest-passive,1080,taxi,sit,"[{'date': 2017-12-27 12:52:01, 'interval': 943...",2017-12-27 12:52:00,2017-12-27 13:10:00,5,1174
410,leisure-passive,900,,lie,"[{'date': 2017-10-11 13:32:01, 'interval': 907...",2017-10-11 13:32:00,2017-10-11 13:47:00,2,1173


## 2 - Cleaning data

### 2.1 Removing outliers and artifacts

(Note: For now, beats won't be saved to the output file. So, when the sessions file is loaded and the
beats are fetched from the DB again, for example for fragmentation, the outliers and artifacts will have to be removed again. But this is a reasonably low-cost operation.)

In [7]:
# Run the cleaning routine over every session's RR series, then store the result
cleaned_rr = df['rr'].apply(cl.clean_rr_series)
df['rr'] = cleaned_rr

In [8]:
# Beat count of each session after cleaning
df['beatscount_clean'] = df['rr'].map(len)

In [9]:
# Beats dropped by the cleaning step = count before minus count after
df['removed_artifacts'] = df['beatscount'].sub(df['beatscount_clean'])
# From here on 'beatscount' holds the cleaned count; the helper column is dropped
df['beatscount'] = df['beatscount_clean']
df = df.drop(columns=['beatscount_clean'])

In [10]:
# Summary statistics of the numeric columns after artifact removal
df.describe()

Unnamed: 0,duration,user,beatscount,removed_artifacts
count,532.0,532.0,532.0,532.0
mean,3043.43609,1.18797,3361.99812,19.199248
std,6849.224136,1.647462,6319.132273,57.303876
min,0.0,0.0,0.0,0.0
25%,625.25,0.0,670.0,0.0
50%,1195.5,0.0,1507.5,0.0
75%,2234.5,2.0,3205.0,9.0
max,80732.0,6.0,46856.0,591.0


### 2.2 Removing sessions without beats recorded or with too few of them

In [11]:
# A session is kept only if it has more than this many (cleaned) beats.
MIN_BEATS = 100

n_before = len(df)
# .copy() detaches the filtered frame from the original, so the label fixes
# applied to `df` later on assign into a real frame, not into a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[df['beatscount'] > MIN_BEATS].copy()
print(n_before - len(df), ' sessions out of', n_before, 'removed for lack of interval data')

44  sessions out of 532 removed for lack of interval data


### 2.3 Removing wrong activities and grouping the ones with too few examples

In [12]:
# Number of sessions (non-null durations) per activity label
df.groupby('activity')['duration'].count()

activity
commute               3
eat                  47
exercise-high        19
exercise-low          5
focused-active      105
focused-passive      48
household-chores     30
leisure-active        4
leisure-passive      46
movement             54
not-recorded          4
rest-active          27
rest-passive         18
sleep                34
train-baseline       12
train-breathe        16
train-focus          16
Name: duration, dtype: int64

In [13]:
# Relabel activities through one explicit mapping. The previous
# `df.at[mask, 'activity'] = ...` form relied on `.at` accepting a boolean
# mask, but `.at` is a scalar-only accessor; `Series.replace` with a dict
# performs the same exact-match substitutions.
ACTIVITY_RELABEL = {
    # we have too few exercise sessions to classify intensity
    'exercise-low': 'exercise',
    'exercise-high': 'exercise',
    # also we have too few leisure active sessions
    'leisure-active': 'leisure',
    'leisure-passive': 'leisure',
    # these were just misspelled (note the trailing spaces in the keys)
    'focused-passive ': 'focused-passive',
    'focused-active ': 'focused-active',
    'moving': 'movement',
}
df['activity'] = df['activity'].replace(ACTIVITY_RELABEL)

# and these are not categorized
# (.copy() so that later column assignments act on an independent frame)
df = df[df['activity'] != 'not-recorded'].copy()

In [14]:
# Sessions per activity label after relabelling
df.groupby('activity')['duration'].count()

activity
commute               3
eat                  47
exercise             24
focused-active      105
focused-passive      48
household-chores     30
leisure              50
movement             54
rest-active          27
rest-passive         18
sleep                34
train-baseline       12
train-breathe        16
train-focus          16
Name: duration, dtype: int64

In [15]:
# Number of sessions (non-null durations) per posture label
df.groupby('posture')['duration'].count()

posture
lie            37
lying down     14
sit           273
sitting        53
stand          81
standing       26
Name: duration, dtype: int64

In [16]:
# correcting diverse spellings in postures
# (done with a mapping + Series.replace: `.at` is a scalar-only accessor and
# must not be used with boolean masks)
POSTURE_RELABEL = {
    'lying down': 'lie',
    'sitting': 'sit',
    'standing': 'stand',
    'standing ': 'stand',  # variant with a trailing space
}
df['posture'] = df['posture'].replace(POSTURE_RELABEL)

In [17]:
# Sessions per posture label after the spelling fixes
df.groupby('posture')['duration'].count()

posture
lie       51
sit      326
stand    107
Name: duration, dtype: int64

## 3 - Aggregating data

In [18]:
# Work on plain per-session records so each one can be extended in place
dic = df.to_dict(orient='records')
for session in dic:
    # Merge the features derived from this session's RR series into the record
    session_features = cf.features_from_dic(session['rr'])
    session.update(session_features)
# Spot check: 'rmssd' value of the first session
print(dic[0]['rmssd'])

239.316870462


In [20]:
# Back to a DataFrame, now including the feature columns; show two random rows
df = pd.DataFrame(data=dic)
df.sample(n=2)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,rr,sdnn,start,stop,total_power,user,vlf
402,focused-active,1713,1440,743.667299,30.097725,1727.174903,2.32251,69.902275,73.71871,824.269119,...,sit,0,42.193909,"[{'date': 2017-10-30 14:08:00, 'interval': 821...",88.688059,2017-10-30 14:07:00,2017-10-30 14:31:00,3955.046988,2,1484.204786
75,rest-active,375,270,837.726244,36.232724,1474.344569,1.759936,63.767276,82.616601,732.248,...,sit,1,47.052678,"[{'date': 2017-10-14 18:41:31, 'interval': 707...",66.722907,2017-10-14 18:41:30,2017-10-14 18:46:00,3650.866471,0,1338.795658


In [21]:
# Expose the row index as an explicit session-id column
df['sess_id'] = df.index

In [22]:
# Export frame: everything except the 'rr' column.
df_export = df.drop(columns=['rr'])
# Only the last expression of a cell is displayed, so the bare
# `df_export.sample(1)` that preceded this line had no visible effect and
# was dropped.
df_export.describe()

Unnamed: 0,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,pnn50,removed_artifacts,rmssd,sdnn,total_power,user,vlf,sess_id
count,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0,484.0
mean,3668.192149,2921.464876,1889.967914,40.707411,1639.95823,2.208712,59.292589,88.084332,719.915459,847.438017,15.929233,21.0,61.686185,85.07374,4708.694617,1.235537,1178.768472,241.5
std,6529.407654,5857.589297,3621.520731,17.859961,1728.754108,2.181079,17.859961,18.699766,131.361043,2568.624824,15.081087,59.750135,49.369763,41.86438,5878.129793,1.67502,1034.925166,139.863028
min,105.0,88.0,14.709334,5.716321,36.995074,0.172079,14.681505,54.190501,368.977202,0.0,0.0,0.0,5.402758,21.083352,150.076646,0.0,52.162541,0.0
25%,951.5,725.5,331.464097,27.364273,593.771793,0.822263,45.12317,76.10274,646.987098,51.75,5.111446,0.0,31.104931,57.82865,1569.029033,0.0,483.727786,120.75
50%,1714.5,1258.5,609.345694,37.861523,1149.164425,1.641211,62.138477,83.622505,728.134056,188.5,11.900859,1.0,45.705088,74.35115,2977.145911,0.0,916.468465,241.5
75%,3536.75,2270.25,1304.282538,54.87683,1923.512912,2.654479,72.635727,94.271131,802.770606,428.5,21.226555,11.0,71.926649,97.353373,4719.36678,2.0,1533.214681,362.25
max,46856.0,34980.0,22414.537885,85.318495,10846.43129,16.493768,94.283679,164.397688,1144.581425,20895.0,81.780538,591.0,290.138931,253.78725,36726.950587,6.0,6926.126266,483.0


In [23]:
# Join the folder and file name with os.path.join instead of a hard-coded
# "\\", so the export also lands in PRE_PATH when DATA_PATH points at the
# Linux location. On Windows the resulting path is unchanged.
df_export.to_excel(os.path.join(PRE_PATH, "sessions.xlsx"))