In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

import parseIntervalFiles as pif
import parseActivityFiles as paf
#pun intended :)
import consolidateFiles as cf
import datacleaning as cl

## ! The activity-file and interval-file parsers are to be replaced with the corresponding database queries as soon as those are available

## 0 - Pipeline configuration 
* Set the input/output directories and the verbosity flag (user ids are iterated over in step 1.1)

In [2]:
verbose = True

# Root data directory. Override it per machine with the HERV_DATA_PATH environment
# variable instead of editing this cell, e.g. "/home/ju/GDrive/Projects/HeRV/Data/".
DATA_PATH = os.environ.get("HERV_DATA_PATH", "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\")

# os.path.join copes with DATA_PATH given with or without a trailing separator.
RAW_PATH = os.path.join(DATA_PATH, "Raw")
PRE_PATH = os.path.join(DATA_PATH, "PreProcessed")

## 1 - Extract sessions from raw data (intervals + annotations files)

### 1.1 - List activities with session start/stop datetime
Parses the activity files and prints all errors found (activities without a start or stop, invalid dates, etc.)

In [3]:
# User ids 0 .. N_USERS-1 are processed.
N_USERS = 7

# Accumulates the sessions of every user, each extended with its beats.
sessions = []
for user in range(N_USERS):
    print ("------- parsing activity files for user ", user, " -------")
    user_sess = paf.get_user_sessions(user, dirname=RAW_PATH)
    print ("------- parsing interval files for user ", user, " -------")
    # Per-user directory RAW_PATH/<user id>; os.path.join picks the platform's
    # separator, so this also works with a non-Windows DATA_PATH.
    user_dir = os.path.join(RAW_PATH, str(user))
    user_sess = cf.sessions_add_beats(sessions=user_sess, dirname=user_dir, verbose=False)
    sessions.extend(user_sess)

------- parsing activity files for user  0  -------
reading act170929.csv ... 
reading act170930.csv ... 
reading act171001.csv ... 
reading act171003.csv ... 
reading act171004.csv ... 
reading act171005.csv ... 
orphan start in: ['2017-10-05 12:54:00', 'start', 'eat', 'sitting', '']
orphan start in: ['2017-10-05 13:48:39', 'start', 'rest-active', 'sitting', '']
orphan start in: ['2017-10-05 13:57:37', 'start', 'movement', 'standing', '']
orphan start in: ['2017-10-05 17:55:44', 'start', 'focused-active', 'sitting', '']
orphan start in: ['2017-10-05 21:53:45', 'start', 'rest-active', 'sitting', '']
orphan start in: ['2017-10-05 23:47:45', 'start', 'focused-active', 'sitting', 'sitting on bed, installing and using pandas']
reading act171006.csv ... 
reading act171010.csv ... 
reading act171011.csv ... 
reading act171012.csv ... 
orphan stop in: ['2017-10-12 14:09:16', 'stop', '', '']
reading act171013.csv ... 
reading act171014.csv ... 
orphan stop in: ['2017-10-14 2:14:21', 'stop', ''

In [4]:
# One row per parsed session.
# NOTE(review): presumably each element of `sessions` is a dict whose keys become the columns - confirm.
df = pd.DataFrame(sessions)

### Include column with beats count for ease of use 

In [5]:
# Number of entries in each session's 'rr' sequence, before any cleaning.
df['beatscount'] = df['rr'].map(len)

In [6]:
# Eyeball a handful of random sessions.
df.sample(n=5)

Unnamed: 0,activity,duration,notes,posture,rr,start,stop,user,beatscount
40,focused-passive,10409,,sitting,"[{'date': 2017-10-05 11:08:12, 'interval': 770...",2017-10-05 11:08:11,2017-10-05 14:01:40,0,10954
457,leisure-passive,1590,,sit,"[{'date': 2017-12-21 11:02:10, 'interval': 562...",2017-12-21 11:02:05,2017-12-21 11:28:35,4,1484
352,focused-passive,900,,sit,"[{'date': 2017-10-06 15:54:01, 'interval': 672...",2017-10-06 15:54:00,2017-10-06 16:09:00,2,1331
441,focused-active,2461,,sit,"[{'date': 2017-11-01 10:52:10, 'interval': 893...",2017-11-01 10:52:09,2017-11-01 11:33:10,3,2696
90,movement,3100,Mercado,standing,"[{'date': 2017-10-14 15:38:21, 'interval': 480...",2017-10-14 15:38:20,2017-10-14 16:30:00,0,5440


## 2 - Cleaning data

### 2.1 Removing outliers and artifacts

(Note: for now, beats are not saved to the output file. So when the sessions file is loaded and the beats are fetched from the DB again,
for example for fragmentation, the outliers and artifacts will have to be removed again. This is a reasonably low-cost operation, though.)

In [7]:
# Replace each session's 'rr' sequence with the result of cl.clean_rr_series.
# NOTE(review): presumably this drops outlier/artifact beats, as the section title says - confirm in datacleaning.
# The column is overwritten in place, so re-running this cell feeds already-cleaned data to the cleaner.
df['rr'] = df['rr'].apply(cl.clean_rr_series)

In [8]:
# Length of each session's 'rr' sequence after cleaning.
df['beatscount_clean'] = df['rr'].map(len)

In [9]:
# Record how many beats the cleaning step removed, promote the cleaned count
# to 'beatscount', and discard the temporary 'beatscount_clean' column.
df = (
    df.assign(
        removed_artifacts=df['beatscount'] - df['beatscount_clean'],
        beatscount=df['beatscount_clean'],
    )
    .drop(columns='beatscount_clean')
)

In [10]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

Unnamed: 0,duration,user,beatscount,removed_artifacts
count,509.0,509.0,509.0,509.0
mean,3126.332024,1.298625,3535.013752,21.011788
std,6650.273623,1.660453,6347.479887,59.930733
min,0.0,0.0,0.0,0.0
25%,840.0,0.0,925.0,0.0
50%,1293.0,1.0,1618.0,1.0
75%,2347.0,2.0,3459.0,11.0
max,80732.0,6.0,46856.0,591.0


### 2.2 Removing sessions without beats recorded or with too few of them

In [11]:
# Sessions must have more than this many (cleaned) beats to be kept.
MIN_BEATS = 100

n_before = len(df)
# .copy() detaches the filtered rows, so later column assignments work on an
# independent frame rather than on a slice of the unfiltered one.
df = df[df['beatscount'] > MIN_BEATS].copy()
print(n_before - len(df), ' sessions out of', n_before, 'removed for lack of interval data')

44  sessions out of 509 removed for lack of interval data


### 2.3 Removing wrong activities and grouping the ones with too few examples

In [12]:
# Sessions per activity label (non-null durations), before relabelling.
df.groupby('activity')['duration'].count()

activity
eat                  52
exercise-high        18
exercise-low          3
focused-active      104
focused-passive      44
household-chores     30
leisure-active        7
leisure-passive      47
movement             54
moving                1
not-recorded          4
rest-active          27
rest-passive         21
sleep                53
Name: duration, dtype: int64

In [13]:
# Show the session(s) labelled 'moving'.
df.loc[df['activity'].eq('moving')]

Unnamed: 0,activity,duration,notes,posture,rr,start,stop,user,beatscount,removed_artifacts
56,moving,3350,,standing,"[{'date': 2017-10-11 12:30:11, 'interval': 825...",2017-10-11 12:30:10,2017-10-11 13:26:00,0,5622,5


In [16]:
# Relabelling of the raw activity annotations (old label -> new label).
ACTIVITY_RELABEL = {
    # we have too few exercise sessions to classify intensity
    'exercise-low': 'exercise',
    'exercise-high': 'exercise',
    # also we have too few leisure active sessions
    'leisure-active': 'leisure',
    'leisure-passive': 'leisure',
    # these were just misspelled
    'focused-passive ': 'focused-passive',
    'focused-active ': 'focused-active',
    'moving': 'movement',
}

# DataFrame.at addresses a single scalar cell and must not be given a boolean mask.
# Replacing the values on a freshly assigned column does the same relabelling and
# never writes into a filtered slice.
df = df.assign(activity=df['activity'].replace(ACTIVITY_RELABEL))

# and these are not categorized
df = df[df['activity'] != 'not-recorded'].copy()

In [17]:
# Sessions per activity label after relabelling.
df.groupby('activity')['duration'].count()

activity
eat                  52
exercise             21
focused-active      104
focused-passive      44
household-chores     30
leisure              54
movement             55
rest-active          27
rest-passive         21
sleep                53
Name: duration, dtype: int64

In [18]:
# Sessions per posture label, before the spellings are unified.
df.groupby('posture')['duration'].count()

posture
lie            56
lying down     13
sit           229
sitting        59
stand          76
standing       28
Name: duration, dtype: int64

In [19]:
# correcting diverse spellings in postures (old label -> canonical label)
POSTURE_RELABEL = {
    'lying down': 'lie',
    'sitting': 'sit',
    'standing': 'stand',
    'standing ': 'stand',
}

# DataFrame.at is a scalar accessor and must not be given a boolean mask;
# replace the values on a freshly assigned column instead.
df = df.assign(posture=df['posture'].replace(POSTURE_RELABEL))

In [20]:
# Sessions per posture label after the spellings are unified.
df.groupby('posture')['duration'].count()

posture
lie       69
sit      288
stand    104
Name: duration, dtype: int64

## 3 - Aggregating data

In [21]:
# One dict per session; each is extended in place with whatever
# cf.features_from_dic returns for that session's 'rr' data.
dic = df.to_dict(orient='records')
for session in dic:
    session_features = cf.features_from_dic(session['rr'])
    session.update(session_features)
# Sanity check: 'rmssd' of the first session.
print(dic[0]['rmssd'])

239.316870462


In [22]:
# Rebuild the frame from the enriched records (this also gives it a fresh default index).
df = pd.DataFrame(data=dic)
df.sample(n=3)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,rr,sdnn,start,stop,total_power,user,vlf
84,focused-active,3778,2520,305.186086,26.886796,829.891848,2.719298,73.113204,88.344377,682.902859,...,sit,0,27.574666,"[{'date': 2017-10-15 01:45:01, 'interval': 656...",51.561401,2017-10-15 01:45:00,2017-10-15 02:27:00,1653.704861,0,518.626927
85,focused-active,6348,4800,1528.723805,54.983944,1251.585649,0.818713,45.016056,79.682906,761.684152,...,sit,4,64.131928,"[{'date': 2017-10-15 03:10:01, 'interval': 655...",83.602937,2017-10-15 03:10:00,2017-10-15 04:30:00,4054.153933,0,1273.84448
249,focused-active,179,155,257.181505,11.586033,1962.57318,7.631082,88.413967,70.619274,865.832402,...,sit,2,62.112122,"[{'date': 2017-11-03 04:01:47, 'interval': 875...",86.812499,2017-11-03 04:01:46,2017-11-03 04:04:21,2505.310594,1,285.55591


In [23]:
# Keep the row index as an explicit session id column.
df['sess_id'] = df.index
# The exported table omits the per-beat 'rr' column.
df_export = df.drop(columns='rr')
df_export.sample(n=1)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
266,exercise,5130,2024,174.773225,46.128068,204.113713,1.167877,53.871932,152.419203,402.589279,...,stand,9,26.999776,72.48284,2018-03-16 09:50:13,2018-03-16 10:23:57,517.693607,1,138.806669,266


In [24]:
# Summary statistics of the numeric columns that will be exported.
df_export.describe()

Unnamed: 0,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,pnn50,removed_artifacts,rmssd,sdnn,total_power,user,vlf,sess_id
count,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0,461.0
mean,3874.579176,3086.715835,2015.540541,41.499584,1680.658589,1.954732,58.500416,87.277752,730.570081,904.195228,17.226556,23.104121,64.26114,89.860336,5063.921021,1.357918,1367.721892,230.0
std,6559.932023,5786.969465,3741.637048,17.010719,1681.182632,1.549343,17.010719,19.200477,142.305542,2599.271879,15.367034,62.576614,50.432512,43.13956,6110.490364,1.687514,1232.403252,133.223496
min,105.0,88.0,14.709334,8.241489,36.995074,0.172079,14.681505,54.190501,368.977202,0.0,0.0,0.0,5.402758,21.083352,150.076646,0.0,52.162541,0.0
25%,1102.0,900.0,370.564885,28.448897,654.85672,0.826091,45.238227,74.723575,641.662587,81.0,5.806938,0.0,32.724557,60.613749,1737.940659,0.0,564.166878,115.0
50%,1748.0,1332.0,698.063816,39.088649,1213.115738,1.558287,60.911351,83.047036,735.205128,241.0,13.483146,1.0,47.938118,78.048331,3175.450416,1.0,1017.30721,230.0
75%,3725.0,2405.0,1514.425399,54.761773,1983.360171,2.515075,71.551103,95.145346,817.078411,498.0,24.602122,13.0,74.83586,106.309723,5226.285317,2.0,1711.820497,345.0
max,46856.0,34360.0,22414.537885,85.318495,12243.363436,11.13373,91.758511,164.397688,1144.581425,20895.0,81.780538,591.0,290.138931,253.78725,36726.950587,6.0,7350.357547,460.0


In [25]:
# Write the per-session table (without raw beats) to PRE_PATH/sessions.xlsx.
# os.path.join uses the platform's separator instead of a hard-coded backslash.
df_export.to_excel(os.path.join(PRE_PATH, "sessions.xlsx"))