In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

import parseIntervalFiles as pif
import parseActivityFiles as paf
#pun intended :)
import consolidateFiles as cf
import datacleaning as cl

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [2]:
# Pipeline configuration: verbosity flag and the data directories used below.
verbose = True

# Root data folder (Windows layout); the Linux alternative is kept for reference.
DATA_PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
# DATA_PATH = "/home/ju/GDrive/Projects/HeRV/Data/"

# Input (raw) and output (pre-processed) folders, both directly under DATA_PATH.
RAW_PATH, PRE_PATH = (DATA_PATH + subdir for subdir in ("Raw", "PreProcessed"))

## 1 - Extract sessions from raw data (intervals + annotations files)

### 1.1 - List activities with session start/stop datetime
Parses the activity files and prints all errors found (activities without a start or stop, invalid dates, etc.)

In [33]:
# Build one flat list with the sessions of every user: the activity files are
# parsed first, then the interval (beat) data is attached to each session.
N_USERS = 7  # user ids 0..6, one sub-folder of RAW_PATH per user

sessions = []
for user in range(N_USERS):
    print("------- parsing activity files for user ", user, " -------")
    user_sess = paf.get_user_sessions(user, dirname=RAW_PATH)
    print("------- parsing interval files for user ", user, " -------")
    # os.path.join picks the platform separator, so the per-user folder is also
    # resolved correctly when the Linux DATA_PATH alternative is enabled.
    user_dir = os.path.join(RAW_PATH, str(user))
    user_sess = cf.sessions_add_beats(sessions=user_sess, dirname=user_dir, verbose=False)
    sessions.extend(user_sess)

------- parsing activity files for user  0  -------
reading act170929.csv ... 
reading act170930.csv ... 
reading act171001.csv ... 
reading act171003.csv ... 
reading act171004.csv ... 
reading act171005.csv ... 
orphan start in: ['2017-10-05 12:54:00', 'start', 'eat', 'sitting', '']
orphan start in: ['2017-10-05 13:48:39', 'start', 'rest-active', 'sitting', '']
orphan start in: ['2017-10-05 13:57:37', 'start', 'movement', 'standing', '']
orphan start in: ['2017-10-05 17:55:44', 'start', 'focused-active', 'sitting', '']
orphan start in: ['2017-10-05 21:53:45', 'start', 'rest-active', 'sitting', '']
orphan start in: ['2017-10-05 23:47:45', 'start', 'focused-active', 'sitting', 'sitting on bed, installing and using pandas']
reading act171006.csv ... 
reading act171010.csv ... 
orphan start in: ['2017-10-10 23:23:37', 'start', 'leisure-passive', 'sitting']
reading act171011.csv ... 
orphan stop in: ['2017-10-11 0:04:41', 'stop', '', '', '']
orphan start in: ['2017-10-11 12:30:10', 'start

In [34]:
# One row per parsed session, one column per session field.
df = pd.DataFrame(data=sessions)

### Include column with beats count for ease of use 

In [35]:
# Number of entries in each session's 'rr' list.
df['beatscount'] = df['rr'].map(len)

In [36]:
# Eyeball a few random sessions.
df.sample(n=5)

Unnamed: 0,activity,duration,notes,posture,rr,start,stop,user,beatscount
300,sleep,1440,,lie,"[{'date': 2017-09-30 00:16:01, 'interval': 940...",2017-09-30 00:16:00,2017-09-30 00:40:00,2,1684
148,eat,314,,sit,"[{'date': 2017-11-07 15:35:02, 'interval': 560...",2017-11-07 15:35:00,2017-11-07 15:40:14,0,550
10,leisure-active,520,,sitting,"[{'date': 2017-09-29 13:20:01, 'interval': 136...",2017-09-29 13:20:00,2017-09-29 13:28:40,0,484
295,rest-passive,840,,lie,"[{'date': 2018-01-21 17:21:22, 'interval': 775...",2018-01-21 17:21:21,2018-01-21 17:35:21,1,943
438,focused-active,1687,,sit,"[{'date': 2017-11-01 13:25:30, 'interval': 920...",2017-11-01 13:25:29,2017-11-01 13:53:36,3,2062


## 2 - Cleaning data

### 2.1 Removing outliers and artifacts

(Note: For now, beats won't be saved to the output file. So when the sessions file is loaded and the beats are fetched
from the DB again (for example, for fragmentation), outliers and artifacts will have to be removed again. This is a reasonably low-cost operation, though.)

In [37]:
# Run every session's beat series through the cleaning routine from datacleaning.
df = df.assign(rr=df['rr'].apply(cl.clean_rr_series))

In [38]:
# Beat count of each session after cleaning.
df['beatscount_clean'] = df['rr'].map(len)

In [39]:
# Record how many beats the cleaning step dropped, then keep only the
# post-cleaning count under the 'beatscount' name.
df = (
    df.assign(
        removed_artifacts=df['beatscount'] - df['beatscount_clean'],
        beatscount=df['beatscount_clean'],
    )
    .drop(columns=['beatscount_clean'])
)

### 2.2 Removing sessions without beats recorded or with too few of them

In [40]:
# Keep only sessions with more than 100 beats and report how many were dropped.
sessions_before = len(df)
has_enough_beats = df['beatscount'].gt(100)
df = df[has_enough_beats]
print(sessions_before - len(df), ' sessions removed for lack of interval data')

44  sessions removed for lack of interval data


In [41]:
# Summary statistics of the numeric columns after cleaning and filtering.
df.describe()

Unnamed: 0,duration,user,beatscount,removed_artifacts
count,451.0,451.0,451.0,451.0
mean,3128.152993,1.277162,3921.585366,24.210643
std,5742.669455,1.576462,6507.836257,63.918994
min,88.0,0.0,105.0,0.0
25%,900.0,0.0,1095.5,0.0
50%,1380.0,1.0,1765.0,1.0
75%,2456.0,2.0,3765.5,14.0
max,34360.0,6.0,46856.0,591.0


### 2.3 Removing wrong activities and grouping the ones with too few examples

In [42]:
# Merge the sparse exercise labels into a single class and fix the labels that
# were recorded with a trailing space.
ACTIVITY_RENAMES = {
    'exercise-low': 'exercise',
    'exercise-high': 'exercise',
    'focused-passive ': 'focused-passive',
    'focused-active ': 'focused-active',
}
# DataFrame.at is a scalar accessor and does not take a boolean mask; an exact-match
# replace does the relabelling in one pass. assign() returns a new frame, so
# nothing is written into the filtered slice produced by the earlier cells.
df = df.assign(activity=df['activity'].replace(ACTIVITY_RENAMES))

# Sessions explicitly annotated as not recorded are dropped.
df = df[df['activity'] != 'not-recorded']

In [43]:
# Number of sessions (non-null durations) per activity label.
df.groupby('activity')['duration'].count()

activity
eat                 52
exercise            18
focused-active      96
focused-passive     44
household-chores    28
leisure-active       7
leisure-passive     46
movement            54
rest-active         27
rest-passive        22
sleep               53
Name: duration, dtype: int64

## 3 - Aggregating data

In [44]:
# Turn each row into a dict and merge in whatever cf.features_from_dic returns
# for that session's beat series.
dic = df.to_dict(orient='records')
for session in dic:
    session_features = cf.features_from_dic(session['rr'])
    session.update(session_features)
# Print one computed value for the first session.
print(dic[0]['rmssd'])

239.316870462


In [45]:
# Rebuild the frame from the enriched records (this yields a fresh 0..n-1 index).
df = pd.DataFrame(data=dic)
df.sample(n=3)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,rr,sdnn,start,stop,total_power,user,vlf
47,focused-passive,1947,1594,320.329992,33.848465,626.034921,1.954344,66.151535,91.512172,662.275295,...,sitting,1,24.692601,"[{'date': 2017-10-11 17:34:01, 'interval': 696...",65.115,2017-10-11 17:34:00,2017-10-11 18:00:34,1594.858765,0,648.493852
164,focused-active,14805,13176,1937.548615,45.106077,2357.989239,1.216996,54.893923,76.00817,798.919824,...,sit,8,75.800488,"[{'date': 2017-12-11 17:42:10, 'interval': 539...",93.484978,2017-12-11 17:15:27,2017-12-11 20:55:03,6172.834636,0,1877.296782
36,leisure-passive,1422,2814,1771.410303,55.384541,1426.973719,0.805558,44.615459,81.572208,749.16737,...,sitting,11,81.883888,"[{'date': 2017-10-10 18:45:59, 'interval': 740...",98.578054,2017-10-10 18:45:58,2017-10-10 19:32:52,4892.141225,0,1693.757204


In [46]:
# Use the row index as the session id, then build the export frame without the
# raw beat lists.
df = df.assign(sess_id=df.index)
df_export = df.drop(columns=['rr'])
df_export.sample(n=1)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
413,sleep,14958,14400,490.823684,57.276835,366.108588,0.745907,42.723165,73.023149,829.112782,...,lie,25,37.816694,70.582418,2017-12-22 01:00:00,2017-12-22 05:00:00,1356.283061,4,499.35079,413


In [47]:
# Summary statistics of the numeric columns that will be exported.
df_export.describe()

Unnamed: 0,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,pnn50,removed_artifacts,rmssd,sdnn,total_power,user,vlf,sess_id
count,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0
mean,3929.136465,3135.742729,2076.606347,41.820272,1696.639613,1.945711,58.179728,87.017901,732.207716,936.527964,17.482592,24.328859,65.238661,91.288731,5155.553577,1.270694,1382.307617,223.0
std,6523.669749,5759.877461,3788.306174,17.106085,1693.45552,1.600287,17.106085,18.629056,141.434246,2637.051462,15.443321,64.167324,50.994049,43.621539,6176.744041,1.582022,1241.811412,129.182042
min,105.0,88.0,13.635122,8.241489,36.995074,0.172079,14.681505,54.190501,396.424172,0.0,0.0,0.0,5.402758,21.083352,150.076646,0.0,98.372239,0.0
25%,1105.0,900.0,370.45051,28.811216,676.607628,0.818759,45.017463,74.645886,645.873295,86.0,5.974706,0.0,32.717216,62.53031,1747.995499,0.0,575.94761,111.5
50%,1768.0,1426.0,717.281722,39.494741,1218.595371,1.531983,60.505259,83.139116,733.686578,243.0,13.684211,1.0,48.366812,78.552601,3207.787111,1.0,1028.660179,223.0
75%,3765.5,2456.0,1623.96673,54.982537,1972.831213,2.470887,71.188784,94.921518,817.468789,529.0,24.99545,14.0,77.078708,108.487086,5258.892691,2.0,1716.335536,334.5
max,46856.0,34360.0,22414.537885,85.318495,12243.363436,11.13373,91.758511,155.541355,1144.581425,20895.0,81.780538,591.0,290.138931,253.78725,36726.950587,6.0,7350.357547,446.0


In [48]:
# Write the per-session table to the pre-processed data folder.
# os.path.join keeps the path valid on both Windows and Linux.
df_export.to_excel(os.path.join(PRE_PATH, "sessions.xlsx"))