In [34]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
import parseIntervalFiles as pif
import parseActivityFiles as paf
#pun intended :)
import consolidateFiles as cf
import datacleaning as cl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [57]:
# Verbosity flag for the pipeline
verbose = True

# Root data folder, with its raw and pre-processed sub-folders
PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH, PRE_PATH = PATH + "Raw", PATH + "PreProcessed"

# Spreadsheet holding the session list
sessfile = PRE_PATH + "\\sessions.xlsx"

# Seconds cropped from the beginning of each session, to account for
# stabilization and user adjustment
crop = 90

# Length, in seconds, of each fragment to be sent to analysis
duration = 300

# A fragment with more than `threshold` consecutive seconds without beats is to be discarded
threshold = 3

## 1 - Read sessions

---

In [4]:
# Load the sessions spreadsheet into a DataFrame and preview five randomly chosen rows
df = pd.read_excel(sessfile)
df.sample(5)

Unnamed: 0,activity,duration,notes,posture,start,stop,user,beatscount,removed_artifacts,sess_id
246,focused-passive,900,,sit,2017-10-05 09:16:00,2017-10-05 09:31:00,2,1254,0,246
176,focused-active,258,,sit,2017-10-30 02:11:05,2017-10-30 02:15:23,1,291,0,176
274,leisure-passive,960,,sit,2017-10-08 13:43:00,2017-10-08 13:59:00,2,1174,0,274
144,focused-active,1139,,sit,2017-11-07 12:08:50,2017-11-07 12:27:49,0,1509,12,144
191,leisure-passive,1640,,sit,2017-11-01 17:50:00,2017-11-01 18:17:20,1,2227,7,191


In [5]:
# One dict per spreadsheet row (column name -> value); print the first one as a sanity check
sessions = df.to_dict(orient='records')
print(sessions[0])

{'activity': 'focused-active', 'duration': 2975, 'notes': nan, 'posture': 'sitting', 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'user': 0, 'beatscount': 2862, 'removed_artifacts': 25, 'sess_id': 9}


## 2 - Generate fragments from sessions
---

### 2.1 - Break each session's duration into fragments
Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 



In [6]:
# Try the fragmentation on the first session only: report how many fragments
# come out of it and show the first one
first_session = sessions[0]
frags0 = cf.frags_session(first_session, 0, 30)
print(len(frags0), frags0[0], sep='\n')

99
{'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 12:25:58'), 'activity': 'focused-active', 'posture': 'sitting', 'user': 0, 'sess': 9, 'order': 0}


### 2.2 - extracts the intervals for each fragment
Retrieves from the heartbeat files all the intervals contained in each session's duration and adds them to the fragments objects (in memory)


In [7]:
# Fragment every session using the configured fragment duration and initial crop
frags = cf.fragment_sessions(sessions, duration, crop)

341 valid sessions out of 372 total (at least one full fragment of 300 seconds after discarding first 90 seconds)


In [8]:
# Inspect the first fragment in frags
print(frags[0])

{'start': Timestamp('2017-09-29 12:26:58'), 'stop': Timestamp('2017-09-29 12:31:58'), 'activity': 'focused-active', 'posture': 'sitting', 'user': 0, 'sess': 9, 'order': 0}


## 3 - Add interval data to fragments and clean it
---

### 3.1 Extract beats in fragment

In [31]:
# Store on each fragment, under the 'rr' key, the beats retrieved for it from RAW_PATH
for fragment in frags:
    fragment.update(rr=cf.beats_in_fragment(fragment, RAW_PATH))

In [41]:
# One row per fragment, plus a 'beatcount' column holding the length of its RR list
df = pd.DataFrame(frags).assign(beatcount=lambda frame: frame['rr'].apply(len))
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,3162.0,3162.0,3162.0,3162.0
mean,21.828906,192.372549,1.577166,370.31246
std,26.860934,137.774717,2.029806,125.543852
min,0.0,9.0,0.0,0.0
25%,2.0,61.0,0.0,307.25
50%,8.0,156.0,0.0,372.0
75%,36.0,347.0,2.0,426.0
max,113.0,408.0,6.0,916.0


### 3.2 Remove outliers from RR series

In [42]:
# Run every fragment's RR series through cl.clean_rr_series, then recount the beats
cleaned_rr = df['rr'].apply(cl.clean_rr_series)
df['rr'] = cleaned_rr
df['beatcount'] = cleaned_rr.apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,3162.0,3162.0,3162.0,3162.0
mean,21.828906,192.372549,1.577166,368.230867
std,26.860934,137.774717,2.029806,124.409389
min,0.0,9.0,0.0,0.0
25%,2.0,61.0,0.0,306.0
50%,8.0,156.0,0.0,371.0
75%,36.0,347.0,2.0,425.0
max,113.0,408.0,6.0,912.0


### 3.3 Remove fragments with too few beats (due to hardware malfunction or the software not recording beats)

In [43]:
# Keep only fragments with more than 0.6 beats per second of fragment duration
# (i.e. an average above 36 beats per minute)
min_beats = 0.6 * duration
has_enough_beats = df['beatcount'] > min_beats
df = df[has_enough_beats]
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,3005.0,3005.0,3005.0,3005.0
mean,21.06223,191.497837,1.553411,385.821631
std,26.160555,136.958868,2.015311,99.472651
min,0.0,9.0,0.0,183.0
25%,2.0,64.0,0.0,317.0
50%,8.0,155.0,0.0,377.0
75%,34.0,339.0,2.0,427.0
max,113.0,408.0,6.0,912.0


In [44]:
# Spot-check five randomly chosen fragments
df.sample(5)

Unnamed: 0,activity,order,posture,rr,sess,start,stop,user,beatcount
129,rest-passive,0,sitting,"[{'date': 2017-09-30 10:11:31, 'interval': 589...",17,2017-09-30 10:11:30,2017-09-30 10:16:30,0,595
732,rest-passive,5,sitting,"[{'date': 2017-10-11 11:36:22, 'interval': 109...",54,2017-10-11 11:36:21,2017-10-11 11:41:21,0,310
390,focused-passive,22,sitting,"[{'date': 2017-10-05 11:26:14, 'interval': 805...",39,2017-10-05 11:26:13,2017-10-05 11:31:13,0,401
2493,sleep,18,lie,"[{'date': 2017-12-21 01:59:50, 'interval': 764...",369,2017-12-21 01:59:49,2017-12-21 02:04:49,4,388
1714,leisure-passive,13,lie,"[{'date': 2017-11-01 01:42:58, 'interval': 962...",188,2017-11-01 01:42:57,2017-11-01 01:47:57,1,326


In [29]:
# NOTE(review): this cell's execution count (29) is lower than that of the cells before it,
# and its saved output reports 3162 rows, identical to the unfiltered describe() in 3.1,
# whereas the filter in 3.3 left 3005 rows. The output is stale: re-run after a kernel
# restart, or remove the cell.
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,3162.0,3162.0,3162.0,3162.0
mean,21.828906,192.372549,1.577166,370.31246
std,26.860934,137.774717,2.029806,125.543852
min,0.0,9.0,0.0,0.0
25%,2.0,61.0,0.0,307.25
50%,8.0,156.0,0.0,372.0
75%,36.0,347.0,2.0,426.0
max,113.0,408.0,6.0,916.0


## 4 - Extract time and frequency domain features
---

In [None]:
### TODO do it using apply: df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr'])), axis=1)

In [53]:
# Turn each fragment row into a dict and merge into it the features computed
# from its RR series; print one feature of the first record as a sanity check
dic = df.to_dict(orient='records')
for record in dic:
    features = cf.features_from_dic(record['rr'])
    record.update(features)
print(dic[0]['rmssd'])

280.25107608


In [54]:
# Rebuild the DataFrame from the records in dic, which now also carry the extracted features
# NOTE(review): the saved output lists five rows although sample(3) is requested;
# the output looks stale and should be regenerated
df = pd.DataFrame(dic)
df.sample(3)

Unnamed: 0,activity,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,...,posture,rmssd,rr,sdnn,sess,start,stop,total_power,user,vlf
1935,rest-active,447,192.231818,28.905314,472.807909,2.459572,71.094686,87.997066,684.836689,14,...,sit,21.594874,"[{'date': 2017-09-30 10:54:31, 'interval': 699...",44.965534,240,2017-09-30 10:54:30,2017-09-30 10:59:30,1092.098528,2,427.0588
92,sleep,279,25559.970646,82.3802,5466.866735,0.213884,17.6198,60.216886,1075.351254,226,...,lying down,258.089139,"[{'date': 2017-09-30 06:39:15, 'interval': 923...",241.961615,15,2017-09-30 06:39:14,2017-09-30 06:44:14,40559.111864,0,9532.274483
1335,leisure-passive,468,172.191002,37.729489,284.192075,1.650447,62.270511,91.830414,655.378205,7,...,sit,20.208814,"[{'date': 2017-11-02 15:52:32, 'interval': 663...",36.149288,132,2017-11-02 15:52:31,2017-11-02 15:57:31,661.089225,0,204.706149
772,movement,491,90.876517,18.441423,401.908207,4.422575,81.558577,96.641535,623.040733,2,...,standing,13.9352,"[{'date': 2017-10-12 20:47:29, 'interval': 612...",37.44147,66,2017-10-12 20:47:28,2017-10-12 20:52:28,740.387929,0,247.603205
2909,sleep,264,2083.700783,56.682625,1592.383003,0.764209,43.317375,53.809461,1154.867424,140,...,lie,76.769345,"[{'date': 2017-12-31 05:17:12, 'interval': 115...",182.547461,405,2017-12-31 05:17:10,2017-12-31 05:22:10,15012.319442,6,11336.235655


## removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

### TODO it is best to actually remove the cause by separating continuous sequences in the interval 

In [64]:
# Number of fragments before the HF filter
# NOTE(review): the saved output (2786) is lower than the 3005 rows left by step 3.3, and no
# visible cell removes rows in between; confirm where the difference comes from
len(df)

2786

In [66]:
# Keep only the fragments whose 'hf' value is below 10000, then report how many remain
hf_below_cutoff = df['hf'] < 10000
df = df[hf_below_cutoff]
print(len(df))
df.describe()

2569


Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0,2569.0
mean,398.920202,1474.701685,44.166343,1603.767425,1.990931,55.833657,78.826999,804.007774,72.769171,15.441806,20.420812,58.797604,78.286654,212.385364,4499.158987,1.776956,1420.689877
std,100.089144,1956.590624,20.370869,2287.248729,1.898729,20.370869,16.809966,164.795702,56.472625,20.843202,17.21087,39.14369,39.080109,133.840987,4722.866569,2.05239,1740.26942
min,183.0,0.575115,5.44825,2.567078,0.027289,2.656368,44.980479,334.412281,0.0,0.0,0.0,3.369205,13.625541,9.0,33.900642,0.0,17.395014
25%,339.0,330.323307,26.940924,495.707873,0.662149,39.836934,68.106733,698.990868,26.0,2.0,6.185567,31.111862,50.738878,79.0,1542.547662,0.0,433.968058
50%,390.0,706.013274,42.265324,1040.921389,1.366006,57.734676,77.645588,780.785714,62.0,6.0,16.097561,46.726057,68.870437,199.0,2863.835434,1.0,878.843079
75%,436.0,1675.762289,60.163066,1945.1021,2.711825,73.059076,86.848047,892.385965,108.0,22.0,30.952381,75.549329,95.664181,364.0,5489.066701,4.0,1650.106616
max,912.0,9991.472718,97.343632,28512.744605,17.354518,94.55175,180.011248,1343.444934,293.0,113.0,79.661017,243.998523,253.568023,408.0,37020.756543,6.0,17066.914795


## 5 - Save

---

In [67]:
# The saved table leaves out the per-fragment RR lists
df_output = df.drop(columns=['rr'])

In [68]:
# The output file name encodes the crop and fragment duration used for this run
filename = '{}\\df_{}_{}.xlsx'.format(PRE_PATH, crop, duration)
print(filename)
df_output.to_excel(filename)

C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_90_300.xlsx
