In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os

import pandas as pd

import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr

## Note: the activity-file and interval-file parsing steps are to be replaced by the corresponding database queries as soon as those are available

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [10]:
# Verbosity flag for the pipeline (none of the cells visible here read it).
verbose = True

# Root data directory. The original machine-specific location stays the default;
# set the HERV_DATA_PATH environment variable to run the notebook elsewhere.
PATH = os.environ.get("HERV_DATA_PATH", "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\")
# os.path.join yields the same strings as plain concatenation on Windows and
# also copes with a PATH that has no trailing separator.
RAW_PATH = os.path.join(PATH, "Raw")
PRE_PATH = os.path.join(PATH, "PreProcessed")

# Workbook holding the per-session table read in step 1.
sessfile = os.path.join(PRE_PATH, "sessions.xlsx")

# duration (in seconds) to be cropped from the beginning of each session to account for stabilization and user adjustment
crop = 30

# duration (in seconds) of each fragment to be sent to analysis
duration = 60

# if any fragment has more than 'threshold' consecutive seconds with no beats, it will be discarded
# (none of the cells visible here apply this value)
threshold = 3

## 1 - Read sessions

---

In [11]:
# Load the sessions workbook configured above (one row per recorded session).
df = pd.read_excel(sessfile)
# Preview five random rows; no random_state is given, so the rows shown differ on every run.
df.sample(5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
90,focused-active,3253,2738,6473.12273,61.154796,4111.693472,0.635195,38.845204,74.433553,832.804796,...,sitting,11,138.306105,154.765788,2017-10-15 21:19:52,2017-10-15 22:05:30,14110.706626,0,3525.890424,90
237,focused-active,1913,1528,424.785132,18.423927,1880.831595,4.427725,81.576073,74.167713,817.078411,...,sit,1,48.286621,81.006877,2017-11-01 18:49:14,2017-11-01 19:14:42,4017.437225,1,1711.820497,237
19,sleep,27837,26160,15803.053319,79.900869,3975.271449,0.251551,20.099131,65.975786,955.910048,...,lying down,185,190.212028,203.053604,2017-10-04 00:44:00,2017-10-04 08:00:00,21842.918036,0,2064.593267,19
61,household-chores,6818,4191,291.456094,27.144197,782.276509,2.684029,72.855803,102.986775,588.313582,...,standing,10,33.293869,60.359268,2017-10-13 11:38:33,2017-10-13 12:48:24,1847.792964,0,774.060361,61
229,focused-active,927,744,321.951406,24.904036,970.81661,3.015413,75.095964,73.384829,821.813376,...,sit,0,31.998836,57.984979,2017-10-31 13:21:03,2017-10-31 13:33:27,2267.520903,1,974.752887,229


In [12]:
# One plain dict per session row; this list is what the fragmentation steps below consume.
sessions = df.to_dict(orient='records')
# Print the first record to show the available keys.
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sitting', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks each session's duration into fragments.

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [15]:
# Split the sessions into fragments using the configured fragment `duration` and initial `crop` (both in seconds).
frags = cf.fragment_sessions(sessions, duration, crop)

446 valid sessions out of 447 total (at least one full fragment of 60 seconds after discarding first 30 seconds)


In [16]:
# Number of fragments produced, then the first fragment to show the fields each one carries.
print(len(frags))
print(frags[0])

22924
{'start': Timestamp('2017-09-29 12:25:58'), 'stop': Timestamp('2017-09-29 12:26:58'), 'activity': 'focused-active', 'posture': 'sitting', 'user': 0, 'sess': 0, 'order': 0}


## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals that fall within each fragment's time span and adds them to the fragment objects (in memory).

In [17]:
def fragall(frags, path):
    """Attach the beat data of every fragment to the fragment itself.

    For each fragment in ``frags`` the result of
    ``cf.beats_in_fragment(fragment, path)`` is stored under the fragment's
    ``'rr'`` key. The fragments are modified in place and nothing is returned.
    A ``<index> / <total>`` progress line is printed every 100 fragments.

    Parameters:
        frags: sized iterable of fragment dicts.
        path: location handed unchanged to ``cf.beats_in_fragment``.
    """
    total = len(frags)
    for position, fragment in enumerate(frags):
        # progress marker on fragment 0, 100, 200, ...
        if not position % 100:
            print(position, '/', total)
        fragment['rr'] = cf.beats_in_fragment(fragment, path)

In [18]:
%time fragall(frags, RAW_PATH)

0 / 22924
100 / 22924
200 / 22924
300 / 22924
400 / 22924
500 / 22924
600 / 22924
700 / 22924
800 / 22924
900 / 22924
1000 / 22924
1100 / 22924
1200 / 22924
1300 / 22924
1400 / 22924
1500 / 22924
1600 / 22924
1700 / 22924
1800 / 22924
1900 / 22924
2000 / 22924
2100 / 22924
2200 / 22924
2300 / 22924
2400 / 22924
2500 / 22924
2600 / 22924
2700 / 22924
2800 / 22924
2900 / 22924
3000 / 22924
3100 / 22924
3200 / 22924
3300 / 22924
3400 / 22924
3500 / 22924
3600 / 22924
3700 / 22924
3800 / 22924
3900 / 22924
4000 / 22924
4100 / 22924
4200 / 22924
4300 / 22924
4400 / 22924
4500 / 22924
4600 / 22924
4700 / 22924
4800 / 22924
4900 / 22924
5000 / 22924
5100 / 22924
5200 / 22924
5300 / 22924
5400 / 22924
5500 / 22924
5600 / 22924
5700 / 22924
5800 / 22924
5900 / 22924
6000 / 22924
6100 / 22924
6200 / 22924
6300 / 22924
6400 / 22924
6500 / 22924
6600 / 22924
6700 / 22924
6800 / 22924
6900 / 22924
7000 / 22924
7100 / 22924
7200 / 22924
7300 / 22924
7400 / 22924
7500 / 22924
7600 / 22924
7700 / 2292

In [19]:
# Tabulate the fragments. Note that `df` is rebound here: it held the sessions table until now.
df = pd.DataFrame(frags)
# Number of entries in each fragment's 'rr' list.
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,22924.0,22924.0,22924.0,22924.0
mean,114.772945,200.89583,1.187576,74.295324
std,134.757,138.292613,1.868483,27.105687
min,0.0,0.0,0.0,0.0
25%,14.0,73.0,0.0,62.0
50%,48.0,173.0,0.0,74.0
75%,184.0,320.0,2.0,85.0
max,571.0,446.0,6.0,200.0


### 3.2 Remove outliers from RR series

In [20]:
# Replace each fragment's RR series with the output of cl.clean_rr_series, then recount the beats.
# The 'rr' column is overwritten, so re-running this cell feeds already-cleaned series to the cleaner again.
df['rr'] = df['rr'].apply(cl.clean_rr_series)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,22924.0,22924.0,22924.0,22924.0
mean,114.772945,200.89583,1.187576,73.849546
std,134.757,138.292613,1.868483,26.854082
min,0.0,0.0,0.0,0.0
25%,14.0,73.0,0.0,62.0
50%,48.0,173.0,0.0,74.0
75%,184.0,320.0,2.0,85.0
max,571.0,446.0,6.0,186.0


### 3.3 Remove fragments with too few beats (caused by hardware malfunction or by the software not recording beats)

In [21]:
# Keep only fragments with more than 0.6 beats per second of fragment duration
# (more than 36 beats for the 60-second fragments configured above).
df = df[df['beatcount'] > 0.6 * duration]
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,21610.0,21610.0,21610.0,21610.0
mean,110.63429,200.177418,1.172466,78.168302
std,131.251482,137.784549,1.852767,20.874893
min,0.0,0.0,0.0,37.0
25%,13.0,73.0,0.0,64.0
50%,45.0,176.0,0.0,75.0
75%,176.0,319.0,2.0,85.0
max,571.0,446.0,6.0,186.0


In [22]:
# Spot-check four of the remaining fragments (random rows, different on every run).
df.sample(4)

Unnamed: 0,activity,order,posture,rr,sess,start,stop,user,beatcount
21958,rest-active,8,sit,"[{'date': 2017-12-30 19:19:04, 'interval': 838...",437,2017-12-30 19:19:03,2017-12-30 19:20:03,6,79
7396,household-chores,46,stand,"[{'date': 2017-11-04 14:03:22, 'interval': 440...",114,2017-11-04 14:03:21,2017-11-04 14:04:21,0,128
20191,sleep,94,lie,"[{'date': 2017-12-22 02:34:31, 'interval': 834...",413,2017-12-22 02:34:30,2017-12-22 02:35:30,4,67
17951,household-chores,4,stand,"[{'date': 2017-10-17 21:04:31, 'interval': 691...",357,2017-10-17 21:04:30,2017-10-17 21:05:30,2,104


## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: `df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr'])), axis=1)`

In [23]:
# Turn the fragments table back into records, merge into each record the
# feature dict computed from its RR series, and print one value as a sanity check.
dic = df.to_dict(orient='records')
for record in dic:
    features = cf.features_from_dic(record['rr'])
    record.update(features)
print(dic[0]['rmssd'])

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


229.33003577


In [24]:
# Rebuild the fragments table from the feature-enriched records (`df` is rebound once more).
df = pd.DataFrame(dic)
df.sample(3)

Unnamed: 0,activity,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,...,posture,rmssd,rr,sdnn,sess,start,stop,total_power,user,vlf
8758,movement,117,1628.842284,66.202815,831.53994,0.51051,33.797185,115.215021,526.017094,5,...,stand,87.347799,"[{'date': 2017-11-18 11:01:59, 'interval': 504...",64.479746,147,2017-11-18 11:01:58,2017-11-18 11:02:58,3397.786413,0,937.404188
9647,eat,122,412.495408,57.617482,303.425164,0.735584,42.382518,121.401779,495.942623,2,...,sit,13.783749,"[{'date': 2017-11-24 12:21:14, 'interval': 539...",29.626406,161,2017-11-24 12:21:13,2017-11-24 12:22:13,803.79638,0,87.875808
10896,movement,104,50.744843,18.144755,228.921879,4.511234,81.855245,105.097141,573.096154,0,...,stand,10.460281,"[{'date': 2018-02-01 21:20:46, 'interval': 571...",36.77233,180,2018-02-01 21:20:45,2018-02-01 21:21:45,722.807931,0,443.141209


### Removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** it is best to actually remove the cause by separating continuous sequences in the interval 

In [25]:
# Keep only fragments whose 'hf' value is below 15000; the cutoff is hard-coded here.
# The result gets its own name (dfr), so df itself is left unfiltered.
dfr = df[df['hf'] < 15000]
print(len(df), 'original and', len(dfr), 'after pruning')

21610 original and 19233 after pruning


## 5 - Save

---

In [26]:
# The per-fragment RR lists are left out of the exported table; every other column is kept.
df_output = dfr.drop(['rr'], axis = 1)
df_output.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0,19233.0
mean,80.363334,1773.941653,49.715246,1531.153352,2.214625,50.284754,80.330579,793.353147,15.412052,90.554724,22.114158,61.525357,70.406003,213.207768,4763.583921,1.298393,1458.488916
std,20.802385,2950.548394,25.388744,2840.047921,3.537051,25.388744,19.158152,168.537135,13.509879,113.640431,20.560198,52.131322,45.708567,137.217487,6895.635161,1.902666,3183.156723
min,37.0,0.077152,0.749662,0.173046,0.005172,0.514513,43.512674,327.907104,0.0,0.0,0.0,1.424952,2.100095,0.0,0.980344,0.0,0.0
25%,68.0,225.169651,27.548188,217.729426,0.399618,28.551939,68.75127,694.717647,4.0,12.0,4.878049,26.782073,38.349954,97.0,962.270918,0.0,180.706275
50%,77.0,580.018943,49.822851,596.142299,1.007111,50.177149,77.08786,785.960526,12.0,36.0,16.438356,42.529168,56.484266,188.0,2101.509697,0.0,514.230086
75%,87.0,1604.705984,71.448061,1579.597222,2.630003,72.451812,87.477389,882.623188,24.0,132.0,34.375,79.151921,89.612703,349.0,5157.723253,2.0,1428.327446
max,186.0,14976.717492,99.485487,38797.808704,132.393456,99.250338,183.052548,1382.954545,100.0,571.0,94.444444,431.483256,328.385002,446.0,70185.639553,6.0,61682.58901


In [27]:
# The output name encodes the fragment duration and the crop, e.g. df_60_30.xlsx.
# os.path.join replaces the hard-coded '\\' separator so the cell also works
# off Windows; on Windows the resulting path is unchanged.
filename = os.path.join(PRE_PATH, 'df_' + str(duration) + '_' + str(crop) + '.xlsx')
print(filename)
df_output.to_excel(filename)

C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_60_30.xlsx


# Applying all steps above to generate different datasets

In [43]:
# Wider grid of fragment durations / crops, kept for reference:
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [90, 60, 30]

# Combinations actually generated by the run below.
durations = [180, 150]
crops = [60]

def multifrag(sessions, durations, crops, path_in, path_out):
    """Generate and save one fragment dataset per (crop, duration) combination.

    For every crop in ``crops`` and every duration in ``durations`` (crops in
    the outer loop), the dataset returned by
    ``fr.gen_fragments_dataset(sessions, duration, crop, path_in)`` is written
    with its ``to_excel`` method to ``<path_out>/df_<duration>_<crop>.xlsx``.
    Progress is reported with ``print``; nothing is returned.

    Parameters:
        sessions: session records, passed through to ``fr.gen_fragments_dataset``.
        durations: iterable of fragment durations.
        crops: iterable of crop values.
        path_in: input location, passed through to ``fr.gen_fragments_dataset``.
        path_out: directory (str or path-like) that receives the Excel files.
    """
    for cr in crops:
        for dr in durations:
            # os.path.join instead of a hard-coded '\\' separator: the path is
            # identical on Windows and now also valid on other platforms.
            fname = os.path.join(path_out, 'df_{}_{}.xlsx'.format(dr, cr))
            print('generating', fname, '...')
            ds = fr.gen_fragments_dataset(sessions, dr, cr, path_in)
            print('resulting dataset:', len(ds), 'records')
            ds.to_excel(fname)

Started at 23:15 — 11323 11477 11341

In [44]:
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_180_60.xlsx ...
435 valid sessions out of 447 total (at least one full fragment of 180 seconds after discarding first 60 seconds)
0 / 7427
100 / 7427
200 / 7427
300 / 7427
400 / 7427
500 / 7427
600 / 7427
700 / 7427
800 / 7427
900 / 7427
1000 / 7427
1100 / 7427
1200 / 7427
1300 / 7427
1400 / 7427
1500 / 7427
1600 / 7427
1700 / 7427
1800 / 7427
1900 / 7427
2000 / 7427
2100 / 7427
2200 / 7427
2300 / 7427
2400 / 7427
2500 / 7427
2600 / 7427
2700 / 7427
2800 / 7427
2900 / 7427
3000 / 7427
3100 / 7427
3200 / 7427
3300 / 7427
3400 / 7427
3500 / 7427
3600 / 7427
3700 / 7427
3800 / 7427
3900 / 7427
4000 / 7427
4100 / 7427
4200 / 7427
4300 / 7427
4400 / 7427
4500 / 7427
4600 / 7427
4700 / 7427
4800 / 7427
4900 / 7427
5000 / 7427
5100 / 7427
5200 / 7427
5300 / 7427
5400 / 7427
5500 / 7427
5600 / 7427
5700 / 7427
5800 / 7427
5900 / 7427
6000 / 7427
6100 / 7427
6200 / 7427
6300 / 7427
6400 / 7427
6500 / 7427
6600 / 7427
6700 / 7427
6

  .format(nperseg, input_length))


resulting dataset: 6223 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_150_60.xlsx ...
436 valid sessions out of 447 total (at least one full fragment of 150 seconds after discarding first 60 seconds)
0 / 8962
100 / 8962
200 / 8962
300 / 8962
400 / 8962
500 / 8962
600 / 8962
700 / 8962
800 / 8962
900 / 8962
1000 / 8962
1100 / 8962
1200 / 8962
1300 / 8962
1400 / 8962
1500 / 8962
1600 / 8962
1700 / 8962
1800 / 8962
1900 / 8962
2000 / 8962
2100 / 8962
2200 / 8962
2300 / 8962
2400 / 8962
2500 / 8962
2600 / 8962
2700 / 8962
2800 / 8962
2900 / 8962
3000 / 8962
3100 / 8962
3200 / 8962
3300 / 8962
3400 / 8962
3500 / 8962
3600 / 8962
3700 / 8962
3800 / 8962
3900 / 8962
4000 / 8962
4100 / 8962
4200 / 8962
4300 / 8962
4400 / 8962
4500 / 8962
4600 / 8962
4700 / 8962
4800 / 8962
4900 / 8962
5000 / 8962
5100 / 8962
5200 / 8962
5300 / 8962
5400 / 8962
5500 / 8962
5600 / 8962
5700 / 8962
5800 / 8962
5900 / 8962
6000 / 8962
6100 / 8962
6200 / 8962
6300 / 8962
6400 / 8962
6500 

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


resulting dataset: 7500 records
Wall time: 21min 57s
