In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [2]:
# Verbosity switch for the pipeline
verbose = True

# Base data directory, with the input (Raw) and output (PreProcessed) directories derived from it
PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH, PRE_PATH = PATH + "Raw", PATH + "PreProcessed"

# Spreadsheet with the session records, stored in the pre-processed directory
sessfile = PRE_PATH + "\\sessions.xlsx"

# Fragmentation parameters, all expressed in seconds:
#   crop      - cropped from the beginning of each session to account for
#               stabilization and user adjustment
#   duration  - length of each fragment to be sent to analysis
#   threshold - a fragment with more than this many consecutive seconds with
#               no beats is meant to be discarded
crop, duration, threshold = 90, 300, 3

## 1 - Read sessions

---

In [3]:
# Load the session records from the spreadsheet configured in `sessfile`
df = pd.read_excel(io=sessfile)

# Show five randomly chosen rows as a quick sanity check
df.sample(n=5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
54,rest-passive,2155,1430,1689.127782,55.982722,1328.10277,0.786265,44.017278,92.065717,671.328538,...,standing,55,89.739275,104.30519,2017-10-12 19:21:58,2017-10-12 19:45:48,3769.145476,0,751.914924,54
140,rest-active,1432,1309,670.118718,54.882421,550.889226,0.822077,45.117579,89.880226,676.74581,...,sit,4,65.972778,67.077057,2017-11-15 11:31:43,2017-11-15 11:53:32,1737.940659,0,516.932716,140
214,eat,1779,1257,1274.20293,31.422387,2780.877014,2.182444,68.577613,84.909073,715.817313,...,sit,5,72.32839,87.077767,2017-10-29 19:02:22,2017-10-29 19:23:19,5197.081527,1,1142.001583,214
318,movement,1227,720,41.935333,15.442253,229.626931,5.475739,84.557747,100.62047,599.827221,...,stand,1,18.800923,45.874815,2017-10-06 17:48:00,2017-10-06 18:00:00,850.658983,2,579.096719,318
38,rest-active,1451,1121,1493.242274,55.198024,1212.003592,0.811659,44.801976,76.764375,790.315644,...,sitting,0,61.357545,84.070051,2017-10-10 19:49:33,2017-10-10 20:08:14,3844.011782,0,1138.765916,38


In [4]:
# One plain dict per session row, keyed by column name
sessions = df.to_dict('records')
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sitting', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks each session's duration into fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [5]:
# Split the sessions into fragments of `duration` seconds, discarding the first `crop` seconds of each session
frags = cf.fragment_sessions(sessions, duration, crop)

410 valid sessions out of 447 total (at least one full fragment of 300 seconds after discarding first 90 seconds)
Wall time: 15.6 ms


In [6]:
# Number of fragments produced, followed by the first fragment as an example
print(len(frags), frags[0], sep='\n')

4305
{'start': Timestamp('2017-09-29 12:26:58'), 'stop': Timestamp('2017-09-29 12:31:58'), 'activity': 'focused-active', 'posture': 'sitting', 'user': 0, 'sess': 0, 'order': 0}


## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals contained in each fragment's time span and adds them to the fragment objects (in memory)

In [7]:
def fragall(frags, path):
    """Attach the beat intervals to every fragment, in place.

    For each fragment in `frags`, the value returned by
    `cf.beats_in_fragment(fragment, path)` is stored under the fragment's
    'rr' key. The fragments are modified directly and nothing is returned.
    """
    for fragment in frags:
        beats = cf.beats_in_fragment(fragment, path)
        fragment['rr'] = beats

In [None]:
# Fill in the 'rr' entry of every fragment using the files under RAW_PATH; %time reports how long the call takes
%time fragall(frags, RAW_PATH)

In [None]:
# Tabulate the fragments and record how many intervals each one holds
df = pd.DataFrame(frags).assign(beatcount=lambda table: table['rr'].apply(len))
df.describe()

### 3.2 Remove outliers from RR series

In [None]:
# Replace each RR series with its cleaned version ...
df = df.assign(rr=df['rr'].apply(cl.clean_rr_series))
# ... then recount the beats, since cleaning may have changed the series length
df = df.assign(beatcount=df['rr'].apply(len))
df.describe()

### 3.3 Remove fragments with too few beats (due to hardware malfunction or the software not recording beats)

In [None]:
# Keep only fragments whose beat count exceeds 0.6 beats per second of fragment duration
df = df.loc[df['beatcount'].gt(0.6 * duration)]
df.describe()

In [None]:
# Spot-check four random fragments that survived the filtering
df.sample(n=4)

## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: `df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr'])), axis=1)`

In [None]:
# Work on plain dict records so the computed features can be merged into each fragment
dic = df.to_dict('records')
for record in dic:
    features = cf.features_from_dic(record['rr'])
    record.update(features)

# Quick check: one of the features computed for the first fragment
print(dic[0]['rmssd'])

In [None]:
# Back to a DataFrame, now carrying the feature values alongside the fragment fields
df = pd.DataFrame(data=dic)
df.sample(n=3)

### removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** it would be better to remove the cause itself, by splitting the interval series into continuous sequences

In [None]:
# Keep only the fragments whose HF value is below 15000
dfr = df.loc[df['hf'].lt(15000)]
print(len(df), 'original and', len(dfr), 'after pruning')

## 5 - Save

---

In [None]:
# The raw interval lists are not exported; drop the 'rr' column before saving
df_output = dfr.drop('rr', axis='columns')
df_output.describe()

In [None]:
# Output file name encodes the fragment duration and the crop, in that order
filename = ''.join([PRE_PATH, '\\df_', str(duration), '_', str(crop), '.xlsx'])
print(filename)
df_output.to_excel(filename)

# Applying all steps above to generate different datasets

In [None]:
# Sweep every crop (cr) / fragment duration (dr) combination and write one spreadsheet per pair
for cr in (30, 60, 90):
    for dr in (60, 90, 120, 150, 180, 240, 300):
        # Same naming scheme as the single-dataset save: df_<duration>_<crop>.xlsx
        fname = ''.join([PRE_PATH, '\\df_', str(dr), '_', str(cr), '.xlsx'])
        print('generating', fname, '...')
        ds = fr.gen_fragments_dataset(sessions, dr, cr, RAW_PATH)
        print('resulting dataset:', len(ds), 'records')
        ds.to_excel(fname)