In [28]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr
import hervpd as hp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [29]:
verbose = True

PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH = PATH + "Raw"
PRE_PATH = PATH + "PreProcessed"

sessfile = PRE_PATH + "\\sessions.xlsx"
    
# duration (in seconds) to be cropped from the beginning of each second to account for stabilization and user adjustment
crop = 0
    
# duration (in seconds) of each fragment to be sent to analysis
duration = 60
    
# if any fragment has more than 'threshold' consecutive seconds with no beats, it will be discarded
threshold = 3    

## 1 - Read sessions

---

In [30]:
df = pd.read_excel(sessfile)
df.sample(5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
353,focused-passive,9751,6840,780.1057,34.606453,1474.114657,1.889635,65.393547,85.343144,716.607425,...,sit,6,45.026164,97.035724,2017-10-16 09:48:00,2017-10-16 11:42:00,3737.419152,2,1483.198795,353
16,eat,858,571,3780.918702,62.690289,2250.188775,0.595143,37.309711,94.707987,666.45338,...,sit,38,142.113592,129.463351,2017-10-01 13:37:53,2017-10-01 13:47:24,7749.503174,0,1718.395697,16
294,focused-passive,1359,900,115.619983,15.006292,654.85672,5.663871,84.993708,89.328623,675.378219,...,sit,1,19.33011,49.012318,2017-10-05 08:27:00,2017-10-05 08:42:00,1247.793522,2,477.316819,294
184,rest-active,1082,870,1288.719441,54.905541,1058.437925,0.82131,45.094459,73.17015,824.203327,...,sit,0,63.971345,65.122295,2018-02-02 23:35:55,2018-02-02 23:50:25,3351.213573,0,1004.056207,184
221,focused-active,1618,1279,316.352672,26.025679,899.187847,2.842359,73.974321,75.131677,802.743511,...,sit,4,39.155542,55.053482,2017-10-30 00:00:01,2017-10-30 00:21:20,1743.593669,1,528.05315,221


In [31]:
sessions = df.to_dict(orient='records')
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sit', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks the sessions duration in fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [32]:
frags = cf.fragment_sessions(sessions, duration, crop)

461 valid sessions out of 461 total (at least one full fragment of 60 seconds after discarding first 0 seconds)


In [33]:
print(len(frags))
print(frags[0])

23561
{'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 12:26:28'), 'activity': 'focused-active', 'posture': 'sit', 'user': 0, 'sess': 0, 'order': 0}


## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals contained in each session's duration and adds them to the fragments objects (in memory)

In [34]:
def fragall(frags, path):
    for i, f in enumerate(frags):
        if (i % 1000 == 0):
            print (i, '/', len(frags))
        f['rr'] = cf.beats_in_fragment(f, path)

In [35]:
%time fragall(frags, RAW_PATH)

0 / 23561
1000 / 23561
2000 / 23561
3000 / 23561
4000 / 23561
5000 / 23561
6000 / 23561
7000 / 23561
8000 / 23561
9000 / 23561
10000 / 23561
11000 / 23561
12000 / 23561
13000 / 23561
14000 / 23561
15000 / 23561
16000 / 23561
17000 / 23561
18000 / 23561
19000 / 23561
20000 / 23561
21000 / 23561
22000 / 23561
23000 / 23561
Wall time: 30min


In [36]:
df = pd.DataFrame(frags)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,23561.0,23561.0,23561.0,23561.0
mean,115.771402,213.410679,1.323416,74.12491
std,136.374478,143.339116,1.946194,27.062911
min,0.0,0.0,0.0,0.0
25%,14.0,88.0,0.0,62.0
50%,46.0,188.0,0.0,74.0
75%,190.0,356.0,2.0,85.0
max,571.0,460.0,6.0,226.0


### 3.2 Remove outliers from RR series

In [37]:
df['rr'] = df['rr'].apply(cl.clean_rr_series)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,23561.0,23561.0,23561.0,23561.0
mean,115.771402,213.410679,1.323416,73.686686
std,136.374478,143.339116,1.946194,26.826963
min,0.0,0.0,0.0,0.0
25%,14.0,88.0,0.0,62.0
50%,46.0,188.0,0.0,74.0
75%,190.0,356.0,2.0,84.0
max,571.0,460.0,6.0,194.0


### 3.3 Remove fragments with too few beats (due to hardware malfunction or software was not recording beats)

***TODO*** remove using the continuous sequence gaps instead, it is more reliable

In [39]:
df = df[df['beatcount'] > 0.83 * duration]
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,21799.0,21799.0,21799.0,21799.0
mean,110.036148,209.820359,1.258636,78.619111
std,131.896389,141.487188,1.889626,20.517988
min,0.0,0.0,0.0,50.0
25%,13.0,85.0,0.0,65.0
50%,43.0,186.0,0.0,75.0
75%,177.0,350.0,2.0,85.0
max,571.0,460.0,6.0,194.0


## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr']), axis=1)

In [40]:
dic = df.to_dict(orient='records')
for i in dic:
    i.update(cf.features_from_dic(i['rr']))
print(dic[0]['rmssd'])

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


245.242697688


In [41]:
df = pd.DataFrame(dic)
df.sample(3)

Unnamed: 0,activity,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,...,posture,rmssd,rr,sdnn,sess,start,stop,total_power,user,vlf
19037,sleep,77,62.714159,58.216755,45.011115,0.717719,41.783245,78.253504,767.727273,0,...,lie,14.847027,"[{'date': 2018-05-13 01:48:01, 'interval': 762...",28.097404,419,2018-05-13 01:48:00,2018-05-13 01:49:00,597.819257,4,490.093983
15364,eat,87,439.01618,20.833697,1668.224675,3.799916,79.166303,87.117515,693.563218,7,...,sit,27.660525,"[{'date': 2017-09-30 10:10:01, 'interval': 828...",59.743969,290,2017-09-30 10:10:00,2017-09-30 10:11:00,2484.1736,2,376.932745
17555,focused-active,70,971.912828,33.793771,1904.098929,1.959125,66.206229,70.781092,857.242857,20,...,sit,51.842851,"[{'date': 2017-11-01 11:11:10, 'interval': 829...",91.419472,397,2017-11-01 11:11:09,2017-11-01 11:12:09,5778.486034,3,2902.474278


### removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** see item 3.3 above

In [42]:
dfr = df[(df['hf'] < 7000) & (df['hf'] < 7000)]
print(len(df), 'original and', len(dfr), 'after pruning')

21799 original and 18080 after pruning


## 5 - Save

---

In [43]:
df_output = dfr.drop(['rr'], axis = 1)
df_output.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0,18080.0
mean,81.652931,976.900259,48.09811,1164.00134,2.327656,51.90189,81.213346,779.300969,13.005973,82.851604,18.14571,50.587156,62.489087,231.400996,3429.559581,1.481969,1288.657982
std,20.464163,1293.386431,25.199621,2221.362045,3.435238,25.199621,18.755815,155.226821,11.523759,107.520453,16.907996,39.240511,38.110537,140.338819,4897.57828,1.962649,3007.680264
min,50.0,0.075957,1.430015,0.097406,0.009665,0.957283,48.704193,328.786885,0.0,0.0,0.0,1.542725,1.367274,0.0,0.450307,0.0,0.0
25%,70.0,201.804013,26.296313,184.380097,0.4398,30.545914,70.419291,694.0,3.0,11.0,3.797468,24.855696,36.34782,115.0,857.378349,0.0,162.016019
50%,78.0,490.810441,46.887961,503.37237,1.132744,53.112039,77.629258,778.258075,11.0,32.0,13.660287,38.701476,52.518868,205.0,1837.949466,0.0,464.07941
75%,87.0,1144.561272,69.454086,1287.184414,2.802815,73.703687,87.40092,858.405228,20.0,117.0,28.571429,62.48697,79.212288,381.0,3965.542524,2.0,1231.690927
max,194.0,6990.899195,99.042717,37223.644496,68.929361,98.569985,183.005336,1236.42,106.0,571.0,85.185185,297.590882,317.838575,460.0,62379.329036,6.0,53631.486854


In [44]:
filename = PRE_PATH + '\\df_' + str(duration) + '_' + str(crop) + '.xlsx'
print(filename)
df_output.to_excel(filename)

C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_60_0.xlsx


# Applying all steps above to generate different datasets

In [10]:
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [120, 90, 60]

durations = [30]
crops = [30]

def multifrag(sessions, durations, crops, path_in, path_out):
    for cr in crops:
        for dr in durations:
            fname = path_out + '\\df_' + str(dr) + '_' + str(cr) + '.xlsx'
            print ('generating', fname, '...')
            ds = fr.gen_fragments_dataset(sessions, dr, cr, path_in)
            print('resulting dataset:', len(ds), 'records' )
            ds.to_excel(fname)

In [11]:
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_30_30.xlsx ...
458 valid sessions out of 458 total (at least one full fragment of 30 seconds after discarding first 30 seconds)
0 / 47539
1000 / 47539
2000 / 47539
3000 / 47539
4000 / 47539
5000 / 47539
6000 / 47539
7000 / 47539
8000 / 47539
9000 / 47539
10000 / 47539
11000 / 47539
12000 / 47539
13000 / 47539
14000 / 47539
15000 / 47539
16000 / 47539
17000 / 47539
18000 / 47539
19000 / 47539
20000 / 47539
21000 / 47539
22000 / 47539
23000 / 47539
24000 / 47539
25000 / 47539
26000 / 47539
27000 / 47539
28000 / 47539
29000 / 47539
30000 / 47539
31000 / 47539
32000 / 47539
33000 / 47539
34000 / 47539
35000 / 47539
36000 / 47539
37000 / 47539
38000 / 47539
39000 / 47539
40000 / 47539
41000 / 47539
42000 / 47539
43000 / 47539
44000 / 47539
45000 / 47539
46000 / 47539
47000 / 47539


  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


47539 total frags and 37097 kept
resulting dataset: 37097 records
Wall time: 58min 36s


## ( Extra - save files for LDA Grover)

---

In [19]:
dfr[['activity']].to_csv('./classifications')

In [20]:
def get_ints(beats):
    return [beat['interval'] for beat in beats]

dfr['ts'] = dfr['rr'].apply(get_ints)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
dfr.loc[['ts']].sample(3)

KeyError: "None of [['ts']] are in the [index]"

In [None]:
dfr[['ts']].to_csv('./timeseries')

In [None]:
dfr[cl.features_all].to_csv('./features')

In [None]:
dfr.describe()

In [None]:
for a in d.activity.unique()
       df.groupby(column).count()['user']