In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import pandas as pd
import consolidateFiles as cf
import datacleaning as cl
import fragmentation as fr
import hervpd as hp

## ! Parse activity files and parse interval files are to be replaced with the corresponding database queries as soon as they are available 

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [2]:
verbose = True

PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
RAW_PATH = PATH + "Raw"
PRE_PATH = PATH + "PreProcessed"

sessfile = PRE_PATH + "\\sessions.xlsx"
    
# duration (in seconds) to be cropped from the beginning of each second to account for stabilization and user adjustment
crop = 0
    
# duration (in seconds) of each fragment to be sent to analysis
duration = 60
    
# if any fragment has more than 'threshold' consecutive seconds with no beats, it will be discarded
threshold = 3    

## 1 - Read sessions

---

In [3]:
df = pd.read_excel(sessfile)
df.sample(5)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
270,focused-passive,2048,1620,753.913992,31.945305,1606.101025,2.13035,68.054695,75.430523,807.705078,...,sit,3,44.656054,93.173559,2017-10-06 10:32:00,2017-10-06 10:59:00,3926.212754,2,1566.197737,270
179,sleep,35907,30728,16400.934673,74.578251,5590.644032,0.340874,25.421749,77.342784,869.651823,...,lie,390,214.607336,253.78725,2018-02-21 03:07:51,2018-02-21 11:39:59,25042.298185,0,3050.719479,179
96,focused-active,3657,2660,554.427294,41.364241,785.926794,1.417547,58.635759,83.640503,723.609243,...,sit,0,36.974609,67.090064,2017-11-01 05:38:41,2017-11-01 06:23:01,2342.031663,0,1001.677576,96
357,household-chores,1228,900,359.416173,27.518004,946.696618,2.633984,72.481996,80.614712,749.724756,...,stand,0,29.952502,64.313369,2017-11-02 12:38:00,2017-11-02 12:53:00,2035.014958,2,728.902167,357
120,focused-active,457,297,255.196503,28.888913,628.17526,2.461536,71.111087,90.993509,664.214442,...,sit,0,25.601492,58.85488,2017-11-07 17:28:35,2017-11-07 17:33:32,1969.563488,0,1086.191724,120


In [4]:
sessions = df.to_dict(orient='records')
print(sessions[0])

{'activity': 'focused-active', 'beatscount': 2862, 'duration': 2975, 'hf': 16632.76073930055, 'hfnu': 68.59709571934512, 'lf': 7614.272702685149, 'lf_hf': 0.457787665080388, 'lfnu': 31.40290428065488, 'mhr': 60.34981965901419, 'mrri': 1061.744933612858, 'nn50': 2234, 'notes': nan, 'pnn50': 78.08458580915763, 'posture': 'sit', 'removed_artifacts': 25, 'rmssd': 239.3168704624504, 'sdnn': 233.7396537299742, 'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 13:15:03'), 'total_power': 29488.44338063871, 'user': 0, 'vlf': 5241.409938653003, 'sess_id': 0}


## 2 - Generate fragments from sessions

Breaks the sessions duration in fragments

Configurations:
* duration of each fragment in seconds;
* number of seconds to be discarded at the beginning of the session, accounting for user's stabilization and adjustment to posture and activity 

---


In [5]:
frags = cf.fragment_sessions(sessions, duration, crop)

428 valid sessions out of 428 total (at least one full fragment of 60 seconds after discarding first 0 seconds)


In [6]:
print(len(frags))
print(frags[0])

23163
{'start': Timestamp('2017-09-29 12:25:28'), 'stop': Timestamp('2017-09-29 12:26:28'), 'activity': 'focused-active', 'posture': 'sit', 'user': 0, 'sess': 0, 'order': 0}


## 3 - Add and clean interval data to fragments
---

### 3.1 Extract beats in fragment

Retrieves from the heartbeat files all the intervals contained in each session's duration and adds them to the fragments objects (in memory)

In [7]:
def fragall(frags, path):
    for i, f in enumerate(frags):
        if (i % 1000 == 0):
            print (i, '/', len(frags))
        f['rr'] = cf.beats_in_fragment(f, path)

In [8]:
%time fragall(frags, RAW_PATH)

0 / 23163
1000 / 23163
2000 / 23163
3000 / 23163
4000 / 23163
5000 / 23163
6000 / 23163
7000 / 23163
8000 / 23163
9000 / 23163
10000 / 23163
11000 / 23163
12000 / 23163
13000 / 23163
14000 / 23163
15000 / 23163
16000 / 23163
17000 / 23163
18000 / 23163
19000 / 23163
20000 / 23163
21000 / 23163
22000 / 23163
23000 / 23163
Wall time: 33min 37s


In [None]:
df = pd.DataFrame(frags)
df['beatcount'] = df['rr'].apply(len)
df.describe()

### 3.2 Remove outliers from RR series

In [10]:
df['rr'] = df['rr'].apply(cl.clean_rr_series)
df['beatcount'] = df['rr'].apply(len)
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,23163.0,23163.0,23163.0,23163.0
mean,123.563917,199.353624,1.354142,73.311661
std,141.395137,132.402872,1.953788,27.217338
min,0.0,0.0,0.0,0.0
25%,14.0,81.0,0.0,62.0
50%,53.0,177.0,0.0,74.0
75%,208.0,328.0,2.0,84.0
max,582.0,427.0,6.0,194.0


### 3.3 Remove fragments with too few beats (due to hardware malfunction or software was not recording beats)

***TODO*** remove using the continuous sequence gaps instead, it is more reliable

In [11]:
df = df[df['beatcount'] > 0.83 * duration]
df.describe()

Unnamed: 0,order,sess,user,beatcount
count,21347.0,21347.0,21347.0,21347.0
mean,116.874221,195.635733,1.285801,78.50349
std,136.552861,130.995367,1.900669,20.672588
min,0.0,0.0,0.0,50.0
25%,14.0,79.0,0.0,65.0
50%,47.0,177.0,0.0,75.0
75%,194.0,320.0,2.0,85.0
max,582.0,427.0,6.0,194.0


## 4 - Extract time and frequency domain features
---

***TODO*** do it using apply: df[feature_list] = df.apply(lambda row: pd.Series(aggregate_function(row['rr']), axis=1)

In [12]:
dic = df.to_dict(orient='records')
for i in dic:
    i.update(cf.features_from_dic(i['rr']))
print(dic[0]['rmssd'])

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


245.242697688


In [13]:
df = pd.DataFrame(dic)
df.sample(3)

Unnamed: 0,activity,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,...,posture,rmssd,rr,sdnn,sess,start,stop,total_power,user,vlf
3101,sleep,55,16621.414734,88.65341,2127.344876,0.127988,11.34659,55.643655,1101.163636,46,...,lie,230.403993,"[{'date': 2017-10-11 06:13:11, 'interval': 894...",161.212738,41,2017-10-11 06:13:10,2017-10-11 06:14:10,19131.199295,0,382.439684
7185,sleep,74,851.637423,81.679272,191.022973,0.224301,18.320728,76.043674,791.513514,9,...,lie,33.378485,"[{'date': 2017-11-09 05:23:04, 'interval': 732...",44.935949,125,2017-11-09 05:23:02,2017-11-09 05:24:02,1130.331881,0,87.671486
17012,rest-active,55,1148.695658,14.442941,6804.640809,5.923798,85.557059,99.22148,624.836364,8,...,stand,103.548253,"[{'date': 2017-11-12 16:14:07, 'interval': 604...",124.538372,361,2017-11-12 16:14:06,2017-11-12 16:15:06,10522.264475,2,2568.928007


### removing HF outliers caused by small gaps between the recorded intervals, to which HF is particularly sensitive

***TODO*** see item 3.3 above

In [14]:
dfr = df[(df['hf'] < 7000) & (df['hf'] < 7000)]
print(len(df), 'original and', len(dfr), 'after pruning')

21347 original and 17666 after pruning


## 5 - Save

---

In [15]:
df_output = dfr.drop(['rr'], axis = 1)
df_output.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0,17666.0
mean,81.585079,969.246014,48.179146,1158.858722,2.332469,51.820854,81.099199,780.819761,12.954828,90.235537,18.126541,50.126587,62.197084,216.13308,3420.917027,1.517321,1292.812291
std,20.648444,1286.297804,25.306244,2230.997488,3.448721,25.306244,18.903363,156.537622,11.540382,115.353936,16.979029,38.829106,38.071423,129.485541,4916.938442,1.972285,3031.009085
min,50.0,0.075957,1.430015,0.097406,0.009665,0.957283,48.704193,328.786885,0.0,0.0,0.0,1.542725,1.367274,0.0,0.450307,0.0,0.0
25%,70.0,200.630797,26.235981,181.107653,0.433504,30.240869,70.233416,694.663793,3.0,12.0,3.75,24.788957,36.104089,106.0,850.103235,0.0,160.037876
50%,77.0,488.014063,46.958066,497.020693,1.12956,53.041934,77.440007,780.270979,10.0,35.0,13.513514,38.536981,52.301923,194.0,1824.298633,0.0,460.234709
75%,87.0,1132.644922,69.759131,1269.88337,2.811559,73.764019,87.301916,860.981988,20.0,131.0,28.571429,61.772653,78.650953,353.0,3947.373838,2.0,1231.780576
max,194.0,6990.899195,99.042717,37223.644496,68.929361,98.569985,183.005336,1236.42,106.0,582.0,85.185185,297.590882,317.838575,427.0,62379.329036,6.0,53631.486854


In [16]:
filename = PRE_PATH + '\\df_' + str(duration) + '_' + str(crop) + '.xlsx'
print(filename)
df_output.to_excel(filename)

C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_60_0.xlsx


# Applying all steps above to generate different datasets

In [17]:
#durations = [300, 240, 180, 150, 120, 90, 60]
#crops = [120, 90, 60]

durations = [30]
crops = [30]

def multifrag(sessions, durations, crops, path_in, path_out):
    for cr in crops:
        for dr in durations:
            fname = path_out + '\\df_' + str(dr) + '_' + str(cr) + '.xlsx'
            print ('generating', fname, '...')
            ds = fr.gen_fragments_dataset(sessions, dr, cr, path_in)
            print('resulting dataset:', len(ds), 'records' )
            ds.to_excel(fname)

In [18]:
%time multifrag(sessions, durations, crops, RAW_PATH, PRE_PATH)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_30_30.xlsx ...
428 valid sessions out of 428 total (at least one full fragment of 30 seconds after discarding first 30 seconds)
0 / 46055
1000 / 46055
2000 / 46055
3000 / 46055
4000 / 46055
5000 / 46055
6000 / 46055
7000 / 46055
8000 / 46055
9000 / 46055
10000 / 46055
11000 / 46055
12000 / 46055
13000 / 46055
14000 / 46055
15000 / 46055
16000 / 46055
17000 / 46055
18000 / 46055
19000 / 46055
20000 / 46055
21000 / 46055
22000 / 46055
23000 / 46055
24000 / 46055
25000 / 46055
26000 / 46055
27000 / 46055
28000 / 46055
29000 / 46055
30000 / 46055
31000 / 46055
32000 / 46055
33000 / 46055
34000 / 46055
35000 / 46055
36000 / 46055
37000 / 46055
38000 / 46055
39000 / 46055
40000 / 46055
41000 / 46055
42000 / 46055
43000 / 46055
44000 / 46055
45000 / 46055
46000 / 46055


  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))


46055 total frags and 35777 kept
resulting dataset: 35777 records
Wall time: 57min 55s


## ( Extra - save files for LDA Grover)

---

In [19]:
dfr[['activity']].to_csv('./classifications')

In [20]:
def get_ints(beats):
    return [beat['interval'] for beat in beats]

dfr['ts'] = dfr['rr'].apply(get_ints)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
dfr.loc[['ts']].sample(3)

KeyError: "None of [['ts']] are in the [index]"

In [None]:
dfr[['ts']].to_csv('./timeseries')

In [None]:
dfr[cl.features_all].to_csv('./features')

In [None]:
dfr.describe()

In [None]:
for a in d.activity.unique()
       df.groupby(column).count()['user']