In [35]:
# Enable autoreload so edits to the imported project modules are picked up live.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline  

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

import plotly.offline as pl
pl.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.ensemble import RandomForestClassifier


import parseIntervalFiles as pif
import parseActivityFiles as paf
#pun intended :)
import consolidateFiles as cf
import datacleaning as cl
import dataviz as dv
import fragmentation as fr
import classif_multiclass as cmc
import hervpd as hp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ! The activity-file and interval-file parsers are to be replaced with the corresponding database queries as soon as those are available

## 0 - Pipeline configuration 
* set the input/output directories, user id and verbose level

In [2]:
# Verbosity flag for the pipeline.
verbose = True

# Root data directory. The original Windows location stays the default; set the
# HERV_DATA_PATH environment variable to point the notebook elsewhere without
# editing this cell.
DATA_PATH = os.environ.get("HERV_DATA_PATH", "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\")
# DATA_PATH = "/home/ju/GDrive/Projects/HeRV/Data/"

# os.path.join instead of string concatenation, so the sub-directories are
# correct whether or not DATA_PATH ends with a path separator.
RAW_PATH = os.path.join(DATA_PATH, "Raw")
PRE_PATH = os.path.join(DATA_PATH, "PreProcessed")

## Extracting sessions

In [3]:
# Fetch the sessions returned by paf.get_user_sessions for user 0 and keep only
# those whose 'start' is later than 1 June 2018.
cutoff = datetime(2018, 6, 1)
all_sessions = paf.get_user_sessions(0, dirname=RAW_PATH, verbose=False)
sess = [s for s in all_sessions if s['start'] > cutoff]

In [4]:
# Attach beat data to each session via cf.sessions_add_beats, reading from the
# "0" sub-directory of RAW_PATH. os.path.join replaces the hard-coded "\\"
# so the path is also valid with the non-Windows DATA_PATH alternative.
sess = cf.sessions_add_beats(sessions=sess, dirname=os.path.join(RAW_PATH, "0"), verbose=False)

In [5]:
# Build the working DataFrame from the session records in `sess`.
df = pd.DataFrame(sess)

In [6]:
# Keep only the three training activities. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following
# cells do not operate on a slice of the original (SettingWithCopyWarning).
df = df[df.activity.isin(['train-baseline', 'train-focus', 'train-breathe'])].copy()

### Include column with beats count for ease of use 

In [7]:
# Length of each session's 'rr' series, recorded before any cleaning.
df['beatscount'] = df['rr'].apply(len)

### Removing outliers

In [8]:
# Replace each session's 'rr' series with the output of cl.clean_rr_series.
# NOTE(review): this overwrites 'rr' in place, so re-running the cell cleans
# already-cleaned data.
df['rr'] = df['rr'].apply(cl.clean_rr_series)

In [9]:
# Length of each 'rr' series after cleaning.
df['beatscount_clean'] = df['rr'].apply(len)

In [10]:
# Record how many entries the cleaning step removed per session, then make
# 'beatscount' hold the post-cleaning count and discard the temporary column.
df = (
    df.assign(
        removed_artifacts=df['beatscount'] - df['beatscount_clean'],
        beatscount=df['beatscount_clean'],
    )
    .drop(columns=['beatscount_clean'])
)

In [11]:
# Summary statistics of the session table after cleaning.
df.describe()

Unnamed: 0,duration,user,beatscount,removed_artifacts
count,34.0,34.0,34.0,34.0
mean,134.0,0.0,182.588235,0.970588
std,9.620055,0.0,14.873441,1.992186
min,115.0,0.0,154.0,0.0
25%,129.25,0.0,168.25,0.0
50%,133.0,0.0,183.5,0.0
75%,137.0,0.0,190.0,1.0
max,171.0,0.0,212.0,10.0


### Removing sessions without beats recorded or with too few of them

In [12]:
# Keep only sessions with more than 100 beats and report how many were dropped.
n_before = len(df)
df = df.loc[df['beatscount'] > 100]
n_removed = n_before - len(df)
print(n_removed, ' sessions out of', n_before, 'removed for lack of interval data')

0  sessions out of 34 removed for lack of interval data


In [13]:
# Number of sessions per activity (non-null 'duration' values in each group).
df.groupby('activity')['duration'].count()

activity
train-baseline    10
train-breathe     12
train-focus       12
Name: duration, dtype: int64

## 3 - Aggregating data

In [14]:
# Turn every row into a dict, merge in the values returned by
# cf.features_from_dic for that session's 'rr' series, and rebuild the frame.
dic = df.to_dict(orient='records')
for record in dic:
    session_features = cf.features_from_dic(record['rr'])
    record.update(session_features)
# Spot-check: rmssd of the first session.
print(dic[0]['rmssd'])
df = pd.DataFrame(dic)

77.4297488894


In [15]:
# Expose the row index as an explicit session id and drop the 'rr' series,
# whose features were merged into the frame in the previous cell.
df = df.assign(sess_id=df.index).drop(columns=['rr'])
df.sample(2)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
25,train-baseline,187,137,1522.691969,68.500727,700.192415,0.459839,31.499273,81.436624,748.219251,...,sit,1,71.230128,86.74775,2018-06-20 22:39:33,2018-06-20 22:41:50,3318.652558,0,1095.768174,25
12,train-focus,167,131,932.902562,64.469682,514.138176,0.551117,35.530318,75.165411,801.640719,...,sit,0,51.07501,52.703332,2018-06-18 17:56:24,2018-06-18 17:58:35,1976.828808,0,529.78807,12


In [16]:
# Summary statistics of the per-session feature table.
df.describe()

Unnamed: 0,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,pnn50,removed_artifacts,rmssd,sdnn,total_power,user,vlf,sess_id
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,182.588235,134.0,1214.495269,41.693868,1543.555472,3.041646,58.306132,81.828783,745.594468,27.294118,15.494426,0.970588,61.776942,70.944759,3572.06233,0.0,814.011589,16.5
std,14.873441,9.620055,1552.613819,23.271721,1648.24183,3.542726,23.271721,5.500732,49.72798,18.579117,11.205384,1.992186,34.546702,27.607223,3227.702304,0.0,965.265696,9.958246
min,154.0,115.0,129.875803,6.953304,176.231245,0.217607,17.871682,72.144605,664.222826,1.0,0.546448,0.0,15.785101,30.404905,660.072449,0.0,90.929902,0.0
25%,168.25,129.25,226.269504,20.207246,525.447724,0.608749,37.717613,77.089744,704.308152,16.5,8.761442,0.0,38.502121,53.145949,1392.460912,0.0,331.448531,8.25
50%,183.5,133.0,538.846769,42.807214,1050.68589,1.339294,57.192786,81.319711,747.664889,23.0,12.266754,0.0,55.281217,65.130564,2254.308665,0.0,459.362925,16.5
75%,190.0,137.0,1292.932697,62.282387,1986.146941,3.988796,79.792754,86.365528,786.487532,35.5,19.049586,1.0,76.033094,84.526677,3563.543102,0.0,629.660254,24.75
max,212.0,171.0,6015.98254,82.128318,8846.488332,13.381653,93.046696,92.160516,841.824675,68.0,41.975309,10.0,153.35397,141.658789,12151.712956,0.0,3940.988059,33.0


In [18]:
# Save the per-session feature table. os.path.join replaces the hard-coded "\\"
# so the file lands inside PRE_PATH on non-Windows systems as well.
df.to_excel(os.path.join(PRE_PATH, "train_sessions.xlsx"))


---


# VISUALIZATION

---



In [19]:
# Draw one dv.boxplot_compare figure per feature listed in cl.features_all,
# grouped by activity.
for feat in cl.features_all:
    dv.boxplot_compare(df, feat, groupby='activity', min_examples=2)


---


# FRAGMENTATION

---



In [24]:
# Fragment lengths to generate, one output dataset per value
# (seconds, going by the log lines printed during fragmentation).
durations = [60, 30, 20, 10]
# Third argument of fr.gen_fragments_dataset; the fragmentation log reports it
# as the number of initial seconds discarded from each session.
crop = 5
# Session records (one dict per row of df) passed to multifrag.
sessions = df.to_dict(orient='records')

def multifrag(sessions, durations):
    """Build and save one fragment dataset per requested fragment duration.

    For every value ``dr`` in ``durations`` this calls
    ``fr.gen_fragments_dataset(sessions, dr, crop, RAW_PATH)`` and writes the
    result to ``df_train_<dr>.xlsx`` inside ``PRE_PATH``.

    Relies on the notebook-level names ``crop``, ``RAW_PATH`` and ``PRE_PATH``.

    Parameters
    ----------
    sessions : list of dict
        Session records, as produced by ``df.to_dict(orient='records')``.
    durations : iterable
        Fragment durations; one Excel file is written per entry.

    Returns
    -------
    None
    """
    for dr in durations:
        # os.path.join instead of a hard-coded '\\' so the output lands inside
        # PRE_PATH on non-Windows systems as well.
        fname = os.path.join(PRE_PATH, 'df_train_' + str(dr) + '.xlsx')
        print('generating', fname, '...')
        ds = fr.gen_fragments_dataset(sessions, dr, crop, RAW_PATH)
        print('resulting dataset:', len(ds), 'records')
        ds.to_excel(fname)

In [27]:
# Generate and save one fragment dataset per duration, timing the whole run.
%time multifrag(sessions, durations)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_60.xlsx ...
34 valid sessions out of 34 total (at least one full fragment of 60 seconds after discarding first 5 seconds)
0 / 65



nperseg = 256 is greater than input length  = 238, using nperseg = 238


nperseg = 256 is greater than input length  = 241, using nperseg = 241


nperseg = 256 is greater than input length  = 234, using nperseg = 234


nperseg = 256 is greater than input length  = 243, using nperseg = 243


nperseg = 256 is greater than input length  = 235, using nperseg = 235


nperseg = 256 is greater than input length  = 246, using nperseg = 246


nperseg = 256 is greater than input length  = 240, using nperseg = 240


nperseg = 256 is greater than input length  = 239, using nperseg = 239


nperseg = 256 is greater than input length  = 232, using nperseg = 232


nperseg = 256 is greater than input length  = 242, using nperseg = 242


nperseg = 256 is greater than input length  = 245, using nperseg = 245


nperseg = 256 is greater than input length  = 233, using nperseg = 233


nperseg = 256 is greater than input length  = 237, using nperseg = 237


nperseg = 256 is greater than input length  = 226,

65 total frags and 64 kept
resulting dataset: 64 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_30.xlsx ...
34 valid sessions out of 34 total (at least one full fragment of 30 seconds after discarding first 5 seconds)
0 / 134



nperseg = 256 is greater than input length  = 114, using nperseg = 114


nperseg = 256 is greater than input length  = 117, using nperseg = 117


nperseg = 256 is greater than input length  = 116, using nperseg = 116


nperseg = 256 is greater than input length  = 122, using nperseg = 122


nperseg = 256 is greater than input length  = 112, using nperseg = 112


nperseg = 256 is greater than input length  = 118, using nperseg = 118


nperseg = 256 is greater than input length  = 120, using nperseg = 120


nperseg = 256 is greater than input length  = 115, using nperseg = 115


nperseg = 256 is greater than input length  = 113, using nperseg = 113


nperseg = 256 is greater than input length  = 119, using nperseg = 119


nperseg = 256 is greater than input length  = 121, using nperseg = 121


nperseg = 256 is greater than input length  = 110, using nperseg = 110


nperseg = 256 is greater than input length  = 111, using nperseg = 111


nperseg = 256 is greater than input length  = 109,

134 total frags and 129 kept
resulting dataset: 129 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_20.xlsx ...
34 valid sessions out of 34 total (at least one full fragment of 20 seconds after discarding first 5 seconds)
0 / 204



nperseg = 256 is greater than input length  = 75, using nperseg = 75


nperseg = 256 is greater than input length  = 74, using nperseg = 74


nperseg = 256 is greater than input length  = 76, using nperseg = 76


nperseg = 256 is greater than input length  = 71, using nperseg = 71


nperseg = 256 is greater than input length  = 79, using nperseg = 79


nperseg = 256 is greater than input length  = 73, using nperseg = 73


nperseg = 256 is greater than input length  = 80, using nperseg = 80


nperseg = 256 is greater than input length  = 77, using nperseg = 77


nperseg = 256 is greater than input length  = 70, using nperseg = 70


nperseg = 256 is greater than input length  = 69, using nperseg = 69


nperseg = 256 is greater than input length  = 72, using nperseg = 72


nperseg = 256 is greater than input length  = 78, using nperseg = 78


nperseg = 256 is greater than input length  = 81, using nperseg = 81


nperseg = 256 is greater than input length  = 82, using nperseg = 82


npers

204 total frags and 200 kept
resulting dataset: 200 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_10.xlsx ...
34 valid sessions out of 34 total (at least one full fragment of 10 seconds after discarding first 5 seconds)
0 / 422



nperseg = 256 is greater than input length  = 35, using nperseg = 35


nperseg = 256 is greater than input length  = 34, using nperseg = 34


nperseg = 256 is greater than input length  = 33, using nperseg = 33


nperseg = 256 is greater than input length  = 36, using nperseg = 36


nperseg = 256 is greater than input length  = 39, using nperseg = 39


nperseg = 256 is greater than input length  = 30, using nperseg = 30


invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars


nperseg = 256 is greater than input length  = 38, using nperseg = 38


nperseg = 256 is greater than input length  = 37, using nperseg = 37


nperseg = 256 is greater than input length  = 28, using nperseg = 28


nperseg = 256 is greater than input length  = 32, using nperseg = 32


nperseg = 256 is greater than input length  = 31, using nperseg = 31


nperseg = 256 is greater than input length  = 41, using nperseg = 41


nperseg =

422 total frags and 417 kept
resulting dataset: 417 records
Wall time: 1min 10s



---


# CLASSIFICATION

---



In [36]:
# Feature columns used as classifier inputs.
features = ['mrri', 'sdnn', 'rmssd', 'pnn50', 'hfnu', 'lfnu', 'lf_hf']
# Fixed random_state so the fitted forest is reproducible between runs.
# NOTE(review): any shuffling or splitting done inside cmc.plot_matrices is not
# controlled by this seed — confirm there.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [37]:
# Load the 60-second fragment dataset written by the fragmentation step.
# That step saves 'df_train_<duration>.xlsx' (no crop suffix), so the previous
# name 'df_train_60_5.xlsx' is not produced by this notebook and fails on a
# fresh data directory. os.path.join replaces the hard-coded '\\' separator.
df = pd.read_excel(os.path.join(PRE_PATH, 'df_train_60.xlsx'))
df.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,80.8125,906.892903,43.019616,1401.593039,3.256969,56.980384,81.337328,748.949495,11.421875,0.484375,14.79051,52.723166,63.134512,16.09375,3095.316772,0.0,786.83083
std,5.687287,1203.777538,24.086947,2569.160132,4.784701,24.086947,5.801891,52.736762,9.004064,0.503706,12.148936,36.781636,30.96495,9.857265,4035.128292,0.0,1179.827968
min,68.0,82.046471,3.35904,60.775005,0.151619,13.165736,69.165795,653.445652,0.0,0.0,0.0,15.203744,24.110738,0.0,362.491185,0.0,32.990982
25%,76.75,224.814554,24.555316,395.724516,0.564498,36.079339,76.968898,704.744186,4.0,0.0,5.146104,25.321243,42.287758,7.75,1176.921437,0.0,194.565217
50%,81.0,364.750644,41.971416,749.652321,1.382853,58.028584,81.113127,742.931948,9.5,0.0,11.898061,39.910302,54.352067,16.0,1843.541765,0.0,355.30522
75%,85.25,978.300152,63.920661,1446.63954,3.075189,75.444684,86.010238,786.481923,15.5,1.0,20.067568,64.196444,72.547394,24.25,3062.817274,0.0,829.750739
max,92.0,5776.928692,86.834264,19869.445137,28.770411,96.64096,94.373136,882.808824,44.0,1.0,55.0,178.143096,168.596896,33.0,27541.808917,0.0,5655.788389


In [38]:
# Restrict the fragments to the two activities being discriminated.
target_activities = ['train-focus', 'train-breathe']
dfr = df.loc[df['activity'].isin(target_activities)]

In [40]:
# Run cmc.plot_matrices once per target column; only 'activity' is used here.
target_labels = ['activity']
for label in target_labels:
    cmc.plot_matrices(dfr, clf, features, label)




--------------------------------------------- activity ---------------------------------------------
activity
train-breathe    23
train-focus      23
Name: user, dtype: int64
36 10
