In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

import plotly.offline as pl
pl.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.ensemble import RandomForestClassifier


import parseIntervalFiles as pif
import consolidateFiles as cf
import datacleaning as cl
import dataviz as dv
import fragmentation as fr
import classif_multiclass as cmc
import hervpd as hp

## ! Parsing of activity files and interval files is to be replaced with the corresponding database queries as soon as they are available

## 0 - Pipeline configuration 
* Set the input/output directories, date range, user id and verbosity level

In [2]:
# Verbose level for the pipeline.
verbose = True

# Root folder of the data set. It must end with a path separator because the
# sub-folder names below are appended by plain string concatenation.
DATA_PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
# DATA_PATH = "/home/ju/GDrive/Projects/HeRV/Data/"

# Input (raw) and output (pre-processed) folders.
RAW_PATH = "".join((DATA_PATH, "Raw"))
PRE_PATH = "".join((DATA_PATH, "PreProcessed"))

# Date range handed to the session extraction.
dt1 = datetime(year=2018, month=6, day=18)
dt2 = datetime(year=2018, month=6, day=23)

# Id of the user whose sessions are processed.
user = 1

## Extracting sessions

In [3]:
# Extract the sessions of `user` for the dt1..dt2 range from the files under
# RAW_PATH. The notebook-wide `verbose` flag is passed through (instead of a
# hard-coded True) so the configuration cell actually controls the logging.
sess = cf.get_user_sessions(user, dt1, dt2, RAW_PATH, verbose=verbose)

reading act180621.csv ... 
11 sessions extracted and 0 errors found


In [4]:
# Add the beat data found under RAW_PATH to every session (presumably the 'rr'
# series used by the cells below — confirm in consolidateFiles).
# verbose=False is passed explicitly, independent of the notebook-wide flag.
# NOTE(review): `sess` is overwritten with the result, so re-running this cell
# feeds already-processed sessions back in — confirm the helper tolerates that.
sess = cf.sessions_add_beats(sess, RAW_PATH, verbose=False)

In [5]:
# Build a DataFrame from the extracted sessions (one row per session, assuming
# `sess` is a list of per-session dicts — TODO confirm).
dfs = pd.DataFrame(sess)

In [6]:
# Keep only the training sessions. The explicit .copy() makes `dfs` an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the unfiltered frame (SettingWithCopyWarning).
dfs = dfs[dfs.activity.isin(['train-baseline', 'train-focus', 'train-breathe'])].copy()

### Add a beats-count column for ease of use

In [7]:
# Number of entries in each session's 'rr' series, before any cleaning.
dfs['beatscount'] = dfs['rr'].map(len)

### Removing outliers

In [8]:
# Replace each session's 'rr' series with the output of cl.clean_rr_series
# (presumably the series with outliers removed, per the section title — confirm
# in datacleaning).
dfs['rr'] = dfs['rr'].apply(cl.clean_rr_series)

In [9]:
# Number of entries left in each session's 'rr' series after cleaning.
dfs['beatscount_clean'] = dfs['rr'].map(len)

In [10]:
# removed_artifacts = beats before cleaning minus beats after cleaning.
# 'beatscount' then takes the post-cleaning value and the temporary
# 'beatscount_clean' column is discarded.
dfs = (
    dfs
    .assign(
        removed_artifacts=dfs['beatscount'] - dfs['beatscount_clean'],
        beatscount=dfs['beatscount_clean'],
    )
    .drop('beatscount_clean', axis=1)
)

In [11]:
# Summary statistics of the numeric columns after cleaning.
dfs.describe()

Unnamed: 0,duration,user,beatscount,removed_artifacts
count,10.0,10.0,10.0,10.0
mean,130.0,1.0,177.8,0.0
std,11.284207,0.0,18.18913,0.0
min,104.0,1.0,146.0,0.0
25%,127.25,1.0,166.5,0.0
50%,129.0,1.0,179.5,0.0
75%,134.5,1.0,188.5,0.0
max,147.0,1.0,202.0,0.0


### Removing sessions with no recorded beats or too few of them (100 or fewer)

In [12]:
# A session is kept only if it has more than this many beats after cleaning.
MIN_BEATS = 100

# Remember the size before filtering so the number of dropped sessions can be
# reported (named explicitly instead of the ambiguous single letter `l`).
n_sessions_before = len(dfs)
dfs = dfs[dfs['beatscount'] > MIN_BEATS]
print(n_sessions_before - len(dfs), ' sessions out of', n_sessions_before, 'removed for lack of interval data')

0  sessions out of 10 removed for lack of interval data


In [13]:
# Sessions per activity, counted on the non-null 'duration' values.
dfs.groupby('activity')['duration'].count()

activity
train-baseline    2
train-breathe     4
train-focus       4
Name: duration, dtype: int64

## 3 - Aggregating data

In [14]:
# One dict per session, each extended with the entries that
# cf.features_from_dic returns for that session's 'rr' series.
dic = [
    dict(row, **cf.features_from_dic(row['rr']))
    for row in dfs.to_dict(orient='records')
]
# Quick sanity check: 'rmssd' of the first session.
print(dic[0]['rmssd'])
dfs = pd.DataFrame(dic)

21.0457834257


In [15]:
# Expose the row index as an explicit session id and drop the 'rr' series,
# then peek at two randomly chosen sessions.
dfs = dfs.assign(sess_id=dfs.index).drop('rr', axis=1)
dfs.sample(2)

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
7,train-breathe,165,128,795.670151,8.868473,8176.225219,10.275898,91.131527,76.540396,794.587879,...,sit,0,47.335402,93.631998,2018-06-21 22:28:19,2018-06-21 22:30:27,9042.885023,1,70.989653,7
0,train-baseline,201,147,145.01518,6.776602,1994.92419,13.756658,93.223398,80.603377,748.975124,...,sit,0,21.045783,59.268241,2018-06-21 17:01:07,2018-06-21 17:03:34,3158.860601,1,1018.92123,0


In [16]:
# Summary statistics of the per-session features.
dfs.describe()

Unnamed: 0,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,notes,pnn50,removed_artifacts,rmssd,sdnn,total_power,user,vlf,sess_id
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,177.8,130.0,310.461362,12.024548,3316.44012,9.63372,87.975452,81.157411,747.346863,14.5,,8.652746,0.0,29.150462,62.165445,3982.42476,1.0,355.523278,4.5
std,18.18913,11.284207,347.150903,7.034007,3682.377861,4.833126,7.034007,4.605022,46.987885,16.338435,,9.968072,0.0,17.615224,30.412808,3937.569688,0.0,300.113723,3.02765
min,146.0,104.0,36.467514,5.716321,130.060979,2.784573,73.57694,74.280077,686.883598,0.0,,0.0,0.0,12.977209,24.809968,354.178357,1.0,70.989653,0.0
25%,166.5,127.25,47.949711,7.20023,302.024523,6.397409,86.481419,77.04508,708.318049,2.0,,1.072598,0.0,15.249627,36.735998,727.622338,1.0,109.884889,2.25
50%,179.5,129.0,119.132197,9.029392,1423.626166,10.078461,90.970608,81.958307,735.401946,4.5,,2.344086,0.0,19.339475,52.80244,2175.156658,1.0,276.111,4.5
75%,188.5,134.5,649.252629,13.518581,6839.998216,13.018699,92.79977,84.972709,789.977841,29.0,,17.559182,0.0,44.246508,91.544346,7832.96706,1.0,510.719363,6.75
max,202.0,147.0,838.172523,26.42306,9056.304573,16.493768,94.283679,87.607577,820.879747,39.0,,24.840764,0.0,56.267163,104.3373,9991.695123,1.0,1018.92123,9.0


In [17]:
# Show only the rows whose activity is 'train-focus'.
dfs.loc[dfs['activity'] == 'train-focus']

Unnamed: 0,activity,beatscount,duration,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,...,posture,removed_artifacts,rmssd,sdnn,start,stop,total_power,user,vlf,sess_id
1,train-focus,183,127,93.249214,21.938356,331.801837,3.558227,78.061644,85.371274,704.743169,...,sit,0,17.633167,36.473001,2018-06-21 17:04:23,2018-06-21 17:06:30,572.936528,1,147.885476,1
3,train-focus,189,128,51.675768,5.716321,852.328142,16.493768,94.283679,87.607577,686.883598,...,sit,0,14.553094,37.524987,2018-06-21 17:10:43,2018-06-21 17:12:51,1191.452715,1,287.448805,3
6,train-focus,202,141,36.467514,13.557302,232.520472,6.376099,86.442698,83.936218,715.673267,...,sit,0,12.977209,24.809968,2018-06-21 22:24:49,2018-06-21 22:27:10,354.178357,1,85.19037,6
8,train-focus,187,130,46.707692,26.42306,130.060979,2.784573,73.57694,85.318206,705.86631,...,sit,0,17.171025,46.33664,2018-06-21 22:32:17,2018-06-21 22:34:27,697.077024,1,520.308353,8


In [18]:
# Save the per-session feature table. os.path.join replaces the hard-coded
# "\\" so the export also lands inside PRE_PATH on non-Windows machines.
dfs.to_excel(os.path.join(PRE_PATH, "train_sessions.xlsx"))


---


# VISUALIZATION

---



In [19]:
# Call dv.boxplot_compare once per feature in cl.features_all, grouping the
# sessions by activity.
# NOTE(review): min_examples=1 presumably allows groups with a single session
# to be plotted — confirm in dataviz.
for feat in cl.features_all:
    dv.boxplot_compare(dfs, feat, groupby='activity', min_examples=1)


---


# FRAGMENTATION

---



In [20]:
# Fragment lengths to generate; one dataset is produced per value
# (seconds, judging by the helper's log output — confirm in fragmentation).
durations = [60, 30, 20, 10]
# Passed to fr.gen_fragments_dataset; its log reports "discarding first 5
# seconds", so this is presumably the number of seconds cropped from the start.
crop = 5
# Session records (one dict per row of dfs) handed to the fragmentation helper.
sessions = dfs.to_dict(orient='records')

def multifrag(sessions, durations):
    """Generate one fragment dataset per duration and save each to Excel.

    For every value in ``durations`` the sessions are passed to
    ``fr.gen_fragments_dataset`` and the result is written to
    ``df_train_<duration>.xlsx`` inside ``PRE_PATH``.

    Relies on the notebook globals ``crop``, ``RAW_PATH`` and ``PRE_PATH``.

    Parameters
    ----------
    sessions : list of dict
        Session records, e.g. from ``dfs.to_dict(orient='records')``.
    durations : iterable of int
        Fragment lengths to generate (presumably seconds — confirm in
        ``fragmentation``).

    Returns
    -------
    None
        Progress is printed and the datasets are written to disk.
    """
    for dr in durations:
        # os.path.join instead of a hard-coded '\\' keeps the output inside
        # PRE_PATH on non-Windows machines as well.
        fname = os.path.join(PRE_PATH, 'df_train_' + str(dr) + '.xlsx')
        print('generating', fname, '...')
        ds = fr.gen_fragments_dataset(sessions, dr, crop, RAW_PATH)
        print('resulting dataset:', len(ds), 'records')
        ds.to_excel(fname)

In [21]:
# Build and save the fragment datasets for every configured duration, timing the run.
%time multifrag(sessions, durations)

generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_60.xlsx ...
10 valid sessions out of 10 total (at least one full fragment of 60 seconds after discarding first 5 seconds)
0 / 19



nperseg = 256 is greater than input length  = 239, using nperseg = 239


nperseg = 256 is greater than input length  = 246, using nperseg = 246


nperseg = 256 is greater than input length  = 243, using nperseg = 243


nperseg = 256 is greater than input length  = 244, using nperseg = 244


nperseg = 256 is greater than input length  = 240, using nperseg = 240


nperseg = 256 is greater than input length  = 241, using nperseg = 241


nperseg = 256 is greater than input length  = 242, using nperseg = 242


nperseg = 256 is greater than input length  = 245, using nperseg = 245


nperseg = 256 is greater than input length  = 237, using nperseg = 237


nperseg = 256 is greater than input length  = 233, using nperseg = 233



19 total frags and 19 kept
resulting dataset: 19 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_30.xlsx ...
10 valid sessions out of 10 total (at least one full fragment of 30 seconds after discarding first 5 seconds)
0 / 39



nperseg = 256 is greater than input length  = 118, using nperseg = 118


nperseg = 256 is greater than input length  = 116, using nperseg = 116


nperseg = 256 is greater than input length  = 117, using nperseg = 117


nperseg = 256 is greater than input length  = 114, using nperseg = 114


nperseg = 256 is greater than input length  = 115, using nperseg = 115


nperseg = 256 is greater than input length  = 120, using nperseg = 120


nperseg = 256 is greater than input length  = 111, using nperseg = 111


nperseg = 256 is greater than input length  = 119, using nperseg = 119


nperseg = 256 is greater than input length  = 121, using nperseg = 121


nperseg = 256 is greater than input length  = 112, using nperseg = 112


nperseg = 256 is greater than input length  = 113, using nperseg = 113


nperseg = 256 is greater than input length  = 122, using nperseg = 122


nperseg = 256 is greater than input length  = 123, using nperseg = 123



39 total frags and 39 kept
resulting dataset: 39 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_20.xlsx ...
10 valid sessions out of 10 total (at least one full fragment of 20 seconds after discarding first 5 seconds)
0 / 59



nperseg = 256 is greater than input length  = 76, using nperseg = 76


nperseg = 256 is greater than input length  = 77, using nperseg = 77


nperseg = 256 is greater than input length  = 75, using nperseg = 75


nperseg = 256 is greater than input length  = 78, using nperseg = 78


nperseg = 256 is greater than input length  = 80, using nperseg = 80


nperseg = 256 is greater than input length  = 74, using nperseg = 74


nperseg = 256 is greater than input length  = 70, using nperseg = 70


nperseg = 256 is greater than input length  = 79, using nperseg = 79


nperseg = 256 is greater than input length  = 72, using nperseg = 72


nperseg = 256 is greater than input length  = 71, using nperseg = 71


nperseg = 256 is greater than input length  = 81, using nperseg = 81



59 total frags and 59 kept
resulting dataset: 59 records
generating C:\Users\ju\GDrive\Projects\HeRV\Data\PreProcessed\df_train_10.xlsx ...
10 valid sessions out of 10 total (at least one full fragment of 10 seconds after discarding first 5 seconds)
0 / 121



nperseg = 256 is greater than input length  = 34, using nperseg = 34


nperseg = 256 is greater than input length  = 35, using nperseg = 35


nperseg = 256 is greater than input length  = 36, using nperseg = 36


nperseg = 256 is greater than input length  = 40, using nperseg = 40


nperseg = 256 is greater than input length  = 33, using nperseg = 33


nperseg = 256 is greater than input length  = 37, using nperseg = 37


nperseg = 256 is greater than input length  = 31, using nperseg = 31


nperseg = 256 is greater than input length  = 29, using nperseg = 29


invalid value encountered in double_scalars


invalid value encountered in double_scalars


invalid value encountered in double_scalars


nperseg = 256 is greater than input length  = 38, using nperseg = 38


nperseg = 256 is greater than input length  = 28, using nperseg = 28


nperseg = 256 is greater than input length  = 30, using nperseg = 30


nperseg = 256 is greater than input length  = 39, using nperseg = 39


nperseg =

121 total frags and 121 kept
resulting dataset: 121 records
Wall time: 9.85 s



---


# CLASSIFICATION

---



In [22]:
# Feature columns handed to the classifier.
features = cl.features_fd
# A fixed random_state makes the forest reproducible from run to run;
# without it every execution trains a different model.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [23]:
# Load the 20-second fragment dataset written by multifrag. os.path.join
# replaces the hard-coded '\\' so the path also resolves on non-Windows machines.
dff = pd.read_excel(os.path.join(PRE_PATH, 'df_train_20.xlsx'))
dff.describe()

Unnamed: 0,beatcount,hf,hfnu,lf,lf_hf,lfnu,mhr,mrri,nn50,order,pnn50,rmssd,sdnn,sess,total_power,user,vlf
count,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0
mean,26.237288,287.428948,21.178566,1732.078811,16.061014,78.821434,81.043432,748.602742,2.084746,2.491525,8.80915,28.641679,56.644246,4.40678,2019.507759,1.0,0.0
std,1.968186,359.545226,19.194135,2025.650272,38.0315,19.194135,4.98911,50.415306,2.500263,1.755512,10.724459,19.305675,36.655789,2.97761,2311.626426,0.0,0.0
min,22.0,2.058388,0.387951,1.406026,0.160247,13.811465,70.701368,673.103448,0.0,0.0,0.0,6.022394,4.901014,0.0,3.464414,1.0,0.0
25%,25.0,25.941202,8.788185,139.042234,2.477759,71.192057,76.911883,704.632275,0.0,1.0,0.0,12.250272,25.158889,2.0,185.204787,1.0,0.0
50%,26.0,68.518106,15.040132,616.844398,5.648878,84.959868,81.662256,737.84,1.0,2.0,3.703704,19.996428,41.827319,4.0,659.479413,1.0,0.0
75%,28.0,460.340304,28.807943,3239.532014,10.37933,91.211815,85.283248,792.441538,4.5,4.0,19.52381,46.689257,93.332765,7.0,3701.261521,1.0,0.0
max,30.0,1179.054705,86.188535,8129.815765,256.764267,99.612049,89.482244,861.73913,8.0,6.0,34.782609,68.502694,123.271495,9.0,9264.450008,1.0,0.0


In [24]:
# Restrict the fragments to the two activities being classified.
dfr = dff.loc[dff['activity'].isin(['train-focus', 'train-breathe'])]

In [25]:
# Run cmc.plot_matrices once per target column; only 'activity' is used here.
for label in ('activity',):
    cmc.plot_matrices(dfr, clf, features, label)




--------------------------------------------- activity ---------------------------------------------
activity
train-breathe    24
train-focus      24
Name: user, dtype: int64
38 10
