In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as pl
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import parseIntervalFiles as pif
import consolidateFiles as cf
import datacleaning as cl
import dataviz as dv
import fragmentation as fr
import classif_multiclass as cmc
import hervpd as hp

pl.init_notebook_mode(connected=True)

## Pipeline configuration 
* Set the input/output directories, the users to process, and the date range.

In [2]:
# Verbosity flag for the pipeline.
verbose = True

# Root of the HeRV data tree. Swap the two assignments below to run against the
# Linux location instead of the Windows one.
DATA_PATH = "C:\\Users\\ju\\GDrive\\Projects\\HeRV\\Data\\"
# DATA_PATH = "/home/ju/GDrive/Projects/HeRV/Data/"

# os.path.join yields the same result whether or not DATA_PATH ends with a
# separator; plain string concatenation silently required the trailing one.
RAW_PATH = os.path.join(DATA_PATH, "Raw")
PRE_PATH = os.path.join(DATA_PATH, "PreProcessed")

# Date range handed to the session extraction.
dt1 = datetime(2018, 6, 21)
dt2 = datetime(2018, 10, 27)

# Ids of the users whose recordings are processed.
users = [1, 3, 9, 10, 11, 12, 13]

## Extracting sessions

In [3]:
%time sess = cf.gen_sessions_dataset(users, dt1, dt2, dirname=RAW_PATH, verbose=True)

parsing C:\Users\ju\GDrive\Projects\HeRV\Data\Raw\1\act180621.csv
0 sessions excluded
parsing C:\Users\ju\GDrive\Projects\HeRV\Data\Raw\1\act180623.csv
0 sessions excluded
parsing C:\Users\ju\GDrive\Projects\HeRV\Data\Raw\1\act180624.csv
0 sessions excluded
parsing C:\Users\ju\GDrive\Projects\HeRV\Data\Raw\1\act180626.csv
0 sessions excluded
[ 1 ] train-baseline 2018-06-21 17:01:07
[ 1 ] train-focus 2018-06-21 17:04:23
[ 1 ] train-breathe 2018-06-21 17:08:04
[ 1 ] train-focus 2018-06-21 17:10:43
[ 1 ] train-breathe 2018-06-21 17:13:39
[ 1 ] train-baseline 2018-06-21 22:22:37
[ 1 ] train-focus 2018-06-21 22:24:49
[ 1 ] train-breathe 2018-06-21 22:28:19
[ 1 ] train-focus 2018-06-21 22:32:17
[ 1 ] train-breathe 2018-06-21 22:35:42
[ 1 ] leisure-passive 2018-06-21 22:41:08
[ 1 ] train-baseline 2018-06-23 23:37:06
[ 1 ] train-focus 2018-06-23 23:40:35
[ 1 ] train-breathe 2018-06-23 23:43:22
[ 1 ] train-focus 2018-06-23 23:47:30
[ 1 ] train-breathe 2018-06-23 23:50:59
[ 1 ] leisure-passive 2

In [4]:
# Tabulate the extracted session records.
dfs = pd.DataFrame(data=sess)

In [5]:
# Keep only the sessions recorded for the three training activities.
dfs = dfs.loc[
    dfs["activity"].isin(
        [
            'train-baseline',
            'train-focus',
            'train-breathe',
        ]
    )
]

In [6]:
# Number of sessions with a non-null duration, per activity.
dfs.groupby("activity")["duration"].count()

activity
train-baseline    10
train-breathe     27
train-focus       27
Name: duration, dtype: int64

In [8]:
# Overview of the training sessions recorded for user 1.
dfs.loc[
    dfs["user"] == 1,
    ["activity", "user", "start", "duration", "beatscount"],
]

Unnamed: 0,activity,user,start,duration,beatscount
0,train-baseline,1,2018-06-21 17:01:07,147,201
1,train-focus,1,2018-06-21 17:04:23,127,183
2,train-breathe,1,2018-06-21 17:08:04,127,158
3,train-focus,1,2018-06-21 17:10:43,128,189
4,train-breathe,1,2018-06-21 17:13:39,133,176
5,train-baseline,1,2018-06-21 22:22:37,104,146
6,train-focus,1,2018-06-21 22:24:49,141,202
7,train-breathe,1,2018-06-21 22:28:19,128,165
8,train-focus,1,2018-06-21 22:32:17,130,187
9,train-breathe,1,2018-06-21 22:35:42,135,171


In [None]:
# Summary statistics (pandas `describe`) of the filtered training sessions.
dfs.describe()

In [None]:
# Export the training sessions to a spreadsheet, leaving out the 'rr' column
# (presumably the raw RR-interval series -- confirm against consolidateFiles).
# os.path.join replaces the hard-coded "\\" separator so the export also works
# when DATA_PATH points at the Linux location.
dfs.drop(columns=['rr']).to_excel(os.path.join(PRE_PATH, "train_sessions.xlsx"))


---


# VISUALIZATION

---



In [None]:
# Call dv.boxplot_compare once per feature in cl.features_all, grouping the
# sessions of user 11 by activity.
for feature_name in cl.features_all:
    user_sessions = dfs.loc[dfs["user"] == 11]
    dv.boxplot_compare(user_sessions, feature_name, groupby="activity", min_examples=1)


---


# FRAGMENTATION

---



In [None]:
# One dict per training session (column name -> value), the input format
# passed to fr.gen_fragments_dataset below.
sessions = dfs.to_dict("records")

### Example dataset for 60-second fragments

In [None]:
# Fragment every training session with a fragment length of 60 and a crop of 5
# (same argument order as the multi-duration loop further down), then preview
# two random rows of the resulting table.
frags = fr.gen_fragments_dataset(sessions, 60, 5, RAW_PATH)
dff = pd.DataFrame(data=frags)
dff.sample(n=2)

In [None]:
# Plot the second-to-last training session together with the fragment dataset.
session_to_plot = dfs.iloc[-2]
dv.full_plot_sess(session_to_plot, dff, RAW_PATH)

### Generate datasets for multiple fragment sizes

In [None]:
# Fragment lengths to generate, one spreadsheet per value (presumably seconds,
# as in the 60-second example -- confirm against fragmentation.py).
durations = [60, 30, 20, 15, 10]
# Crop value forwarded unchanged to fr.gen_fragments_dataset.
crop = 10

for dr in durations:
    # os.path.join replaces the hard-coded '\\' separator, so the file lands in
    # PRE_PATH on any platform (the Windows result is unchanged).
    fname = os.path.join(PRE_PATH, 'df_train_' + str(dr) + '.xlsx')
    print('generating', fname, '...')
    ds = pd.DataFrame(fr.gen_fragments_dataset(sessions, dr, crop, RAW_PATH))
    print('resulting dataset:', len(ds), 'records')
    # The 'rr' column is not written to the spreadsheet.
    ds.drop(columns='rr').to_excel(fname)


---


# CLASSIFICATION

---



In [None]:
# Load the fragment dataset generated for duration 15.
# os.path.join replaces the hard-coded '\\' separator so the file is also found
# when DATA_PATH points at the Linux location.
# NOTE: this rebinds `dff`, which earlier held the 60-second example fragments.
dff = pd.read_excel(os.path.join(PRE_PATH, 'df_train_15.xlsx'))

In [None]:
# Fragments available for user 1, counted per activity (non-null 'start').
dff.loc[dff["user"] == 1].groupby("activity")["start"].count()

In [None]:
# Two-class subset: focus vs. breathe fragments only (baseline left out).
dfr = dff.loc[dff["activity"].isin(["train-focus", "train-breathe"])]

In [None]:
# Feature columns handed to the classification helpers.
features = cl.features_all

# Fixed seed: without it every run grows different forests, so the accuracy
# plots and confusion matrices below were not reproducible between runs.
RANDOM_STATE = 42

# Random forests of increasing size (10 to 100,000 trees), all using every
# available core (n_jobs=-1).
clf1 = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE)
clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=RANDOM_STATE)
clf3 = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=RANDOM_STATE)
clf4 = RandomForestClassifier(n_estimators=10000, n_jobs=-1, random_state=RANDOM_STATE)
clf5 = RandomForestClassifier(n_estimators=100000, n_jobs=-1, random_state=RANDOM_STATE)

In [None]:
# Fragments available per user (non-null 'start').
dff.groupby("user")["start"].count()

In [None]:
# Accuracy per user with the 10,000-tree forest on the full dataset `dff`.
# The user list comes from the `users` value set in the configuration cell
# instead of a duplicated literal, so the two cannot drift apart.
cmc.barplot_accuracy_per_user(dff, clf4, features, label='activity', users=users)

In [None]:
# Per-user matrices on the focus/breathe subset, using the 1,000-tree forest.
# Iterates over the `users` list from the configuration cell instead of a
# duplicated literal, so the two cannot drift apart.
for u in users:
    print('----', u, '----')
    cmc.plot_matrices(dfr[dfr.user == u], clf3, features, label='activity')

# NOTE: the "combined" run only pools users 1 and 3, not every configured user.
print('------ combined -------')
cmc.plot_matrices(dfr[dfr.user.isin([1, 3])], clf3, features, label='activity')