In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import hervpd as hp

In [3]:
df = pd.read_csv( 'data/12features/combined.csv')
print(df.activity.unique())
df.describe()

['eat' 'focused-active' 'focused-passive' 'household-chores'
 'leisure-active' 'leisure-passive' 'movement' 'sleep' 'rest-active']


Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,758.6,72.865018,81.37714,67.362033,108.560417,46.409227,20.082227,358.020221,2877.387599,1423.919671,1000.824001,2.753764
std,0.913249,129.785003,32.960406,13.75457,10.84968,36.385572,32.537407,18.316263,146.885899,3619.576847,1443.217366,1841.656665,2.296974
min,0.0,475.73,17.833,51.628,41.437,58.117,4.8825,0.0,83.0,85.654,57.852,7.9777,0.15468
25%,0.0,670.3625,51.8715,73.23225,60.29275,91.47025,27.43875,6.593075,258.0,995.1825,591.3075,258.9175,1.257
50%,2.0,750.545,67.057,79.942,66.1815,100.52,37.9945,15.4175,339.0,1807.95,1040.35,512.09,2.03735
75%,2.0,819.3075,85.82,89.504,74.28525,112.58,54.76225,26.7905,426.25,3304.25,1697.95,960.045,3.579975
max,2.0,1162.2,264.25,126.12,104.9,550.32,240.47,98.232,1103.0,29716.0,12958.0,16103.0,18.228


In [8]:
df.groupby('activity').count()['AVGNN']

activity
eat                 53
focused-active      88
focused-passive     73
household-chores    68
leisure-active      14
leisure-passive     89
movement            65
rest-active         19
sleep               75
Name: AVGNN, dtype: int64

### Apply scale to make all features go to mean = 0 and SD = 1. That will allow SVM kernels not to put too much weight on features with larger values.

In [13]:
#df.iloc[:, 3:15] = df.iloc[:, 3:15].apply(lambda x: scale(x))
df = hp.scaleWithinUser(df)
df.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,-2.424531e-16,-6.612358000000001e-17,1.1836940000000001e-17,5.698056e-16,-2.314325e-16,8.245039e-17,-1.857175e-17,1.7143150000000003e-17,3.183728e-17,7.836868000000001e-17,-1.75105e-16,3.673532e-18
std,0.913249,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092
min,0.0,-2.843791,-2.873066,-2.460395,-2.553288,-2.698142,-2.905886,-2.207686,-2.721551,-1.275233,-1.467678,-1.552781,-1.441435
25%,0.0,-0.6640136,-0.7040336,-0.6410388,-0.6684354,-0.5597017,-0.7309028,-0.7525458,-0.6982344,-0.5539669,-0.577282,-0.5662694,-0.6796624
50%,2.0,-0.0655166,-0.1676111,-0.09715803,-0.04039264,-0.1973524,-0.1655967,-0.2414187,-0.1085526,-0.306594,-0.3017131,-0.3295817,-0.2990694
75%,2.0,0.5280744,0.4666745,0.5956143,0.5992392,0.4212228,0.5763606,0.4514282,0.5244985,0.1528431,0.1965599,0.3533213,0.3809616
max,2.0,3.751426,4.527089,3.880179,5.022477,8.01763,4.081758,4.1198,3.771926,6.268217,5.81662,6.204422,5.978021


### Test dataset will have 20% of examples, the other 80% will be for training (with cross-validation)

In [14]:
train, test = train_test_split(df, test_size=0.2)
print (len(train), len(test))

435 109


### 3 - Model selection:  Perform an exhaustive search, with k-fold cross-validation,  in the space parameter consisting of:
* $C = 10^{i}, com -1 \leq i \leq 6$, for both linear and RBF kernels
* $\gamma = 10^{i}, com -3 \leq i \leq 3$, for RBF kernel.

In [15]:
crossval = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

c_range = np.logspace(-1, 2, 4) 
gamma_range = np.logspace(-2, 1, 4)

param_lin=dict(C=c_range)
param_rbf=dict(C=c_range, gamma=gamma_range)


In [20]:
grid_lin = GridSearchCV(svm.SVC(kernel='linear', cache_size=1000), param_grid=param_lin, cv=crossval)
grid_lin.fit(X=train.iloc[:, 3:15], y=train['activity'])

print("Kernel linear --- ")
print("Best params: %s with score %0.5f" % (grid_lin.best_params_, grid_lin.best_score_))

clf1 = svm.SVC(kernel='linear', cache_size=1000, C=grid_lin.best_params_['C'])
clf1.fit(X=train.iloc[:, 3:15], y=train['activity'])
hp.report(test, clf1.predict(test.iloc[:, 3:15]))

Kernel linear --- 
Best params: {'C': 10.0} with score 0.43391


TypeError: report() missing 1 required positional argument: 'labelName'

In [17]:
grid_rbf = GridSearchCV(svm.SVC(kernel='rbf', cache_size=1000), param_grid=param_rbf, cv=crossval)
grid_rbf.fit(X=train.iloc[:, 3:15], y=train['activity'])

print("Kernel RBF --- ")
print("Best params: %s with score %0.5f" % (grid_rbf.best_params_, grid_rbf.best_score_))

Kernel RBF --- 
Best params: {'C': 100.0, 'gamma': 0.01} with score 0.42529


In [18]:


clf2 = svm.SVC(kernel='rbf', cache_size=1000, C=grid_rbf.best_params_['C'], gamma=grid_rbf.best_params_['gamma'])
clf2.fit(X=train.iloc[:, 3:15], y=train['activity'])
hp.printResults(test['activity'].values, clf1.predict(test.iloc[:, 3:15]))

AttributeError: module 'hervpd' has no attribute 'printResults'


## Now, we can run the same steps above with multiple experiments! Time to have fun!!

In [None]:
# exclude from df activities with too few examples

df = pd.read_csv( 'data/12features/combined.csv')
df = hp.excludeActivities(df, ['leisure-active',  'rest-active'])
hp.runFlow(df)

In [None]:
hp.runFlowByUser(df)

### 2 - Removing activities from the list

In [None]:
df_reduced = hp.filterActivities(df, ['focused-active', 'focused-passive', 'leisure-passive'])
df_reduced.groupby('activity').count()['user']

In [None]:
hp.runFlow(df_reduced)

In [None]:
hp.runFlowByUser(df_reduced)

### 1 - All activities per user

In [None]:
hp.runFlow(ju)

In [None]:
hp.runFlow(ron)

In [None]:
hp.runFlow(edu)

### 2 - Splitting database into movement categories (movement, household chores and exercise) and all others

In [None]:
includelist = ['movement', 'household-chores']

df_move = hp.addPartition(df, includelist, pname='move', labelIn='move', labelOut='still')
df_move.groupby('move').count()['user']

In [None]:
hp.runFlow(df_move, labelName='move')

In [None]:
jumove = hp.addPartition(ju, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(jumove, labelName='move')

In [None]:
edumove = hp.addPartition(edu, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(edumove, labelName='move')

In [None]:
ronmove = hp.addPartition(ron, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(ronmove, labelName='move')

In [None]:
includelist
dfnomove = hp.excludeActivities(df, includelist)
dfnomove.describe()
hp.runFlow(dfnomove)

In [None]:
[x for x in df['activity']]