In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import hervpd as hp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the combined 12-feature dataset and list the activity labels it contains.
df = pd.read_csv('data/12features/combined.csv')
activity_labels = df['activity'].unique()
print(activity_labels)
# Summary statistics of the numeric columns, rendered as the cell's result.
df.describe()


['eat' 'focused-active' 'focused-passive' 'household-chores'
 'leisure-active' 'leisure-passive' 'movement' 'sleep' 'rest-active']


Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,758.6,72.865018,81.37714,67.362033,108.560417,46.409227,20.082227,358.020221,2877.387599,1423.919671,1000.824001,2.753764
std,0.913249,129.785003,32.960406,13.75457,10.84968,36.385572,32.537407,18.316263,146.885899,3619.576847,1443.217366,1841.656665,2.296974
min,0.0,475.73,17.833,51.628,41.437,58.117,4.8825,0.0,83.0,85.654,57.852,7.9777,0.15468
25%,0.0,670.3625,51.8715,73.23225,60.29275,91.47025,27.43875,6.593075,258.0,995.1825,591.3075,258.9175,1.257
50%,2.0,750.545,67.057,79.942,66.1815,100.52,37.9945,15.4175,339.0,1807.95,1040.35,512.09,2.03735
75%,2.0,819.3075,85.82,89.504,74.28525,112.58,54.76225,26.7905,426.25,3304.25,1697.95,960.045,3.579975
max,2.0,1162.2,264.25,126.12,104.9,550.32,240.47,98.232,1103.0,29716.0,12958.0,16103.0,18.228


### Standardize all features to mean = 0 and SD = 1, so that the SVM kernels do not put too much weight on features with larger values.

In [None]:
# Earlier approach, kept for reference: scale columns 3..14 over the whole dataset at once.
#df.iloc[:, 3:15] = df.iloc[:, 3:15].apply(lambda x: scale(x))
# Scale through the project helper instead.
# NOTE(review): presumably this standardizes each user's rows separately — confirm in hervpd.
# NOTE: `df` is rebound to the helper's result, so re-running this cell feeds
# already-scaled data back in; reload the CSV before re-running.
df = hp.scaleWithinUser(df)
df.describe()

### Test dataset will have 20% of examples, the other 80% will be for training (with cross-validation)

In [None]:
# Hold out 20% of the rows as the final test set; the remaining 80% is used for
# training and cross-validated model selection.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across "Restart Kernel -> Run All".
train, test = train_test_split(df, test_size=0.2, random_state=42)
print(len(train), len(test))

### 3 - Model selection: perform an exhaustive grid search, with stratified shuffle-split cross-validation, over the parameter space consisting of:
* $C = 10^{i}$, with $-1 \leq i \leq 2$, for both linear and RBF kernels
* $\gamma = 10^{i}$, with $-2 \leq i \leq 1$, for the RBF kernel.

In [None]:
# Four random stratified splits of the training data, each holding out 20% for validation.
# random_state is fixed so that grid-search scores are reproducible between runs.
crossval = StratifiedShuffleSplit(n_splits=4, test_size=0.2, random_state=42)

# Candidate hyper-parameter values, one per decade on a log scale:
#   C     in 10^-1 .. 10^2
#   gamma in 10^-2 .. 10^1
c_range = np.logspace(start=-1, stop=2, num=4)
gamma_range = np.logspace(start=-2, stop=1, num=4)

# Search grids: the linear kernel only tunes C, the RBF kernel tunes C and gamma.
param_lin = {'C': c_range}
param_rbf = {'C': c_range, 'gamma': gamma_range}


In [None]:
# Grid-search C for a linear-kernel SVM over the cross-validation splits.
# Columns 3..14 are used as the inputs and 'activity' as the target label.
train_features = train.iloc[:, 3:15]
train_labels = train['activity']

linear_svc = svm.SVC(kernel='linear', cache_size=1000)
grid_lin = GridSearchCV(linear_svc, param_grid=param_lin, cv=crossval)
grid_lin.fit(X=train_features, y=train_labels)

# Report the winning parameter set and its cross-validation score.
print("Kernel linear --- ")
lin_summary = (grid_lin.best_params_, grid_lin.best_score_)
print("Best params: %s with score %0.5f" % lin_summary)

In [None]:
# Grid-search C and gamma for an RBF-kernel SVM over the cross-validation splits.
# Columns 3..14 are used as the inputs and 'activity' as the target label.
train_features = train.iloc[:, 3:15]
train_labels = train['activity']

rbf_svc = svm.SVC(kernel='rbf', cache_size=1000)
grid_rbf = GridSearchCV(rbf_svc, param_grid=param_rbf, cv=crossval)
grid_rbf.fit(X=train_features, y=train_labels)

# Report the winning parameter set and its cross-validation score.
print("Kernel RBF --- ")
rbf_summary = (grid_rbf.best_params_, grid_rbf.best_score_)
print("Best params: %s with score %0.5f" % rbf_summary)

In [None]:
# Refit each kernel on the full training set using the best hyper-parameters
# found by the grid searches, then report results on the held-out test set.
# Columns 3..14 are the model inputs; 'activity' is the target label.

# Linear kernel.
clf1 = svm.SVC(kernel='linear', cache_size=1000, C=grid_lin.best_params_['C'])
clf1.fit(X=train.iloc[:, 3:15], y=train['activity'])
hp.printResults(test['activity'].values, clf1.predict(test.iloc[:, 3:15]))

# RBF kernel.
clf2 = svm.SVC(kernel='rbf', cache_size=1000, C=grid_rbf.best_params_['C'], gamma=grid_rbf.best_params_['gamma'])
clf2.fit(X=train.iloc[:, 3:15], y=train['activity'])
# Predict with clf2 here: predicting with clf1 would just repeat the linear
# kernel's results under the RBF heading.
hp.printResults(test['activity'].values, clf2.predict(test.iloc[:, 3:15]))

## Now, we can run the same steps above with multiple experiments! Time to have fun!!

In [23]:
# Reload the dataset from disk and hand it to the project's one-call pipeline.
# NOTE(review): presumably runFlow repeats the scale / split / grid-search / test
# steps shown in the cells above — confirm in hervpd.
# NOTE(review): every recorded run in this notebook reports identical test counts
# for the linear and RBF kernels; verify that runFlow evaluates the RBF classifier
# and not the linear one twice.
df = pd.read_csv( 'data/12features/combined.csv')
hp.runFlow(df)
df.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


435 train examples and 109 test examples
Best params for linear kernel: {'C': 1.0} with score 0.45977
Best params for RBF kernel: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.43103
--- test results for linear kernel:
39 out of 109 right! :)
--- test results for RBF kernel:
39 out of 109 right! :)


### 1 - All activities per user

In [None]:
# One subset per user id (0, 1, 2), in that order.
ju, ron, edu = (hp.userRows(df, user_id) for user_id in (0, 1, 2))

# Run the same flow on each user's rows.
for user_rows in (ju, ron, edu):
    hp.runFlow(user_rows)

### 2 - Splitting the dataset into movement activities (movement and household chores) and all others

In [31]:
# Activity labels that count as "moving".
includelist = ['movement', 'household-chores']

# Build a copy of the data with a binary partition named 'move'.
# NOTE(review): presumably rows whose activity is in includelist get labelIn
# ('move') and all other rows get labelOut ('still') — confirm in hervpd.addPartition.
dfmove = hp.addPartition(df, includelist, pname='move', labelIn='move', labelOut='still')
# Run the flow with 'move' as the label name instead of the default.
hp.runFlow(dfmove, labelName='move')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


435 train examples and 109 test examples
Best params for linear kernel: {'C': 0.10000000000000001} with score 0.90230
Best params for RBF kernel: {'C': 1.0, 'gamma': 0.01} with score 0.88218
--- test results for linear kernel:
96 out of 109 right! :)
--- test results for RBF kernel:
96 out of 109 right! :)


In [32]:
# Same move/still partition, restricted to the `ju` subset (hp.userRows(df, 0)).
# Depends on `ju` and `includelist` from earlier cells, which must be run first.
jumove = hp.addPartition(ju, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(jumove, labelName='move')



148 train examples and 37 test examples
Best params for linear kernel: {'C': 0.10000000000000001} with score 0.88333
Best params for RBF kernel: {'C': 1.0, 'gamma': 0.01} with score 0.90833
--- test results for linear kernel:
30 out of 37 right! :)
--- test results for RBF kernel:
30 out of 37 right! :)


In [34]:
# Same move/still partition, restricted to the `edu` subset (hp.userRows(df, 2)).
# Depends on `edu` and `includelist` from earlier cells, which must be run first.
edumove = hp.addPartition(edu, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(edumove, labelName='move')



229 train examples and 58 test examples
Best params for linear kernel: {'C': 10.0} with score 0.86413
Best params for RBF kernel: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.89674
--- test results for linear kernel:
49 out of 58 right! :)
--- test results for RBF kernel:
49 out of 58 right! :)


In [35]:
# Same move/still partition, restricted to the `ron` subset (hp.userRows(df, 1)).
# Depends on `ron` and `includelist` from earlier cells, which must be run first.
ronmove = hp.addPartition(ron, includelist, pname='move', labelIn='move', labelOut='still')
hp.runFlow(ronmove, labelName='move')



57 train examples and 15 test examples
Best params for linear kernel: {'C': 0.10000000000000001} with score 1.00000
Best params for RBF kernel: {'C': 1.0, 'gamma': 0.01} with score 0.97917
--- test results for linear kernel:
15 out of 15 right! :)
--- test results for RBF kernel:
15 out of 15 right! :)


In [37]:
# Remove the activities listed in includelist and run the flow on what remains.
# NOTE(review): presumably excludeActivities drops the matching rows — confirm in hervpd.
dfnomove = hp.excludeActivities(df, includelist)
# A bare expression is only rendered when it is the last line of a cell, so the
# summary has to be shown explicitly or it is silently discarded.
display(dfnomove.describe())
hp.runFlow(dfnomove)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


328 train examples and 83 test examples
Best params for linear kernel: {'C': 0.10000000000000001} with score 0.43182
Best params for RBF kernel: {'C': 100.0, 'gamma': 0.10000000000000001} with score 0.42424
--- test results for linear kernel:
34 out of 83 right! :)
--- test results for RBF kernel:
34 out of 83 right! :)
