In [77]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

import hervpd as hp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
df = pd.read_csv( 'data/12features/combined.csv')
print(df.activity.unique())
df.describe()


['eat' 'focused-active' 'focused-passive' 'household-chores'
 'leisure-active' 'leisure-passive' 'movement' 'sleep' 'rest-active']


Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,758.6,72.865018,81.37714,67.362033,108.560417,46.409227,20.082227,358.020221,2877.387599,1423.919671,1000.824001,2.753764
std,0.913249,129.785003,32.960406,13.75457,10.84968,36.385572,32.537407,18.316263,146.885899,3619.576847,1443.217366,1841.656665,2.296974
min,0.0,475.73,17.833,51.628,41.437,58.117,4.8825,0.0,83.0,85.654,57.852,7.9777,0.15468
25%,0.0,670.3625,51.8715,73.23225,60.29275,91.47025,27.43875,6.593075,258.0,995.1825,591.3075,258.9175,1.257
50%,2.0,750.545,67.057,79.942,66.1815,100.52,37.9945,15.4175,339.0,1807.95,1040.35,512.09,2.03735
75%,2.0,819.3075,85.82,89.504,74.28525,112.58,54.76225,26.7905,426.25,3304.25,1697.95,960.045,3.579975
max,2.0,1162.2,264.25,126.12,104.9,550.32,240.47,98.232,1103.0,29716.0,12958.0,16103.0,18.228


### Apply scale to make all features go to mean = 0 and SD = 1. That will allow SVM kernels not to put too much weight on features with larger values.

In [97]:
df.iloc[:, 3:15] = df.iloc[:, 3:15].apply(lambda x: scale(x))
df.describe()



Unnamed: 0,user,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
count,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0,544.0
mean,1.1875,-3.485774e-16,-5.37152e-16,-3.424548e-16,7.265430000000001e-17,-1.136754e-16,1.8367660000000002e-17,5.571524e-17,-5.1429450000000005e-17,1.20002e-16,7.918502000000001e-17,2.06126e-17,-9.051175000000001e-17
std,0.913249,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092,1.00092
min,0.0,-2.181534,-1.671177,-2.164846,-2.391674,-1.387634,-1.277451,-1.097424,-1.874062,-0.7719972,-0.9474144,-0.5396012,-1.132567
25%,0.0,-0.6805001,-0.6375176,-0.5927038,-0.6521657,-0.4701286,-0.5835725,-0.7371355,-0.6815649,-0.5204856,-0.5774449,-0.4032181,-0.652224
50%,2.0,-0.0621213,-0.1763742,-0.1044351,-0.1089082,-0.2211816,-0.258855,-0.2549112,-0.1296089,-0.2957312,-0.2660186,-0.2656216,-0.3121818
75%,2.0,0.4681849,0.3934085,0.5913918,0.6386906,0.1105736,0.2569569,0.3665839,0.4649362,0.1180401,0.1900494,-0.02216294,0.3600263
max,2.0,3.112621,5.811856,3.255939,3.463007,12.15224,5.969725,4.270615,5.076494,7.421673,7.999277,8.207869,6.742991


### Test dataset will have 20% of examples, the other 80% will be for training (with cross-validation)

In [104]:
train, test = train_test_split(df, test_size=0.2)
print (len(train), len(test))

435 109


### 3 - Model selection:  Perform an exhaustive search, with k-fold cross-validation,  in the space parameter consisting of:
* $C = 10^{i}, com -1 \leq i \leq 6$, for both linear and RBF kernels
* $\gamma = 10^{i}, com -3 \leq i \leq 3$, for RBF kernel.

In [105]:
crossval = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

c_range = np.logspace(-1, 2, 4) 
gamma_range = np.logspace(-2, 1, 4)

param_lin=dict(C=c_range)
param_rbf=dict(C=c_range, gamma=gamma_range)


In [106]:
grid_lin = GridSearchCV(svm.SVC(kernel='linear', cache_size=1000), param_grid=param_lin, cv=crossval)
grid_lin.fit(X=train.iloc[:, 3:15], y=train['activity'])

print("Kernel linear --- ")
print("Best params: %s with score %0.5f" % (grid_lin.best_params_, grid_lin.best_score_))

Kernel linear --- 
Best params: {'C': 100.0} with score 0.38506


In [107]:
grid_rbf = GridSearchCV(svm.SVC(kernel='rbf', cache_size=1000), param_grid=param_rbf, cv=crossval)
grid_rbf.fit(X=train.iloc[:, 3:15], y=train['activity'])

print("Kernel RBF --- ")
print("Best params: %s with score %0.5f" % (grid_rbf.best_params_, grid_rbf.best_score_))

Kernel RBF --- 
Best params: {'C': 100.0, 'gamma': 0.10000000000000001} with score 0.46552


In [108]:
clf1 = svm.SVC(kernel='linear', cache_size=1000, C=grid_lin.best_params_['C'])
clf1.fit(X=train.iloc[:, 3:15], y=train['activity'])
hpp.printResults(test['activity'].values, clf1.predict(test.iloc[:, 3:15]))

clf2 = svm.SVC(kernel='rbf', cache_size=1000, C=grid_rbf.best_params_['C'], gamma=grid_rbf.best_params_['gamma'])
clf2.fit(X=train.iloc[:, 3:15], y=train['activity'])
hpp.printResults(test['activity'].values, clf1.predict(test.iloc[:, 3:15]))

------
Got 42 out of 109 right! :)
------
Got 42 out of 109 right! :)


## Now, we can run the same steps above with multiple experiments! Time to have fun!!

In [114]:
lessactiv = hp.excludeActivities(df, ['sleep', 'rest-active', 'leisure-active'])
print(lessactiv.activity.unique())
hp.runFlow(lessactiv)

ju =  df.loc[df['user'] == 0]
hp.runFlow(ju)

ron = df.loc[df['user'] == 1]
hp.runFlow(ron)

edu = df.loc[df['user'] == 2]
hp.runFlow(edu)

['eat' 'focused-active' 'focused-passive' 'household-chores'
 'leisure-passive' 'movement']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


348 train examples and 88 test examples
Best params for linear kernel: {'C': 100.0} with score 0.40000
Best params for RBF kernel: {'C': 1.0, 'gamma': 0.10000000000000001} with score 0.46786
--- test results for linear kernel:
46 out of 88 right! :)
--- test results for RBF kernel:
46 out of 88 right! :)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


148 train examples and 37 test examples
Best params for linear kernel: {'C': 10.0} with score 0.55000
Best params for RBF kernel: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.50000
--- test results for linear kernel:
20 out of 37 right! :)
--- test results for RBF kernel:
20 out of 37 right! :)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


57 train examples and 15 test examples
Best params for linear kernel: {'C': 1.0} with score 0.64583
Best params for RBF kernel: {'C': 10.0, 'gamma': 0.01} with score 0.64583
--- test results for linear kernel:
10 out of 15 right! :)
--- test results for RBF kernel:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


10 out of 15 right! :)
229 train examples and 58 test examples
Best params for linear kernel: {'C': 1.0} with score 0.42391
Best params for RBF kernel: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.47826
--- test results for linear kernel:
23 out of 58 right! :)
--- test results for RBF kernel:
23 out of 58 right! :)


Unnamed: 0,user,activity,posture,AVGNN,SDNN,MeanHR,MinHR,MaxHR,RMSSD,pNNxx,TINN,powerVLF,powerLF,powerHF,ratioHFLF
0,0,eat,sit,-0.588436,0.902062,0.477434,0.313013,0.790589,1.287573,0.485631,0.974303,0.213346,-0.189418,0.202600,-0.834889
1,0,eat,sit,-0.853271,1.001970,0.816542,0.572245,1.190566,1.738393,0.690993,1.171917,-0.332523,0.192650,0.841743,-0.909128
2,0,eat,sit,-0.696406,-0.057699,0.611476,0.746420,0.561166,0.325057,-0.232206,0.742618,-0.708316,0.324283,-0.278426,0.487191
3,0,eat,sit,-0.603552,-0.466139,0.495845,0.516155,-0.207702,-0.115057,-0.583654,-0.163680,-0.688934,-0.103489,-0.302367,0.049779
4,0,eat,sit,-1.129675,-0.653931,1.211101,1.422453,0.191449,-0.758294,-0.930140,-0.504394,-0.255621,-0.549369,-0.423032,0.037578
5,0,eat,sit,-1.078543,-0.869722,1.134620,1.089971,0.097644,-0.701753,-0.826448,-0.926879,-0.621569,-0.175131,-0.372477,0.417993
6,0,eat,sit,-1.437081,-1.093833,1.708121,1.935197,0.291031,-0.562739,-0.972038,-0.579351,-0.628388,-0.749710,-0.510912,1.259220
7,0,focused-active,sit,1.968139,4.687049,-1.615068,-2.259751,-0.325027,4.366402,2.688652,4.531353,6.216003,6.628161,6.135001,-0.810599
8,0,focused-active,sit,3.112621,4.832812,-2.164846,-2.391674,-0.397622,5.969725,3.150579,3.366112,7.421673,5.194418,6.212720,-0.887537
9,0,focused-active,sit,2.713903,4.909338,-1.989979,-2.214270,-0.693974,5.197596,2.980956,4.681267,2.381919,4.629326,8.207869,-0.980810
