In [47]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

import herv_preprocess as hpp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Parameter configuration

In [112]:
# Input data: path of the CSV file that hpp.getData reads.
filename = 'data/12features/ju.csv'
# Cross-validation setting: one in every k examples is held out for
# validation, the remaining ones are used for training.
k = 5

# Activity names for the optional hpp.filterActivities step (currently disabled).
includelist = ('eat', 'focused-active')
# Activity names handed to hpp.groupActivities.
grouplist = ('household-chores', 'movement')

### 1 - Extract data from the csv file and split it into labels and features

In [113]:
# Load all examples from the configured CSV file through the project helper.
data = hpp.getData(filename)
# Disabled step: presumably keeps only the activities named in `includelist` -- TODO confirm before re-enabling.
#data = hpp.filterActivities(data, includelist)
# Print the first record as a quick sanity check of the parsed layout.
print(data[0])

(0, 'eat', 'sit',  682.29998779,  102.56999969,  87.93800354,  70.75499725,  137.30000305,  88.26499939,  28.96899986,  501.,  3648.89990234,  1150.80004883,  1373.59997559,  0.83780998)


In [114]:
# The return value is discarded, so groupActivities presumably relabels `data`
# in place according to `grouplist` -- TODO confirm against herv_preprocess.
hpp.groupActivities(data, grouplist)
# Partition the examples by activity; the printed length is the number of groups.
datagroups = hpp.getDataByActivity(np.array(data))
print (len(datagroups))

2


In [115]:
# Build the train and test sets from the per-activity groups; the per-group
# and total counts in the cell output are printed by the helper itself.
train, test = hpp.balanceTrainTestDatasets(datagroups)
# Separate each set into labels (l*) and features (f*), used below as y and X.
ltrain, ftrain = hpp.splitExamples(train)
ltest, ftest = hpp.splitExamples(test)

in: 55 examples (44 for train and 11 for test)
out: 130 examples (104 for train and 26 for test)

Total: 148 train examples and 37 test examples 


In [116]:
# Scale both feature matrices with the project helper (it prints the min/max
# values before and after scaling).
# NOTE(review): the result is assigned back to ftrain/ftest, so re-running this
# cell on its own feeds already-scaled features to scaleFeatures again; re-run
# from the cell that calls hpp.splitExamples instead.
ftrain, ftest = hpp.scaleFeatures(ftrain, ftest)
# Sanity check: shape of the scaled train and test feature matrices.
print(ftrain.shape)
print(ftest.shape)

min and max values before scaling:  0.154679998755 29716.0
min and max values after scaling:  -2.55328795792 8.0176301504
(148, 12)
(37, 12)


### 2 - Test classifiers with no cross-validation and predefined parameters (C = 10, $\gamma$ = 0.1)

In [117]:
# Baseline 1: linear-kernel SVM with a fixed C, trained on the full training
# set. fit() returns the fitted estimator, so construction and training chain.
clf1 = svm.SVC(kernel='linear', cache_size=1000, C=10).fit(X=ftrain, y=ltrain)
# Compare the predictions on the held-out test features with the true labels.
hpp.printResults(
    ltest,
    clf1.predict(ftest),
)

------
Got 24 out of 37 right! :)


In [122]:
# Baseline 2: RBF-kernel SVM with fixed C and gamma, trained on the full
# training set. fit() returns the fitted estimator, so the calls are chained.
clf2 = svm.SVC(kernel='rbf', cache_size=1000, C=10, gamma=0.1).fit(X=ftrain, y=ltrain)
# Compare the predictions on the held-out test features with the true labels.
hpp.printResults(
    ltest,
    clf2.predict(ftest),
)

------
Got 24 out of 37 right! :)


### 3 - Model selection: perform an exhaustive grid search, with cross-validation over $k-1$ stratified shuffle splits, in the parameter space consisting of:
* $C = 10^{i}$, with $-1 \leq i \leq 2$, for both linear and RBF kernels
* $\gamma = 10^{i}$, with $-2 \leq i \leq 1$, for the RBF kernel.

In [119]:
# Stratified shuffle-split cross-validation: k-1 random splits, each holding
# out 20% of the training examples for validation. The fixed random_state makes
# the splits -- and therefore the grid-search scores computed from them --
# reproducible across kernel restarts.
crossval = StratifiedShuffleSplit(n_splits=k-1, test_size=0.2, random_state=0)

# C varies from 10^(-1) to 10^2, gamma from 10^(-2) to 10^1 (4 values each)
c_range = np.logspace(-1, 2, 4)
gamma_range = np.logspace(-2, 1, 4)

# The linear kernel only tunes C; the RBF kernel is searched over C x gamma.
param_lin = dict(C=c_range)
param_rbf = dict(C=c_range, gamma=gamma_range)


In [120]:
# Exhaustive search over C for the linear kernel, scored with the `crossval`
# splitter. fit() returns the fitted search object, so the calls are chained.
grid_lin = GridSearchCV(
    svm.SVC(kernel='linear', cache_size=1000),
    param_grid=param_lin,
    cv=crossval,
).fit(X=ftrain, y=ltrain)

# Report the best parameter combination and its mean cross-validated score.
print("Kernel linear --- ")
print(
    "Best params: %s with score %0.5f"
    % (grid_lin.best_params_, grid_lin.best_score_)
)

Kernel linear --- 
Best params: {'C': 10.0} with score 0.97500


In [121]:
# Exhaustive search over the C x gamma grid for the RBF kernel, scored with the
# `crossval` splitter. fit() returns the fitted search object, so the calls chain.
grid_rbf = GridSearchCV(
    svm.SVC(kernel='rbf', cache_size=1000),
    param_grid=param_rbf,
    cv=crossval,
).fit(X=ftrain, y=ltrain)

# Report the best parameter combination and its mean cross-validated score.
print("Kernel RBF --- ")
print(
    "Best params: %s with score %0.5f"
    % (grid_rbf.best_params_, grid_rbf.best_score_)
)

Kernel RBF --- 
Best params: {'C': 10.0, 'gamma': 0.10000000000000001} with score 0.98333


In [None]:
# NOTE(review): call disabled. `labels` and `examples` are not defined by any
# visible cell of this notebook, so on a fresh kernel (Restart & Run All) this
# line raises NameError. It would also rebind ltrain/ltest/ftrain/ftest,
# discarding the balanced and scaled datasets prepared in section 1.
# Define its inputs (and use new variable names) before re-enabling it.
# ltrain, ltest, ftrain, ftest = hpp.trainAndTestDatasets(labels, examples, k )