In [50]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

import herv_preprocess as hpp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Parameter configuration

In [228]:
# Input data: path of the CSV file handed to hpp.getData.
filename = 'data/304_sessions_e.csv'
# Cross-validation parameter: one in each k examples is used for validation, the others for training.
k = 5

# Activity names passed to hpp.filterActivities.
includelist = ('movement', 'household-chores')
# Activity names passed to hpp.groupActivities. The trailing comma is required:
# ('movement') is just the string 'movement', not a one-element tuple.
grouplist = ('movement',)

### 1 - Extract data from the csv file and split it into labels and features

In [229]:
# Load the examples from the CSV file named in the configuration cell.
# NOTE(review): `dtype` is not defined in any cell visible here; it has to come from
# other kernel state — confirm this cell survives Restart & Run All.
data = hpp.getData(filename, dtype)
# Restrict the data using includelist (presumably keeps only those activities — verify in herv_preprocess).
data = hpp.filterActivities(data, includelist)
# The return value is discarded, so this presumably modifies `data` in place — TODO confirm.
hpp.groupActivities(data, grouplist)
# Partition the examples by activity and report how many groups came back.
datagroups = hpp.getDataByActivity(np.array(data))
print (len(datagroups))

2


In [230]:
# Build the train and test sets from the per-activity groups.
# The example counts and min/max lines in the output come from the hpp helpers;
# this cell itself only prints the two shapes.
train, test = hpp.balanceTrainTestDatasets(datagroups)
# Separate each set into labels (l*) and features (f*), following the section heading.
ltrain, ftrain = hpp.splitExamples(train)
ltest, ftest = hpp.splitExamples(test)
# Scale train and test features together; the scaled arrays replace the unscaled ones.
ftrain, ftest = hpp.scaleFeatures(ftrain, ftest)
# Sanity check on the resulting feature matrices.
print(ftrain.shape)
print(ftest.shape)

in: 30 examples (24 for train and 6 for test)
out: 40 examples (32 for train and 8 for test)

Total: 56 train examples and 14 test examples 
min and max values before scaling:  0.0 866.729980469
min and max values after scaling:  -2.29007073971 4.2907637833
(56, 8)
(14, 8)


### 2 - Test classifiers with no cross-validation and predefined parameters (C=1, $\gamma$ = 0.1)

In [231]:
# Baseline: linear-kernel SVM with a fixed C, fitted once on the training split (no cross-validation).
clf1 = svm.SVC(C=1, kernel='linear', cache_size=1000)
clf1.fit(ftrain, ltrain)

# Predict the test split and print expected vs. predicted labels side by side.
linear_predictions = clf1.predict(ftest)
hpp.printResults(ltest, linear_predictions)

expected		result
------------------------------
in		out
in		out
in		out
in		out
in		out
in		in
out		in
out		in
out		in
out		in
out		out
out		out
out		out
out		out
------
Got 5 out of 14 right! :)


In [232]:
# Baseline: RBF-kernel SVM with fixed C and gamma, fitted once on the training split (no cross-validation).
clf2 = svm.SVC(C=1, gamma=0.1, kernel='rbf', cache_size=1000)
clf2.fit(ftrain, ltrain)

# Predict the test split and print expected vs. predicted labels side by side.
rbf_predictions = clf2.predict(ftest)
hpp.printResults(ltest, rbf_predictions)

expected		result
------------------------------
in		out
in		out
in		out
in		out
in		out
in		in
out		in
out		out
out		in
out		in
out		out
out		out
out		out
out		in
------
Got 5 out of 14 right! :)


### 3 - Model selection: perform an exhaustive search, with cross-validation, over the parameter space consisting of:
* $C = 10^{i}$, with $-1 \leq i \leq 2$, for both linear and RBF kernels
* $\gamma = 10^{i}$, with $-2 \leq i \leq 1$, for the RBF kernel.

In [187]:
# Stratified shuffle-split cross-validation: k-1 random splits, each holding out 1/k of the
# examples for validation. The fixed random_state makes the splits, and therefore the
# grid-search scores, reproducible from one run to the next.
crossval = StratifiedShuffleSplit(n_splits=k-1, test_size=1.0/k, random_state=0)

# C varies from 10^(-1) to 10^2, gamma from 10^(-2) to 10^1 (four values each, one per decade)
c_range = np.logspace(-1, 2, 4)
gamma_range = np.logspace(-2, 1, 4)

# Parameter grids: only C for the linear kernel, C and gamma for the RBF kernel.
param_lin = dict(C=c_range)
param_rbf = dict(C=c_range, gamma=gamma_range)


In [193]:
# Exhaustive search over the linear-kernel grid, scored with the shared cross-validation splitter.
linear_svc = svm.SVC(kernel='linear', cache_size=1000)
grid_lin = GridSearchCV(linear_svc, param_grid=param_lin, cv=crossval)
grid_lin.fit(ftrain, ltrain)

# Report the winning parameter combination and its mean cross-validated score.
print("Kernel linear --- ")
lin_summary = (grid_lin.best_params_, grid_lin.best_score_)
print("Best params: %s with score %0.5f" % lin_summary)

Kernel linear --- 
Best params: {'C': 10.0} with score 0.77551


In [201]:
# Exhaustive search over the RBF-kernel grid (C and gamma), scored with the shared cross-validation splitter.
rbf_svc = svm.SVC(kernel='rbf', cache_size=1000)
grid_rbf = GridSearchCV(rbf_svc, param_grid=param_rbf, cv=crossval)
grid_rbf.fit(ftrain, ltrain)

# Report the winning parameter combination and its mean cross-validated score.
print("Kernel RBF --- ")
rbf_summary = (grid_rbf.best_params_, grid_rbf.best_score_)
print("Best params: %s with score %0.5f" % rbf_summary)

Kernel RBF --- 
Best params: {'C': 1.0, 'gamma': 0.10000000000000001} with score 0.79082


In [140]:
# NOTE(review): `labels` and `examples` are not defined in any cell visible here, and this call
# rebinds ltrain/ltest/ftrain/ftest, which the split-and-scale cell also assigns — confirm
# whether this cell still belongs to the intended top-to-bottom flow.
ltrain, ltest, ftrain, ftest = hpp.trainAndTestDatasets(labels, examples, k )

eat: 36 examples (29 for train and 7 for test)
focused-active: 43 examples (35 for train and 8 for test)
focused-passive: 40 examples (32 for train and 8 for test)
household-chores: 40 examples (32 for train and 8 for test)
leisure-passive: 49 examples (40 for train and 9 for test)
movement: 30 examples (24 for train and 6 for test)
rest-active: 19 examples (16 for train and 3 for test)
sleep: 43 examples (35 for train and 8 for test)

Total: 243 train examples and 57 test examples 
