In [47]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import scale
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

import herv_preprocess as hpp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Parameter configuration

In [58]:
# Input data: CSV file the notebook loads its examples from
filename = 'data/12features/combined.csv'

# Cross-validation setting: one in every k examples is held out for
# validation and the remaining ones are used for training.
k = 5

# Activity labels used by the preprocessing cells: grouplist is passed to
# hpp.groupActivities; includelist is only referenced by a currently
# commented-out hpp.filterActivities call.
includelist = ('eat', 'focused-active')
grouplist = ('household-chores', 'movement')

### 1 - Extract data from the csv file and split it into labels and features

In [59]:
# Load the examples from the CSV file named by `filename`.
data = hpp.getData(filename)
# Disabled for this run: presumably restricts `data` to the activities listed
# in `includelist` -- verify against herv_preprocess before re-enabling.
#data = hpp.filterActivities(data, includelist)
# Print the first record as a quick sanity check of what was loaded.
print(data[0])

(1, 'eat', 'sit',  687.15997314,  74.40599823,  87.31600189,  68.35299683,  103.69999695,  40.04800034,  13.81700039,  312.,  1804.09997559,  3734.39990234,  771.40002441,  4.84110022)


In [60]:
# The return value of groupActivities is not used, so it is expected to
# relabel `data` in place -- TODO confirm against herv_preprocess.
hpp.groupActivities(data, grouplist)

# Split the records into groups (presumably one per activity label) and
# report how many groups resulted.
datagroups = hpp.getDataByActivity(np.array(data))
print(len(datagroups))

2


In [61]:
# Build the train and test partitions from the activity groups; the output
# printed under this cell reports the per-group train/test counts.
train, test = hpp.balanceTrainTestDatasets(datagroups)

# Separate each partition into its labels (l*) and feature rows (f*),
# handling the train partition first and the test partition second.
(ltrain, ftrain), (ltest, ftest) = map(hpp.splitExamples, (train, test))

in: 98 examples (79 for train and 19 for test)
out: 329 examples (264 for train and 65 for test)

Total: 343 train examples and 84 test examples 


In [62]:
# Scale the feature matrices. The result is written back to the same names,
# so re-running this cell on its own would scale already-scaled data;
# re-run the split cell first if this step has to be repeated.
ftrain, ftest = hpp.scaleFeatures(ftrain, ftest)

# One shape per line: train first, then test.
print(ftrain.shape, ftest.shape, sep="\n")

min and max values before scaling:  0.0 29716.0
min and max values after scaling:  -2.4363003381 10.5424862637
(343, 12)
(84, 12)


### 2 - Test classifiers with no cross-validation and predefined parameters (C=10, $\gamma$ = 0.1)

In [63]:
# Linear-kernel SVM with a hand-picked C, fitted on the training features
# and checked once against the test labels (no cross-validation here).
clf1 = svm.SVC(C=10, kernel='linear', cache_size=1000)
clf1.fit(ftrain, ltrain)
hpp.printResults(ltest, clf1.predict(ftest))

expected		result
------------------------------
in		in
in		in
in		in
in		in
in		out
in		out
in		out
in		in
in		in
in		in
in		in
in		out
in		in
in		in
in		in
in		in
in		out
in		in
in		in
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
------
Got 79 out of 84 right! :)


In [64]:
# RBF-kernel SVM with hand-picked C and gamma, fitted on the training
# features and checked once against the test labels (no cross-validation here).
clf2 = svm.SVC(C=10, gamma=0.1, kernel='rbf', cache_size=1000)
clf2.fit(ftrain, ltrain)
hpp.printResults(ltest, clf2.predict(ftest))

expected		result
------------------------------
in		in
in		in
in		in
in		in
in		out
in		out
in		out
in		in
in		in
in		in
in		in
in		in
in		in
in		in
in		in
in		in
in		in
in		in
in		in
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		in
out		out
out		out
out		out
out		out
out		in
out		out
out		out
out		out
out		out
out		out
out		out
out		in
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		in
out		out
out		out
out		in
out		in
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		in
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
out		out
------
Got 74 out of 84 right! :)


### 3 - Model selection: perform an exhaustive search, with k-fold cross-validation, over the parameter space consisting of:
* $C = 10^{i}$, with $-1 \leq i \leq 2$, for both linear and RBF kernels
* $\gamma = 10^{i}$, with $-2 \leq i \leq 1$, for the RBF kernel.

In [65]:
# Cross-validation splitter: k-1 stratified random splits, each holding out
# 20% of the training examples for validation. A fixed random_state makes the
# splits reproducible across kernel restarts, and means both grid searches
# below are scored on the same splits.
crossval = StratifiedShuffleSplit(n_splits=k-1, test_size=0.2, random_state=0)

# Search grid actually explored:
#   C     in {10^-1, 10^0, 10^1, 10^2}
#   gamma in {10^-2, 10^-1, 10^0, 10^1}
c_range = np.logspace(-1, 2, 4)
gamma_range = np.logspace(-2, 1, 4)

# The linear kernel has no gamma, so its grid varies C alone.
param_lin = dict(C=c_range)
param_rbf = dict(C=c_range, gamma=gamma_range)


In [66]:
# Exhaustive search over C for the linear kernel. GridSearchCV.fit returns
# the fitted search object, so construction and fitting are chained.
grid_lin = GridSearchCV(
    svm.SVC(kernel='linear', cache_size=1000),
    param_grid=param_lin,
    cv=crossval,
).fit(ftrain, ltrain)

# Report the winning parameter combination and its mean validation score.
print("Kernel linear --- ")
print("Best params: %s with score %0.5f" % (grid_lin.best_params_, grid_lin.best_score_))

Kernel linear --- 
Best params: {'C': 1.0} with score 0.80072


In [67]:
# Exhaustive search over (C, gamma) for the RBF kernel. GridSearchCV.fit
# returns the fitted search object, so construction and fitting are chained.
grid_rbf = GridSearchCV(
    svm.SVC(kernel='rbf', cache_size=1000),
    param_grid=param_rbf,
    cv=crossval,
).fit(ftrain, ltrain)

# Report the winning parameter combination and its mean validation score.
print("Kernel RBF --- ")
print("Best params: %s with score %0.5f" % (grid_rbf.best_params_, grid_rbf.best_score_))

Kernel RBF --- 
Best params: {'C': 1.0, 'gamma': 0.10000000000000001} with score 0.85507


In [None]:
# NOTE(review): this cell has no execution count, and `labels` and `examples`
# are not defined in any cell visible in this notebook, so running it as-is
# would presumably raise NameError. If it did run, it would overwrite the
# ltrain/ltest/ftrain/ftest built by the earlier cells -- confirm whether this
# leftover cell is still needed.
ltrain, ltest, ftrain, ftest = hpp.trainAndTestDatasets(labels, examples, k )