In [83]:
# description of this dataset http://groupware.les.inf.puc-rio.br/har#ixzz2PyRdbAfA
from sklearn import datasets
from sklearn import preprocessing as pp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy
import csv
import pandas as pd
import time

le = pp.LabelEncoder() 
le.fit(['sitting', 'walking', 'sittingdown', 'standing', 'standingup'])

### Retrieving all data
overall = pd.read_csv("./dataset-har-PUC-Rio-ugulino.csv", delimiter=';', header='infer') 
data = overall.loc[:, "x1":"z4"].as_matrix() # has to be converted to ndarray in order to be processed by segment_signal()
targets = overall.loc[:,"class,,"].as_matrix() # double commas: looks like the researchers are naughty

# print type(data)
# print type(targets)
# print data.shape
# print targets.shape


def debug(item, shape=True):
    print item
    print type(item)
    if (shape == True):
        print item.shape()
    print "----------------------"

### Data segmentation: shall use a sudden change of sensor readings
### like if (x_pre - x_curr <= 1.0, do nothing)
### Range of Accelerometer sensor readings is +3g/-3g

# reading 14 sets of data in every 2 seconds. 
# For segmenting the data from online only. 
# each set of data is taken 150ms apart from another.
# so choosing a window size of 14 will be 2.1 seconds.


def segment_signal(data, window_size=14): 

    N = data.shape[0]
    dim = data.shape[1]
    K = N/window_size
    segments = numpy.empty((K, window_size, dim))
    for i in range(K):
        segment = data[i*window_size:i*window_size+window_size,:]
        segments[i] = numpy.vstack(segment)
    return segments

##!!!! questions: for normalization, should it be done right after loading csv or after segmenation? 
##!!!! Normalize() can't process nadarray with dimension > 2.
X = pp.normalize(data)
y = targets[::14] 
y = y[:-1]# -1 because it will have a extra set of data than X.

segs = segment_signal(X)

### feautre extraction // take the difference between sensors

### this method is to extract the difference between consecutive sensor readings.
## parameter raw is a 2D ndarray
## return a 2D ndarray
def extract_diff(raw):

    N = raw.shape[0] # number of sets of sensor readings
    dim = raw.shape[1] # number of values in each readings
    features = numpy.empty((N - 1, dim))
    for i in range(1, N):
        for j in range(dim):
            features[i-1][j] = raw[i][j] - raw[i-1][j]

    return features

def extract_diff_2(raw):

    N = raw.shape[0] # number of segments of sensor readings ()
    I = raw.shape[1] # number of sets of readings (14)
    J = raw.shape[2] # number of values in each set of readings (12)
    feature_num = (I - 1) * J
    feature = numpy.empty((feature_num))
    features = numpy.empty((N, feature_num))
    for n in range(N):
        idx = 0;
        for i in range(1, I):
            for j in range(J):
                feature[idx] = raw[n][i][j] - raw[n][i-1][j]
                idx += 1
        numpy.append(features[n], feature)
    return features

features = extract_diff_2(segs)

features

array([[  -3.,   -3.,   -1., ...,   -1.,   -5.,    0.],
       [  -4.,   -1.,   -4., ...,    0.,   -4.,   -1.],
       [  -3.,   -4.,   -1., ...,   -3.,    0.,   -2.],
       ..., 
       [-164., -165., -164., ..., -166., -166., -165.],
       [-165., -166., -165., ..., -164., -163., -163.],
       [-164., -163., -164., ..., -164., -163., -164.]])

In [82]:
segs

array([[[-0.01013149,  0.31069913, -0.21276136, ..., -0.50657466,
         -0.34784794, -0.49644317],
        [-0.01016021,  0.31835337, -0.21675123, ..., -0.50462397,
         -0.35222076, -0.49107701],
        [-0.00338375,  0.32822383, -0.2064088 , ..., -0.51094638,
         -0.35191009, -0.48726012],
        ..., 
        [ 0.01016086,  0.33869517, -0.19983015, ..., -0.50804276,
         -0.34885603, -0.48772105],
        [ 0.        ,  0.32600865, -0.20715133, ..., -0.5059926 ,
         -0.35317604, -0.4924089 ],
        [ 0.        ,  0.32804547, -0.20629664, ..., -0.5039049 ,
         -0.35171885, -0.49375916]],

       [[ 0.        ,  0.32896102, -0.20348104, ..., -0.50531125,
         -0.35270047, -0.49174585],
        [-0.00338788,  0.32523692, -0.20666096, ..., -0.5047948 ,
         -0.35911576, -0.49124326],
        [ 0.00676696,  0.33496449, -0.20300878, ..., -0.50752195,
         -0.34849841, -0.49060455],
        ..., 
        [-0.01015393,  0.32492566, -0.21323247, ...,

In [77]:
le = pp.LabelEncoder()
le.fit(targets)
list(le.classes_)

['sitting', 'sittingdown', 'standing', 'standingup', 'walking']

In [78]:
Targets = le.transform(targets)
Targets

array([0, 0, 0, ..., 4, 4, 4])

In [79]:
y[:-1].shape

(11829,)

In [80]:
print X.shape
print segs.shape
print features.shape
print y.shape

(165632, 12)
(11830, 14, 12)
(11830, 156)
(11830,)


In [19]:
features.shape

(13, 12)

In [62]:
y.shape

(11830,)

In [41]:
Y = y[::14]
Y.shape

(11831,)

In [52]:
segs.shape

(11830, 14, 12)

In [66]:
#having 15 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(15,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.917160
And the confusion matrix is: 
[[360   1   0   0   1]
 [  1  92   0   1   3]
 [  1   0 280  29   7]
 [  0   0  33  55   1]
 [  2   1  14   3 298]]
In the 1 fold, the classification accuracy is 0.935757
And the confusion matrix is: 
[[363   2   0   0   0]
 [  1  74   1   0   4]
 [  0   2 316  13  14]
 [  1   0  19  77   3]
 [  1   0  12   3 277]]
In the 2 fold, the classification accuracy is 0.921386
And the confusion matrix is: 
[[370   3   0   0   0]
 [  4  75   0   1   3]
 [  0   3 291  13  11]
 [  0   0  30  63   1]
 [  3   3  15   3 291]]
In the 3 fold, the classification accuracy is 0.921386
And the confusion matrix is: 
[[353   4   2   0   0]
 [  3  69   0   0   2]
 [  0   2 322  17   8]
 [  0   0  37  56   1]
 [  0   2  12   3 290]]
In the 4 fold, the classification accuracy is 0.923922
And the confusion matrix is: 
[[369   5   0   0   0]
 [ 10  62   1   0   1]
 [  0   5 325  10  11]
 [  0   1  33  55   0]
 [  0   1  10   2 2

In [67]:
#having 10 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(10,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.900254
And the confusion matrix is: 
[[351   6   0   0   0]
 [ 13  83   1   0   3]
 [  0  10 282  16   5]
 [  0   1  32  57   4]
 [  1   3  11  12 292]]
In the 1 fold, the classification accuracy is 0.914624
And the confusion matrix is: 
[[353   5   2   0   0]
 [  3  69   5   2   3]
 [  0   8 307  15   6]
 [  0   0  29  57   2]
 [  1   0  15   5 296]]
In the 2 fold, the classification accuracy is 0.907016
And the confusion matrix is: 
[[356  13   0   0   1]
 [  1  79   1   1   3]
 [  0   4 306  19   4]
 [  0   0  34  45   0]
 [  0   4  13  12 287]]
In the 3 fold, the classification accuracy is 0.918005
And the confusion matrix is: 
[[355   0   0   0   0]
 [  4  73   3   1   2]
 [  0   6 320  19  10]
 [  0   2  22  54   0]
 [  2   1  18   7 284]]
In the 4 fold, the classification accuracy is 0.918005
And the confusion matrix is: 
[[378   6   1   0   2]
 [  1  70   0   0   1]
 [  0   6 309  11  11]
 [  0   0  38  43   2]
 [  0   0  14   4 2

In [68]:
#having 25 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(25,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.935757
And the confusion matrix is: 
[[339   5   1   0   1]
 [  4  86   1   0   1]
 [  0   1 330  22  10]
 [  0   0  16  66   1]
 [  2   2   8   1 286]]
In the 1 fold, the classification accuracy is 0.945900
And the confusion matrix is: 
[[367   1   0   0   0]
 [  4  74   1   1   1]
 [  0   1 308  14  12]
 [  1   3  11  73   3]
 [  1   2   6   2 297]]
In the 2 fold, the classification accuracy is 0.930685
And the confusion matrix is: 
[[351   6   0   0   0]
 [  3  82   0   0   3]
 [  0   1 301  17  13]
 [  0   0  14  93   1]
 [  0   1  21   2 274]]
In the 3 fold, the classification accuracy is 0.924768
And the confusion matrix is: 
[[346   4   0   0   0]
 [  3  79   1   0   0]
 [  0   5 285  22  22]
 [  0   0  10  87   3]
 [  0   4  14   1 297]]
In the 4 fold, the classification accuracy is 0.924768
And the confusion matrix is: 
[[358   2   0   0   1]
 [  1  78   0   0   7]
 [  0   3 305  23  11]
 [  0   0  15  68   0]
 [  0   4  20   2 2

In [69]:
#having 50 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(50,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.927303
And the confusion matrix is: 
[[370   3   0   0   1]
 [  3  84   0   0   0]
 [  0   4 306  16   4]
 [  0   0  34  51   1]
 [  0   3  11   6 286]]
In the 1 fold, the classification accuracy is 0.961961
And the confusion matrix is: 
[[348   5   0   0   0]
 [  1  84   1   0   0]
 [  1   2 337   6   7]
 [  1   0  11  74   2]
 [  0   1   6   1 295]]
In the 2 fold, the classification accuracy is 0.937447
And the confusion matrix is: 
[[353   5   0   0   0]
 [  5  80   0   0   1]
 [  0   2 315  17  12]
 [  0   0  12  61   0]
 [  1   1  15   3 300]]
In the 3 fold, the classification accuracy is 0.927303
And the confusion matrix is: 
[[360   6   0   0   0]
 [  3  80   2   1   2]
 [  0   2 305  14   8]
 [  1   1  23  79   1]
 [  1   3  10   8 273]]
In the 4 fold, the classification accuracy is 0.953508
And the confusion matrix is: 
[[330   4   0   1   0]
 [  2  67   0   0   2]
 [  0   3 318  12   8]
 [  0   0   7  81   0]
 [  1   1  12   2 3

In [71]:
# having C = 50

from sklearn.svm import SVC

kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    svm = SVC(kernel = 'linear', C = 50).fit(features[train], y[train])
    svm_predictions = svm.predict(features[test])
    recall = recall_score(y[test], svm_predictions, average='macro') # extra line to calculate recall
    accuracy = svm.score(features[test], y[test])
    cm = confusion_matrix(y[test], svm_predictions)

    print('In the %i fold, the classification accuracy is %f and the recall is %f' %(fold_index, accuracy, recall))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.892646 and the recall is 0.850717
And the confusion matrix is: 
[[359   1   0   0   2]
 [  3  91   0   0   1]
 [  1  15 301   3  27]
 [  1   1  36  49   1]
 [  1   1  30   3 256]]
In the 1 fold, the classification accuracy is 0.892646 and the recall is 0.864324
And the confusion matrix is: 
[[369   3   0   0   1]
 [  0  77   0   0   1]
 [  0  16 290   5  24]
 [  0   0  39  62   2]
 [  2   2  28   4 258]]
In the 2 fold, the classification accuracy is 0.905325 and the recall is 0.871520
And the confusion matrix is: 
[[381   3   0   0   0]
 [  5  68   0   0   0]
 [  0   5 299   4  23]
 [  0   0  35  67   0]
 [  2   6  26   3 256]]
In the 3 fold, the classification accuracy is 0.903635 and the recall is 0.862178
And the confusion matrix is: 
[[348   5   0   0   0]
 [  5  84   2   0   3]
 [  0  11 297   5  30]
 [  0   0  24  47   1]
 [  0   7  20   1 293]]
In the 4 fold, the classification accuracy is 0.906171 and the recall is 0.874224
And th

In [74]:
# having C = 20
import time

start_time = time.time()

from sklearn.svm import SVC

kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    svm = SVC(kernel = 'linear', C = 50).fit(features[train], y[train])
    svm_predictions = svm.predict(features[test])
#     recall = recall_score(y[test], svm_predictions, average='macro') # extra line to calculate recall
    accuracy = svm.score(features[test], y[test])
    cm = confusion_matrix(y[test], svm_predictions)

    print('In the %i fold, the classification accuracy is %f and the recall is %f' %(fold_index, accuracy, recall))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1
    

print("--- %s seconds ---" % (time.time() - start_time))

In the 0 fold, the classification accuracy is 0.900254 and the recall is 0.857264
And the confusion matrix is: 
[[362   2   0   0   1]
 [  1  85   3   0   0]
 [  0   8 285   5  23]
 [  0   0  39  44   2]
 [  3   6  19   6 289]]
In the 1 fold, the classification accuracy is 0.885883 and the recall is 0.857264
And the confusion matrix is: 
[[363   6   0   0   0]
 [  4  76   0   0   0]
 [  1  14 303   8  36]
 [  0   0  36  57   0]
 [  2   8  17   3 249]]
In the 2 fold, the classification accuracy is 0.900254 and the recall is 0.857264
And the confusion matrix is: 
[[354   3   0   0   1]
 [  5  81   0   0   1]
 [  0   7 300   4  32]
 [  0   0  26  64   0]
 [  2   4  30   3 266]]
In the 3 fold, the classification accuracy is 0.901099 and the recall is 0.857264
And the confusion matrix is: 
[[370   2   0   0   0]
 [  2  63   0   0   1]
 [  0   9 287   1  19]
 [  0   1  42  39   1]
 [  1   4  29   5 307]]
In the 4 fold, the classification accuracy is 0.895182 and the recall is 0.857264
And th