In [30]:
# description of this dataset http://groupware.les.inf.puc-rio.br/har#ixzz2PyRdbAfA
from sklearn import datasets
from sklearn import preprocessing as pp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy
import csv
import pandas as pd
import time

le = pp.LabelEncoder() 
le.fit(['sitting', 'walking', 'sittingdown', 'standing', 'standingup'])

### Retrieving all data
overall = pd.read_csv("./dataset-har-PUC-Rio-ugulino.csv", delimiter=';', header='infer') 
data = overall.loc[:, "x1":"z4"].as_matrix() # has to be converted to ndarray in order to be processed by segment_signal()
targets = overall.loc[:,"class,,"].as_matrix() # double commas: looks like the researchers are naughty

# print type(data)
# print type(targets)
# print data.shape
# print targets.shape


def debug(item, shape=True):
    print item
    print type(item)
    if (shape == True):
        print item.shape()
    print "----------------------"

### Data segmentation: shall use a sudden change of sensor readings
### like if (x_pre - x_curr <= 1.0, do nothing)
### Range of Accelerometer sensor readings is +3g/-3g

# reading 14 sets of data in every 2 seconds. 
# For segmenting the data from online only. 
# each set of data is taken 150ms apart from another.
# so choosing a window size of 14 will be 2.1 seconds.


def segment_signal(data, window_size=14): 

    N = data.shape[0]
    dim = data.shape[1]
    K = N/window_size
    segments = numpy.empty((K, window_size, dim))
    for i in range(K):
        segment = data[i*window_size:i*window_size+window_size,:]
        segments[i] = numpy.vstack(segment)
    return segments

##!!!! questions: for normalization, should it be done right after loading csv or after segmenation? 
##!!!! Normalize() can't process nadarray with dimension > 2.
X = pp.normalize(data)
y = targets[::14] 
y = y[:-1]# -1 because it will have a extra set of data than X.

segs = segment_signal(X)

### feautre extraction // take the difference between sensors

### this method is to extract the difference between consecutive sensor readings.
## parameter raw is a 2D ndarray
## return a 2D ndarray
def extract_diff(raw):

    N = raw.shape[0] # number of sets of sensor readings
    dim = raw.shape[1] # number of values in each readings
    features = numpy.empty((N - 1, dim))
    for i in range(1, N):
        for j in range(dim):
            features[i-1][j] = raw[i][j] - raw[i-1][j]

    return features

def extract_diff_2(raw):

    N = raw.shape[0] # number of segments of sensor readings ()
    I = raw.shape[1] # number of sets of readings (14)
    J = raw.shape[2] # number of values in each set of readings (12)
    feature_num = (I - 1) * J
    feature = numpy.empty((feature_num))
    features = numpy.empty((N, feature_num))
    for n in range(N):
        idx = 0;
        for i in range(1, I):
            for j in range(J):
                feature[idx] = raw[n][i][j] - raw[n][i-1][j]
                idx += 1
        numpy.append(features[n], feature)
    return features

features = extract_diff_2(segs)

features

array([[-0.01013149,  0.31069913, -0.21276136, ..., -0.5059926 ,
        -0.35317604, -0.4924089 ],
       [ 0.        ,  0.32804547, -0.20629664, ..., -0.52461956,
        -0.34184887, -0.47723456],
       [ 0.01012601,  0.33753363, -0.20589552, ..., -0.57849997,
        -0.3221193 , -0.45030964],
       ..., 
       [-0.08017998,  0.11319527, -0.3254364 , ..., -0.25154313,
        -0.08913907, -0.20514197],
       [-0.08529687,  0.28269821, -0.36312097, ..., -0.22611662,
        -0.07500454, -0.19854142],
       [ 0.00239637,  0.23005184, -0.29235754, ..., -0.49743294,
        -0.17449218, -0.47659805]])

In [26]:
features.shape

(11830, 156)

In [2]:
le = pp.LabelEncoder()
le.fit(targets)
list(le.classes_)

['sitting', 'sittingdown', 'standing', 'standingup', 'walking']

In [3]:
Targets = le.transform(targets)
Targets

array([0, 0, 0, ..., 4, 4, 4])

In [7]:
y = Targets[::14] 
y = y[:-1]
y

array([0, 0, 0, ..., 4, 4, 4])

In [27]:
y.shape

(11830,)

In [32]:
a = numpy.array([1, 3, 4])
a

array([1, 3, 4])

In [20]:
segs


array([[[-0.01013149,  0.31069913, -0.21276136, ..., -0.50657466,
         -0.34784794, -0.49644317],
        [-0.01016021,  0.31835337, -0.21675123, ..., -0.50462397,
         -0.35222076, -0.49107701],
        [-0.00338375,  0.32822383, -0.2064088 , ..., -0.51094638,
         -0.35191009, -0.48726012],
        ..., 
        [ 0.01016086,  0.33869517, -0.19983015, ..., -0.50804276,
         -0.34885603, -0.48772105],
        [ 0.        ,  0.32600865, -0.20715133, ..., -0.5059926 ,
         -0.35317604, -0.4924089 ],
        [ 0.        ,  0.32804547, -0.20629664, ..., -0.5039049 ,
         -0.35171885, -0.49375916]],

       [[ 0.        ,  0.32896102, -0.20348104, ..., -0.50531125,
         -0.35270047, -0.49174585],
        [-0.00338788,  0.32523692, -0.20666096, ..., -0.5047948 ,
         -0.35911576, -0.49124326],
        [ 0.00676696,  0.33496449, -0.20300878, ..., -0.50752195,
         -0.34849841, -0.49060455],
        ..., 
        [-0.01015393,  0.32492566, -0.21323247, ...,

In [29]:
import sys
print (sys.version)

2.7.13 |Anaconda custom (x86_64)| (default, Dec 20 2016, 23:05:08) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


In [28]:
#having 15 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(15,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

In the 0 fold, the classification accuracy is 0.935757
And the confusion matrix is: 
[[374   4   0   0   0]
 [  0  70   1   0   0]
 [  0   4 298  16   8]
 [  0   0  24  61   1]
 [  1   2  11   4 304]]
In the 1 fold, the classification accuracy is 0.912933
And the confusion matrix is: 
[[334   2   0   0   0]
 [  3  89   3   1   4]
 [  0   3 307  24  16]
 [  0   0  26  75   2]
 [  0   0  16   3 275]]
In the 2 fold, the classification accuracy is 0.909552
And the confusion matrix is: 
[[365   6   0   0   0]
 [  2  77   1   0   2]
 [  0   8 297   9  12]
 [  0   0  39  65   1]
 [  0   2  18   7 272]]
In the 3 fold, the classification accuracy is 0.947591
And the confusion matrix is: 
[[351   0   0   0   0]
 [  4  69   0   0   3]
 [  0   0 305  10  10]
 [  0   0  12  73   2]
 [  0   0  14   7 323]]
In the 4 fold, the classification accuracy is 0.929839
And the confusion matrix is: 
[[372   5   0   1   0]
 [  6  72   1   0   2]
 [  0   2 313  19  12]
 [  1   0  17  63   1]
 [  0   0  14   2 2

In [9]:
features

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
segs

array([[[-0.01013149,  0.31069913, -0.21276136, ..., -0.50657466,
         -0.34784794, -0.49644317],
        [-0.01016021,  0.31835337, -0.21675123, ..., -0.50462397,
         -0.35222076, -0.49107701],
        [-0.00338375,  0.32822383, -0.2064088 , ..., -0.51094638,
         -0.35191009, -0.48726012],
        ..., 
        [ 0.01016086,  0.33869517, -0.19983015, ..., -0.50804276,
         -0.34885603, -0.48772105],
        [ 0.        ,  0.32600865, -0.20715133, ..., -0.5059926 ,
         -0.35317604, -0.4924089 ],
        [ 0.        ,  0.32804547, -0.20629664, ..., -0.5039049 ,
         -0.35171885, -0.49375916]],

       [[ 0.        ,  0.32896102, -0.20348104, ..., -0.50531125,
         -0.35270047, -0.49174585],
        [-0.00338788,  0.32523692, -0.20666096, ..., -0.5047948 ,
         -0.35911576, -0.49124326],
        [ 0.00676696,  0.33496449, -0.20300878, ..., -0.50752195,
         -0.34849841, -0.49060455],
        ..., 
        [-0.01015393,  0.32492566, -0.21323247, ...,