In [1]:
# description of this dataset http://groupware.les.inf.puc-rio.br/har#ixzz2PyRdbAfA
from sklearn import datasets
from sklearn import preprocessing as pp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy
import csv
import pandas as pd
import time

le = pp.LabelEncoder() 
le.fit(['sitting', 'walking', 'sittingdown', 'standing', 'standingup'])

initial = time.time()
### Retrieving all data
overall = pd.read_csv("./dataset-har-PUC-Rio-ugulino.csv", delimiter=';', header='infer') 
data = overall.loc[:, "x1":"z4"].as_matrix() # has to be converted to ndarray in order to be processed by segment_signal()
targets = overall.loc[:,"class,,"].as_matrix() # double commas: looks like the researchers are naughty

load = time.time()
print "--- time to load and select datasets: %s seconds ---" % (load - initial)


### Data segmentation: shall use a sudden change of sensor readings
### like if (x_pre - x_curr <= 1.0, do nothing)
### Range of Accelerometer sensor readings is +3g/-3g

# reading 14 sets of data in every 2 seconds. 
# For segmenting the data from online only. 
# each set of data is taken 150ms apart from another.
# so choosing a window size of 14 will be 2.1 seconds.


def segment_signal(data, window_size=14): 

    N = data.shape[0]
    dim = data.shape[1]
    K = N/window_size
    segments = numpy.empty((K, window_size, dim))
    for i in range(K):
        segment = data[i*window_size:i*window_size+window_size,:]
        segments[i] = numpy.vstack(segment)
    return segments



##!!!! questions: for normalization, should it be done right after loading csv or after segmenation? 
##!!!! Normalize() can't process nadarray with dimension > 2.
X = pp.normalize(data)
y = targets[::14] 
y = y[:-1]# -1 because it will have a extra set of data than X.

normalizing = time.time()
print "--- time to normalize: %s seconds ---" % (normalizing - load)

segs = segment_signal(X)

segmenting = time.time()
print "--- time to segment: %s seconds ---" % (segmenting - normalizing)

### feautre extraction // take the difference between sensors

### this method is to extract the difference between consecutive sensor readings.
## parameter raw is a 2D ndarray
## return a 2D ndarray
def extract_diff(raw):

    N = raw.shape[0] # number of sets of sensor readings
    dim = raw.shape[1] # number of values in each readings
    features = numpy.empty((N - 1, dim))
    for i in range(1, N):
        for j in range(dim):
            features[i-1][j] = raw[i][j] - raw[i-1][j]

    return features

def extract_diff_2(raw):

    N = raw.shape[0] # number of segments of sensor readings ()
    I = raw.shape[1] # number of sets of readings (14)
    J = raw.shape[2] # number of values in each set of readings (12)
    feature_num = (I - 1) * J
    feature = numpy.empty((feature_num))
    features = numpy.empty((N, feature_num))
    for n in range(N):
        idx = 0;
        for i in range(1, I):
            for j in range(J):
                feature[idx] = raw[n][i][j] - raw[n][i-1][j]
                idx += 1
        features[n] = feature
        

    return features

features = extract_diff_2(segs)

extracting_feature = time.time()
print "--- time to extract features: %s seconds ---" % (extracting_feature - segmenting)

#having 15 neurons
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0
for train, test in kfold.split(features):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(15,), random_state=1).fit(features[train], y[train])
    predictions = clf.predict(features[test])
    accuracy = clf.score(features[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1


evaluate_model = time.time()
print "--- time to extract features: %s seconds ---" % (evaluate_model - extracting_feature)





--- time to load and select datasets: 0.370768070221 seconds ---
--- time to normalize: 0.0338640213013 seconds ---
--- time to segment: 0.433979988098 seconds ---
--- time to extract features: 2.50458693504 seconds ---
In the 0 fold, the classification accuracy is 0.751479
And the confusion matrix is: 
[[301   1  49   2   3]
 [  4  45  15   9  17]
 [ 98   5 233   4   0]
 [  5  24   7  30  23]
 [  0  11   4  13 280]]
In the 1 fold, the classification accuracy is 0.730347
And the confusion matrix is: 
[[286   4  59   5   2]
 [  7  44  16  14   7]
 [121  10 210   2   2]
 [  5  26  10  43  13]
 [  1   5   4   6 281]]
In the 2 fold, the classification accuracy is 0.752325
And the confusion matrix is: 
[[284   1  47   3   2]
 [  4  34  14  22  10]
 [117   7 223   2   1]
 [  1  12   9  47  18]
 [  0   4   3  16 302]]
In the 3 fold, the classification accuracy is 0.732037
And the confusion matrix is: 
[[304   5  64   3   2]
 [  5  30  13  20  13]
 [102   2 195   4   1]
 [ 11  13  12  32  19]
