In [48]:
# description of this dataset http://groupware.les.inf.puc-rio.br/har#ixzz2PyRdbAfA
from sklearn import datasets
from sklearn import preprocessing as pp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
import numpy
import csv
import pandas as pd

le = pp.LabelEncoder() 
le.fit(['sitting', 'walking', 'sittingdown', 'standing', 'standingup'])

### Retrieving all data
overall = pd.read_csv("./dataset-har-PUC-Rio-ugulino.csv", delimiter=';', header='infer') 
data = overall.loc[:, "x1":"z4"].as_matrix() # has to be converted to ndarray in order to be processed by segment_signal()
targets = overall.loc[:,"class,,"].as_matrix() # double commas: looks like the researchers are naughty

# print type(data)
# print type(targets)
# print data.shape
# print targets.shape


def debug(item, shape=True):
    print item
    print type(item)
    if (shape == True):
        print item.shape()
    print "----------------------"

### Data segmentation: shall use a sudden change of sensor readings
### like if (x_pre - x_curr <= 1.0, do nothing)
### Range of Accelerometer sensor readings is +3g/-3g

# reading 14 sets of data in every 2 seconds. 
# For segmenting the data from online only. 
# each set of data is taken 150ms apart from another.
# so choosing a window size of 14 will be 2.1 seconds.


def segment_signal(data, window_size=14): 

    N = data.shape[0]
    dim = data.shape[1]
    K = N/window_size
    segments = numpy.empty((K, window_size, dim))
    for i in range(K):
        segment = data[i*window_size:i*window_size+window_size,:]
        segments[i] = numpy.vstack(segment)
    return segments

##!!!! questions: for normalization, should it be done right after loading csv or after segmenation? 
##!!!! Normalize() can't process nadarray with dimension > 2.
X = pp.normalize(data)
y = targets[::14]

segs = segment_signal(X)

### feautre extraction // take the difference between sensors

### this method is to extract the difference between consecutive sensor readings.
## parameter raw is a 2D ndarray
## return a 2D ndarray
def extract_diff(raw):

    N = raw.shape[0] # number of sets of sensor readings
    dim = raw.shape[1] # number of values in each readings
    features = numpy.empty((N - 1, dim))
    for i in range(1, N):
        for j in range(dim):
            features[i-1][j] = raw[i][j] - raw[i-1][j]

    return features

features = extract_diff(segs[0])
features

array([[ -2.87208136e-05,   7.65424768e-03,  -3.98987512e-03,
          6.55328316e-03,   1.72324882e-04,   3.20483955e-03,
         -6.43001546e-02,   9.95654873e-04,   5.89270444e-03,
          1.95069735e-03,  -4.37281930e-03,   5.36615619e-03],
       [  6.77646324e-03,   9.87045812e-03,   1.03424318e-02,
          3.05164884e-02,   6.71373256e-03,   1.02050217e-02,
          3.42557131e-03,  -3.10666194e-04,   2.68845745e-04,
         -6.32241265e-03,   3.10666194e-04,   3.81689122e-03],
       [ -3.41727876e-03,  -1.77441144e-03,   1.25794581e-02,
         -1.04027119e-02,   3.73579385e-03,  -3.65197409e-03,
         -2.17931380e-04,   1.74345104e-03,   1.89175910e-03,
         -9.33238641e-03,   1.65706377e-03,   4.38702049e-03],
       [  3.41677499e-03,  -1.56097715e-03,  -1.26101883e-02,
          7.01241194e-03,  -3.72571838e-03,   3.64441749e-03,
          2.11382322e-04,  -1.69105858e-03,   1.44715590e-03,
          2.48780733e-03,  -1.70945623e-03,  -1.07530927e-03],
    

In [8]:
le = pp.LabelEncoder() 
le.fit(targets)
list(le.classes_)

['sitting', 'sittingdown', 'standing', 'standingup', 'walking']

In [49]:
y.shape

(11831,)

In [19]:
features.shape

(13, 12)

In [45]:
y.shape

(165632,)

In [41]:
Y = y[::14]
Y.shape

(11831,)

In [51]:
##need to assign a value for each segmented set of data
##This took like 3 mins to finish executing.
kfold = KFold(n_splits=10, shuffle=True)

fold_index = 0

for train, test in kfold.split(segs):
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(15,), random_state=1).fit(segs[train], y[train])
    predictions = clf.predict(segs[test])
    accuracy = clf.score(segs[test], y[test])
    cm = confusion_matrix(y[test], predictions)

    print('In the %i fold, the classification accuracy is %f' %(fold_index, accuracy))
    print('And the confusion matrix is: ')
    print(cm)
    fold_index += 1

ValueError: Found array with dim 3. Estimator expected <= 2.

In [8]:
# feature = numpy.empty((5, 5))
# debug(feature[0][0], False)
# debug(segs[0][0], False)
debug(segs[0], False)

[[  -3.   92.  -63.  -23.   18.  -19.    5.  104.  -92. -150. -103. -147.]
 [  -3.   94.  -64.  -21.   18.  -18.  -14.  104.  -90. -149. -104. -145.]
 [  -1.   97.  -61.  -12.   20.  -15.  -13.  104.  -90. -151. -104. -144.]
 [  -2.   96.  -57.  -15.   21.  -16.  -13.  104.  -89. -153. -103. -142.]
 [  -1.   96.  -61.  -13.   20.  -15.  -13.  104.  -89. -153. -104. -143.]
 [  -2.   95.  -62.  -14.   19.  -16.  -13.  104.  -89. -153. -104. -142.]
 [   1.  100.  -62.  -10.   22.  -12.  -13.  104.  -90. -151. -104. -143.]
 [  -1.   97.  -63.  -13.   20.  -15.  -12.  104.  -88. -151. -104. -142.]
 [  -1.   98.  -63.  -14.   19.  -17.  -13.  104.  -90. -152. -103. -144.]
 [   0.   98.  -61.  -11.   22.  -13.  -13.  104.  -90. -151. -104. -144.]
 [  -1.   95.  -60.  -13.   19.  -17.  -13.  105.  -89. -150. -104. -144.]
 [   3.  100.  -59.  -10.   22.  -13.  -13.  104.  -90. -150. -103. -144.]
 [   0.   96.  -61.  -14.   20.  -15.  -12.  105.  -88. -149. -104. -145.]
 [   0.   97.  -61.  -14.

In [None]:
segs[0].shape

In [32]:
X

array([[-0.01013149,  0.31069913, -0.21276136, -0.07767478,  0.06078896,
        -0.06416612,  0.01688582,  0.3512251 , -0.31069913, -0.50657466,
        -0.34784794, -0.49644317],
       [-0.01016021,  0.31835337, -0.21675123, -0.0711215 ,  0.06096128,
        -0.06096128, -0.04741433,  0.35222076, -0.30480642, -0.50462397,
        -0.35222076, -0.49107701],
       [-0.00338375,  0.32822383, -0.2064088 , -0.04060501,  0.06767502,
        -0.05075626, -0.04398876,  0.35191009, -0.30453758, -0.51094638,
        -0.35191009, -0.48726012],
       [-0.00680103,  0.32644942, -0.19382934, -0.05100772,  0.07141081,
        -0.05440824, -0.04420669,  0.35365354, -0.30264582, -0.52027877,
        -0.35025303, -0.4828731 ],
       [-0.00338425,  0.32488844, -0.20643953, -0.04399531,  0.06768509,
        -0.05076382, -0.04399531,  0.35196248, -0.30119866, -0.51779096,
        -0.35196248, -0.48394841],
       [-0.00678138,  0.32211534, -0.21022264, -0.04746963,  0.06442307,
        -0.054251  , -

In [29]:
segs.shape

(11830, 14, 12)

In [37]:
normalized = pp.normalize(data)
normalized

array([[-0.01013149,  0.31069913, -0.21276136, ..., -0.50657466,
        -0.34784794, -0.49644317],
       [-0.01016021,  0.31835337, -0.21675123, ..., -0.50462397,
        -0.35222076, -0.49107701],
       [-0.00338375,  0.32822383, -0.2064088 , ..., -0.51094638,
        -0.35191009, -0.48726012],
       ..., 
       [-0.02721635,  0.25311207, -0.38919382, ..., -0.5035025 ,
        -0.21773081, -0.41641017],
       [-0.03942975,  0.22606388, -0.39955476, ..., -0.4863002 ,
        -0.22080658, -0.41006936],
       [-0.03609219,  0.22170914, -0.37123391, ..., -0.54138278,
        -0.22686516, -0.38154596]])