In [14]:
from os import listdir
from pandas import read_csv
import pandas as pd
import numpy as np
import glob
from pandas import DataFrame

In [15]:
# load a single file as a numpy array
def load_file(filepath):
    """Read a header-less, whitespace-delimited text file into a numpy array.

    Parameters
    ----------
    filepath : str
        Path of the file to read; handed straight to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the parsed frame: one row per line of the file,
        one column per whitespace-separated field.
    """
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated since pandas 2.2 and raises a FutureWarning.
    dataframe = read_csv(filepath, header=None, sep=r'\s+')
    return dataframe.values

# load a group of files (e.g. the x, y, z data of one variable) as a single 3d numpy array
def load_group(filenames, prefix=''):
    """Read every file of a group and stack the results depth-wise.

    Each entry of ``filenames`` is appended to ``prefix`` and read with
    ``load_file``; the per-file arrays are then combined with ``np.dstack``
    so that each file becomes one slice along the third axis.
    """
    per_file = [load_file(prefix + name) for name in filenames]
    # depth-wise stack: the individual files end up on the 3rd dimension
    return np.dstack(per_file)

# load one dataset split, such as train or test
def load_dataset(type, prefix=''):
    """Load the inertial signals and the labels of one dataset split.

    Parameters
    ----------
    type : str
        Name of the split; used both as the sub-directory name and as the
        suffix of every file name (the notebook calls this with 'train'
        and 'test').
    prefix : str, optional
        Path prepended to the split directory.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the result of ``load_group`` over the nine
        signal files and ``y`` is the result of ``load_file`` on the label file.
    """
    signals_dir = prefix + type + '/Inertial Signals/'
    suffix = '_' + type + '.txt'

    # total acceleration, body acceleration, body gyroscope -- each with
    # its x, y and z axis, in that order
    signal_names = ('total_acc', 'body_acc', 'body_gyro')
    filenames = [
        signal + '_' + axis + suffix
        for signal in signal_names
        for axis in 'xyz'
    ]

    # input data: all signal files stacked together
    X = load_group(filenames, signals_dir)
    # output data: the label file sits directly in the split directory
    y = load_file(prefix + type + '/y_' + type + '.txt')
    return X, y

In [16]:
# summarize the balance of classes in an output variable column
def class_breakdown(data):
    """Print the row count and percentage of every class label.

    Parameters
    ----------
    data : array-like
        Class labels laid out so that ``DataFrame(data)`` holds them in
        column 0 (for example an ``(n, 1)`` array).

    Prints one line per distinct label, in ascending label order.
    """
    # convert the input into a dataframe
    df = DataFrame(data)
    # rows per class value; the resulting index holds the class labels
    counts = df.groupby(0).size()
    total = len(df)
    # Report the real label taken from the group index instead of the
    # position + 1, so the printed class stays correct when a label is
    # absent or the labels do not start at 1.
    for label, count in counts.items():
        percent = count / total * 100
        print('Class=%s, total=%d, percentage=%.3f' % (label, count, percent))

In [17]:
# relative path of the extracted dataset directory
dataset_root = 'UCI HAR Dataset/'

# training split: load it, then report its class balance
trainX, trainy = load_dataset('train', dataset_root)
class_breakdown(trainy)

# test split: same two steps
testX, testy = load_dataset('test', dataset_root)
class_breakdown(testy)

Class=1, total=1226, percentage=16.676
Class=2, total=1073, percentage=14.595
Class=3, total=986, percentage=13.411
Class=4, total=1286, percentage=17.492
Class=5, total=1374, percentage=18.689
Class=6, total=1407, percentage=19.138
Class=1, total=496, percentage=16.831
Class=2, total=471, percentage=15.982
Class=3, total=420, percentage=14.252
Class=4, total=491, percentage=16.661
Class=5, total=532, percentage=18.052
Class=6, total=537, percentage=18.222


In [18]:
# Inspect the stacked training signals (NumPy abbreviates large arrays with "...")
trainX

array([[[ 1.012817e+00, -1.232167e-01,  1.029341e-01, ...,
          3.019122e-02,  6.601362e-02,  2.285864e-02],
        [ 1.022833e+00, -1.268756e-01,  1.056872e-01, ...,
          4.371071e-02,  4.269897e-02,  1.031572e-02],
        [ 1.022028e+00, -1.240037e-01,  1.021025e-01, ...,
          3.568780e-02,  7.485018e-02,  1.324969e-02],
        ...,
        [ 1.018445e+00, -1.240696e-01,  1.003852e-01, ...,
          3.985177e-02,  1.909445e-03, -2.170124e-03],
        [ 1.019372e+00, -1.227451e-01,  9.987355e-02, ...,
          3.744932e-02, -7.982483e-05, -5.642633e-03],
        [ 1.021171e+00, -1.213260e-01,  9.498741e-02, ...,
          2.881781e-02, -3.771800e-05, -1.446006e-03]],

       [[ 1.018851e+00, -1.239760e-01,  9.792958e-02, ...,
          1.711106e-02,  6.122797e-03,  1.226815e-02],
        [ 1.022380e+00, -1.268078e-01,  9.935086e-02, ...,
          2.417851e-02,  9.710357e-03,  1.614958e-02],
        [ 1.020781e+00, -1.277862e-01,  9.811381e-02, ...,
          3.02

About the dataset: 

"The experiments have been carried out with a group of 30 volunteers within an age bracket of 19-48 years. Each person performed six activities (WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING) wearing a smartphone (Samsung Galaxy S II) on the waist. Using its embedded accelerometer and gyroscope, we captured 3-axial linear acceleration and 3-axial angular velocity at a constant rate of 50Hz. The experiments have been video-recorded to label the data manually. The obtained dataset has been randomly partitioned into two sets, where 70% of the volunteers was selected for generating the training data and 30% the test data. 

The sensor signals (accelerometer and gyroscope) were pre-processed by applying noise filters and then sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window). The sensor acceleration signal, which has gravitational and body motion components, was separated using a Butterworth low-pass filter into body acceleration and gravity. The gravitational force is assumed to have only low frequency components, therefore a filter with 0.3 Hz cutoff frequency was used. From each window, a vector of features was obtained by calculating variables from the time and frequency domain."

"The features selected for this database come from the accelerometer and gyroscope 3-axial raw signals tAcc-XYZ and tGyro-XYZ. These time domain signals (prefix 't' to denote time) were captured at a constant rate of 50 Hz. Then they were filtered using a median filter and a 3rd order low pass Butterworth filter with a corner frequency of 20 Hz to remove noise. Similarly, the acceleration signal was then separated into body and gravity acceleration signals (tBodyAcc-XYZ and tGravityAcc-XYZ) using another low pass Butterworth filter with a corner frequency of 0.3 Hz. 

Subsequently, the body linear acceleration and angular velocity were derived in time to obtain Jerk signals (tBodyAccJerk-XYZ and tBodyGyroJerk-XYZ). Also the magnitude of these three-dimensional signals were calculated using the Euclidean norm (tBodyAccMag, tGravityAccMag, tBodyAccJerkMag, tBodyGyroMag, tBodyGyroJerkMag). 

Finally a Fast Fourier Transform (FFT) was applied to some of these signals producing fBodyAcc-XYZ, fBodyAccJerk-XYZ, fBodyGyro-XYZ, fBodyAccJerkMag, fBodyGyroMag, fBodyGyroJerkMag. (Note the 'f' to indicate frequency domain signals). 

These signals were used to estimate variables of the feature vector for each pattern:  
'-XYZ' is used to denote 3-axial signals in the X, Y and Z directions."

Read:
- "Datasheets for Datasets"
- "Model Cards for Model Reporting"

Age exclusion: all volunteers were aged 19-48, so a model trained on this data may not work well for people outside that age range.

