Download the dataset

In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip"

--2021-01-10 06:54:23--  https://archive.ics.uci.edu/ml/machine-learning-databases/00507/wisdm-dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 309713877 (295M) [application/x-httpd-php]
Saving to: ‘wisdm-dataset.zip’


2021-01-10 06:54:30 (41.8 MB/s) - ‘wisdm-dataset.zip’ saved [309713877/309713877]



Extract the downloaded dataset

In [2]:
!unzip wisdm-dataset.zip

Archive:  wisdm-dataset.zip
   creating: wisdm-dataset/
  inflating: wisdm-dataset/WISDM-dataset-description.pdf  
   creating: wisdm-dataset/arffmagic-master/
  inflating: wisdm-dataset/arffmagic-master/Makefile  
  inflating: wisdm-dataset/arffmagic-master/.DS_Store  
 extracting: wisdm-dataset/arffmagic-master/README.md  
   creating: wisdm-dataset/arffmagic-master/src/
  inflating: wisdm-dataset/arffmagic-master/src/arff.cpp  
  inflating: wisdm-dataset/arffmagic-master/src/comparator.h  
  inflating: wisdm-dataset/arffmagic-master/src/chunk.h  
  inflating: wisdm-dataset/arffmagic-master/src/main.cpp  
  inflating: wisdm-dataset/arffmagic-master/src/attribute.h  
  inflating: wisdm-dataset/arffmagic-master/src/libmfcc.c  
  inflating: wisdm-dataset/arffmagic-master/src/raw.h  
  inflating: wisdm-dataset/arffmagic-master/src/try.h  
  inflating: wisdm-dataset/arffmagic-master/src/write.h  
  inflating: wisdm-dataset/arffmagic-master/src/chunk.cpp  
  inflating: wisdm-dataset/arffma

Import the relevant python libraries

In [3]:
from __future__ import print_function
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from IPython.display import display, HTML

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

import glob

In [4]:
pd.options.display.float_format = '{:.3f}'.format
sns.set()
plt.style.use('ggplot')

Define constants

In [5]:
LABELS = ['A',
          'B',
          'C',
          'D',
          'E']
TIME_PERIODS = 200
STEP_DISTANCE = 100 

Define functions to read data

In [6]:
def read_data(file_path):
  column_names = ['user-id',
                    'activity',
                    'timestamp',
                    'x-axis',
                    'y-axis',
                    'z-axis']

  all_files = glob.glob(file_path + "/*.txt")

  li = []

  for filename in all_files:
    df = pd.read_csv(filename, header=None, names=column_names)
    li.append(df)

  df = pd.concat(li, axis=0, ignore_index=True)
  
  df['z-axis'].replace(regex=True,
      inplace=True,
      to_replace=r';',
      value=r'')
  df['z-axis'] = df['z-axis'].apply(convert_to_float)
  df.dropna(axis=0, how='any', inplace=True)

  return df


def convert_to_float(x):
    try:
        return np.float(x)
    except:
        return np.nan


def show_basic_dataframe_info(dataframe):

    # Shape and how many rows and columns
    print('Number of columns in the dataframe: %i' % (dataframe.shape[1]))
    print('Number of rows in the dataframe: %i\n' % (dataframe.shape[0]))

Read accelorometer data from the dataset

In [7]:
df = read_data("wisdm-dataset/raw/watch/accel/")

Display dataframe info

In [8]:
show_basic_dataframe_info(df)
df.head(200)

Number of columns in the dataframe: 6
Number of rows in the dataframe: 3777046



Unnamed: 0,user-id,activity,timestamp,x-axis,y-axis,z-axis
0,1637,A,14491150618000,-2.728,-1.688,3.205
1,1637,A,14491170682000,-2.749,-3.257,1.262
2,1637,A,14491190778000,-2.457,-6.222,-1.602
3,1637,A,14491210906000,-1.997,-12.516,-16.966
4,1637,A,14491231896000,1.748,-21.265,-21.998
...,...,...,...,...,...,...
195,1637,A,14495080228000,1.894,-10.804,-0.843
196,1637,A,14495100324000,1.758,-10.706,-0.802
197,1637,A,14495120553000,1.095,-9.683,-0.462
198,1637,A,14495140649000,-1.265,-8.890,-1.554


Remove unnessary activities from the dataset

In [9]:
df = df[df['activity'] <= 'E']

Encode lables and add them into a new column

In [10]:
le = preprocessing.LabelEncoder()
le.fit(LABELS)
df['ActivityEncoded'] = le.transform(df['activity'].values.ravel())

In [11]:
df

Unnamed: 0,user-id,activity,timestamp,x-axis,y-axis,z-axis,ActivityEncoded
0,1637,A,14491150618000,-2.728,-1.688,3.205,0
1,1637,A,14491170682000,-2.749,-3.257,1.262,0
2,1637,A,14491190778000,-2.457,-6.222,-1.602,0
3,1637,A,14491210906000,-1.997,-12.516,-16.966,0
4,1637,A,14491231896000,1.748,-21.265,-21.998,0
...,...,...,...,...,...,...,...
3726868,1608,E,718380750614000,9.382,-2.658,-0.102,4
3726869,1608,E,718380800539120,9.399,-2.661,-0.099,4
3726870,1608,E,718380850464240,9.360,-2.622,-0.078,4
3726871,1608,E,718380900389360,9.336,-2.689,-0.037,4


Normalise the dataframe

In [22]:
max = df[['x-axis', 'y-axis','z-axis']].max().max()
min = df[['x-axis', 'y-axis','z-axis']].min().min()

ndf = df.copy()
ndf['x-axis'] = (df['x-axis'] - min)/(max-min)
ndf['y-axis'] = (df['y-axis'] - min)/(max-min)
ndf['z-axis'] = (df['z-axis'] - min)/(max-min)
ndf = ndf.round({'x-axis': 4, 'y-axis': 4, 'z-axis': 4})

max, min

(66.615074, -78.47761)

Split the dataset to into test and train

In [13]:
df_train = ndf[ndf['user-id'] <= 1640]
df_test = ndf[ndf['user-id'] > 1640]

Create segmants and lables

In [14]:
def create_segments_and_labels(dff, time_steps, step):

    # x, y, z acceleration as features
    N_FEATURES = 3
    # Number of steps to advance in each iteration (for me, it should always
    # be equal to the time_steps in order to have no overlap between segments)
    # step = time_steps
    segments = []
    labels = []
    accel_data = dff[['x-axis', 'y-axis', 'z-axis']]
    for i in range(0, len(dff) - time_steps, step):
        values = accel_data.iloc[i:(i + time_steps)].values
        # Retrieve the most often used label in this segment
        label = stats.mode(dff['ActivityEncoded'][i: i + time_steps])[0][0]
        segments.append(values)
        labels.append(label)

    # Bring the segments into a better shape
    reshaped_segments = np.asarray(segments, dtype= np.float32).reshape(-1, time_steps, N_FEATURES)
    labels = np.asarray(labels)

    return reshaped_segments, labels

In [15]:
x_train, y_train = create_segments_and_labels(df_train,
                                              TIME_PERIODS,
                                              STEP_DISTANCE)

x_test, y_test = create_segments_and_labels(df_test,
                                              TIME_PERIODS,
                                              STEP_DISTANCE)

Hot encode the lables

In [16]:
num_classes = le.classes_.size
y_train_hot = np_utils.to_categorical(y_train, num_classes)
y_test_hot = np_utils.to_categorical(y_test, num_classes)

Flattern the data

In [17]:
num_time_periods, num_sensors = x_train.shape[1], x_train.shape[2]
input_shape = (num_time_periods*num_sensors)
x_train_data = x_train.reshape(x_train.shape[0], input_shape)
x_test_data = x_test.reshape(x_test.shape[0], input_shape)
print('x_train shape:', x_train_data.shape)

x_train shape: (8719, 600)


Define the model

In [18]:
model_m = Sequential()
model_m.add(Reshape((TIME_PERIODS, num_sensors), input_shape=(input_shape,)))
model_m.add(Dense(100, activation='relu'))
model_m.add(Dense(100, activation='relu'))
model_m.add(Dense(100, activation='relu'))
model_m.add(Flatten())
model_m.add(Dense(num_classes, activation='softmax'))
print(model_m.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 200, 3)            0         
_________________________________________________________________
dense (Dense)                (None, 200, 100)          400       
_________________________________________________________________
dense_1 (Dense)              (None, 200, 100)          10100     
_________________________________________________________________
dense_2 (Dense)              (None, 200, 100)          10100     
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 100005    
Total params: 120,605
Trainable params: 120,605
Non-trainable params: 0
__________________________________________________

In [19]:
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
]

model_m.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])

# Hyper-parameters
BATCH_SIZE = 400
EPOCHS = 50

# Enable validation to use ModelCheckpoint and EarlyStopping callbacks.
history = model_m.fit(x_train_data,
                      y_train_hot,
                      batch_size=BATCH_SIZE,
                      epochs=EPOCHS,
                      callbacks=callbacks_list,
                      validation_data=(x_test_data, y_test_hot),
                      verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
score = model_m.evaluate(x_test_data, y_test_hot, verbose=1)

print('\nAccuracy on test data: %0.2f' % score[1])
print('\nLoss on test data: %0.2f' % score[0])


Accuracy on test data: 0.79

Loss on test data: 0.52


In [21]:
model_m.save('watch')
!zip -r watch.zip watch

INFO:tensorflow:Assets written to: watch/assets
  adding: watch/ (stored 0%)
  adding: watch/saved_model.pb (deflated 90%)
  adding: watch/assets/ (stored 0%)
  adding: watch/variables/ (stored 0%)
  adding: watch/variables/variables.index (deflated 65%)
  adding: watch/variables/variables.data-00000-of-00001 (deflated 17%)
