In [1]:
from __future__ import print_function
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from IPython.display import display, HTML

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

import glob
import os

In [2]:
#Some styling
pd.options.display.float_format = '{:.1f}'.format
sns.set()
plt.style.use('ggplot')

#Label encoder used to get a numeric representation of a label
le = preprocessing.LabelEncoder()

#The activities
LABELS = ['standing',
          'walking-natural',
          'walking-rider',
          'trotting-natural',
          'trotting-rider',
          'running-natural',
          'running-rider',
          'jumping',
          'grazing',
          'eating',
          'head shake',
          'shaking',
          'scratch-biting',
          'rubbing',
          'fighting',
          'rolling',
          'scared']

#Sliding windows
TIME_PERIODS = 80
STEP_DISTANCE = 40

#Datasets
FILES = sorted(glob.glob('Data/*'))

# Set up dataframe

In [3]:
REMOVE_COLUMNS = ['Mx', 'My', 'Mz','A3D','G3D','M3D'] #Add columns to drop from dataframe

def loadDataFrame(files):
    """
    Simple function to set up dataframe and initial clean-up of the data
    files: path to files
    returns: combined dataframe of all files
    """
    df = pd.DataFrame()
    for file in files:
        csv = pd.read_csv(file)
        csv['filename']=file
        df = df.append(csv)
        
    df.drop(REMOVE_COLUMNS, axis=1, inplace=True)
    df['ActivityEncoded'] = le.fit_transform(df['label'].values.ravel())

    return df

def convert_to_float(x):

    try:
        return np.float(x)
    except:
        return np.nan


df = loadDataFrame(FILES)
# df.head(5000)



# Plot data composition

In [4]:

# #Training examples per activity type
# df['label'].value_counts().plot(kind='bar', title='Training Examples of subject Viva by Activity Type')
# plt.show()

# SECONDS = 10 #nr of seconds to display accelerometer data
# SAMPLING_RATE = 20 #the sampling rate at which data was recorded

# def plot_activity(activity, data):

#     fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, figsize=(15, 10), sharex=True)
#     plot_axis(ax0, data['datetime'], data['Ax'], 'X-Axis')
#     plot_axis(ax1, data['datetime'], data['Ay'], 'Y-Axis')
#     plot_axis(ax2, data['datetime'], data['Az'], 'Z-Axis')
#     plt.subplots_adjust(hspace=0.2)
#     fig.suptitle(activity)
#     plt.subplots_adjust(top=0.90)
#     plt.show()

# def plot_axis(ax, x, y, title):

#     ax.plot(x, y, 'r')
#     ax.set_title(title)
#     ax.xaxis.set_visible(False)
#     ax.set_ylim([min(y) - np.std(y), max(y) + np.std(y)])
#     ax.set_xlim([min(x), max(x)])
#     ax.grid(True)

# #plot all 3 subplots for each activity
# for activity in np.unique(df['label']):
#     subset = df[df['label'] == activity][:SECONDS*SAMPLING_RATE] 
#     plot_activity(activity, subset)

In [5]:
# Define column name of the label vector
LABEL = 'ActivityEncoded'
# Transform the labels from String to Integer via LabelEncoder
le = preprocessing.LabelEncoder()
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['label'].values.ravel())

# PRE PROCESSING

- Make all the elements float if they need be
- Shuffle data frame
- Split to test and train set
- Normalize to a range
- TODO segment
- TODO balance

# Get only relevant subjects

In [6]:
#These are the indexes of the relevant subjects, see FILES
indexes = [0,1,2,7,8,9,13,14,15,16,17]
subjects = [FILES[x] for x in indexes]

#new dataframe with only the horses in subjects
df = df[df['filename'].isin(subjects)]  


# Splitting

In [7]:
# Splitting
def splitBySubject(data, name):
  '''
  Function to split train and test data by subject
  data = dataframe
  name = subject to put in test subset
  '''
  test = data[data['filename'].str.contains(name)]
  train = data[~data['filename'].str.contains(name)]
  return train, test


train, test = splitBySubject(df, 'Galoway')

print(len(train['filename'].unique()))
print(len(test['filename']. unique()))


8
3


# Segmenting 2.0

In [8]:
# def splitBySegment(data):
#     segmented = data[data['segment'].unique()]
#     return segmented

df.head(15)

Unnamed: 0,Ax,Ay,Az,Gx,Gy,Gz,datetime,label,segment,filename,ActivityEncoded
0,6.5,-1.2,3.3,-12.8,54.2,-3.4,2018-06-14 20:06:48.0988,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
1,6.6,-0.5,3.2,1.7,58.2,-6.8,2018-06-14 20:06:48.1088,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
2,6.7,0.8,3.5,26.2,60.6,-8.1,2018-06-14 20:06:48.1188,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
3,6.8,2.1,3.8,48.7,58.9,-8.8,2018-06-14 20:06:48.1288,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
4,6.8,3.2,4.2,60.0,55.4,-9.1,2018-06-14 20:06:48.1388,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
5,6.8,3.9,4.7,60.2,50.0,-8.2,2018-06-14 20:06:48.1488,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
6,6.7,3.5,5.3,50.8,44.5,-6.9,2018-06-14 20:06:48.1588,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
7,7.2,2.6,5.1,31.4,42.3,-4.8,2018-06-14 20:06:48.1688,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
8,7.6,1.6,4.3,2.9,43.5,-2.3,2018-06-14 20:06:48.1788,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
9,7.7,0.8,4.0,-18.1,48.6,1.5,2018-06-14 20:06:48.1888,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16


# Feature scaling

In [9]:
train_x_max = train['Ax'].max()
train_y_max = train['Ay'].max()
train_z_max = train['Az'].max()

pd.options.mode.chained_assignment = None 

#divide all 3 axis with the max value in the training set
train['Ax'] = train['Ax'] / train_x_max
train['Ay'] = train['Ay'] / train_y_max
train['Az'] = train['Az'] / train_z_max

# Segmentation

In [10]:
seg_train = [y for x, y in train.groupby('segment', as_index=False)]
seg_test = [y for x, y in test.groupby('segment', as_index=False)]
train.shape

(6033137, 11)

# Windowing

In [96]:
def createWindows(df, time_steps, step, label_name):
    
    N_FEATURES = 6
    windows = []
    labels = []
    for i in range(0, len(df)-time_steps, step):
        axs = df['Ax'].values[i: i + time_steps]
        ays = df['Ay'].values[i: i + time_steps]
        azs = df['Az'].values[i: i + time_steps]
        
        gxs = df['Gx'].values[i: i + time_steps]
        gys = df['Gy'].values[i: i + time_steps]
        gzs = df['Gz'].values[i: i + time_steps]
        # Retrieve the most often used label in this segment
        label = stats.mode(df[label_name][i: i + time_steps])[0][0]
        windows.append([axs, ays, azs, gxs, gys, gzs])
        labels.append(label)

    # Bring the segments into a better shape
    reshaped_windows = np.asarray(windows, dtype= np.float32).reshape(-1, time_steps, N_FEATURES)
    labels = np.asarray(labels)

    return reshaped_windows, labels

x_trains=[]
y_trains=[]
for i in seg_train:
    x_train, y_train = createWindows(i, 200, 100, LABEL)
    x_trains.append(x_train)
    y_trains.append(y_train)
    

#where x_trains[0].shape[0] =! 8 moet eruit
one_d_xtrain = []
for i in x_trains:
    if i.shape[0] == 8:
        a = i.astype('float32')
        one_d_xtrain.append(a)
        
one_d_ytrain = []
for i in y_trains:
    if i.shape[0] == 8:
        a = i.astype('float32')
        one_d_ytrain.append(a)
            
print(one_d_xtrain[2].shape[0])

8


# Suffle Data

In [76]:
len(x_trains[2])

0

In [75]:
len(y_trains[2])

0

In [12]:
#shuffling the whole dataframe
# def shuffleData(data_frame):
#     '''Function to shuffle dataframe'''
#     return data_frame.sample(frac=1).reset_index(drop=True)

# shuf_train = []

# for i in seg_train:
#     shuf_train.append(shuffleData(i))
    
    
# train = shuf_train
# train[0].head(20)

# Store dimensions

In [18]:
input_shape = 1200
x_train_shape = 8
num_classes = le.classes_.size


In [97]:
input_shape = (200*6)

trainx = []
for x in one_d_xtrain:
    trainx.append(x.reshape(x.shape[0], input_shape))

AttributeError: 'list' object has no attribute 'astype'

In [98]:
# Applying one hot coding to y_train
y_train_hots = []
for i in one_d_ytrain:
    y_train_hot = np_utils.to_categorical(i, num_classes)
    y_train_hots.append(y_train_hot)
print('New y_train shape: ', y_train_hots[0].shape)

New y_train shape:  (8, 18)


# Classifier

In [19]:
model_m = Sequential()
# Remark: since coreml cannot accept vector shapes of complex shape like
# [80,3] this workaround is used in order to reshape the vector internally
# prior feeding it into the network
model_m.add(Reshape((200, 6), input_shape=(input_shape,)))
model_m.add(Dense(100, activation='relu'))
model_m.add(Dense(100, activation='relu'))
model_m.add(Dense(100, activation='relu'))
model_m.add(Flatten())
model_m.add(Dense(num_classes, activation='softmax'))
print(model_m.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 200, 6)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 200, 100)          700       
_________________________________________________________________
dense_4 (Dense)              (None, 200, 100)          10100     
_________________________________________________________________
dense_5 (Dense)              (None, 200, 100)          10100     
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 18)                360018    
Total params: 380,918
Trainable params: 380,918
Non-trainable params: 0
________________________________________________

In [104]:
zipped_list = list(zip(trainx, y_train_hots))

In [105]:
zipped_list[0][1][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0.], dtype=float32)

In [108]:
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='accuracy', patience=1)
]

model_m.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])

# Hyper-parameters
EPOCHS = 2

for (x, y) in zipped_list:
    
    for i in range(len(x)):
        
        xs = np.array(x[i])
        ys = np.array(y[i])

        history = model_m.fit(xs,
                              ys,
                              epochs=EPOCHS,
                              callbacks=callbacks_list,
                              verbose=1)



ValueError: Data cardinality is ambiguous:
  x sizes: 1200
  y sizes: 18
Make sure all arrays contain the same number of samples.

In [21]:
# for loop over segmented data but idk how



callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath='best_model.{epoch:02d}-{val_loss:.2f}.h5',
        monitor='val_loss', save_best_only=True),
    keras.callbacks.EarlyStopping(monitor='accuracy', patience=1)
]

model_m.compile(loss='categorical_crossentropy',
                optimizer='adam', metrics=['accuracy'])

# Hyper-parameters
BATCH_SIZE = 400
EPOCHS = 50




# Enable validation to use ModelCheckpoint and EarlyStopping callbacks.
history = model_m.fit(one_d_xtrain,
                      one_d_ytrain,
                      batch_size=BATCH_SIZE,
                      epochs=EPOCHS,
                      callbacks=callbacks_list,
                      validation_split=0.2,
                      verbose=1)

Epoch 1/50


ValueError: in user code:

    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:755 train_step
        loss = self.compiled_loss(
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:203 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:152 __call__
        losses = call_fn(y_true, y_pred)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:256 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1537 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4833 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /home/rosalie/.local/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 18) are incompatible
