In [1]:
from __future__ import print_function
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from IPython.display import display, HTML

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

import glob
import os

In [2]:
#Some styling
pd.options.display.float_format = '{:.1f}'.format
sns.set()
plt.style.use('ggplot')

#Label encoder used to get a numeric representation of a label
le = preprocessing.LabelEncoder()

#The activities
LABELS = ['standing',
          'walking-natural',
          'walking-rider',
          'trotting-natural',
          'trotting-rider',
          'running-natural',
          'running-rider',
          'jumping',
          'grazing',
          'eating',
          'head shake',
          'shaking',
          'scratch-biting',
          'rubbing',
          'fighting',
          'rolling',
          'scared']

#Sliding windows
TIME_PERIODS = 80
STEP_DISTANCE = 40

#Datasets
FILES = sorted(glob.glob('Data/*'))

# Set up dataframe

In [3]:
REMOVE_COLUMNS = ['Mx', 'My', 'Mz','A3D','G3D','M3D'] #Add columns to drop from dataframe

def loadDataFrame(files):
    """
    Simple function to set up dataframe and initial clean-up of the data
    files: path to files
    returns: combined dataframe of all files
    """
    df = pd.DataFrame()
    for file in files:
        csv = pd.read_csv(file)
        csv['filename']=file
        df = df.append(csv)
        
    df.drop(REMOVE_COLUMNS, axis=1, inplace=True)
    df['ActivityEncoded'] = le.fit_transform(df['label'].values.ravel())

    return df

def convert_to_float(x):

    try:
        return np.float(x)
    except:
        return np.nan


df = loadDataFrame(FILES)
df.head(5000)



Unnamed: 0,Ax,Ay,Az,Gx,Gy,Gz,datetime,label,segment,filename,ActivityEncoded
0,6.5,-1.2,3.3,-12.8,54.2,-3.4,2018-06-14 20:06:48.0988,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
1,6.6,-0.5,3.2,1.7,58.2,-6.8,2018-06-14 20:06:48.1088,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
2,6.7,0.8,3.5,26.2,60.6,-8.1,2018-06-14 20:06:48.1188,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
3,6.8,2.1,3.8,48.7,58.9,-8.8,2018-06-14 20:06:48.1288,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
4,6.8,3.2,4.2,60.0,55.4,-9.1,2018-06-14 20:06:48.1388,walking-natural,13935,Data/subject_11_Patron_part_1.csv,16
...,...,...,...,...,...,...,...,...,...,...,...
4995,8.4,-3.1,4.2,0.1,1.5,-0.2,2018-06-14 20:07:47.1672,standing,13942,Data/subject_11_Patron_part_1.csv,13
4996,8.4,-3.1,4.1,0.0,1.2,-0.3,2018-06-14 20:07:47.1772,standing,13942,Data/subject_11_Patron_part_1.csv,13
4997,8.3,-3.0,4.0,0.1,1.0,-0.5,2018-06-14 20:07:47.1872,standing,13942,Data/subject_11_Patron_part_1.csv,13
4998,8.3,-3.0,4.1,0.1,0.4,-0.5,2018-06-14 20:07:47.1972,standing,13942,Data/subject_11_Patron_part_1.csv,13


# Plot data composition

In [4]:

# #Training examples per activity type
# df['label'].value_counts().plot(kind='bar', title='Training Examples of subject Viva by Activity Type')
# plt.show()

# SECONDS = 10 #nr of seconds to display accelerometer data
# SAMPLING_RATE = 20 #the sampling rate at which data was recorded

# def plot_activity(activity, data):

#     fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, figsize=(15, 10), sharex=True)
#     plot_axis(ax0, data['datetime'], data['Ax'], 'X-Axis')
#     plot_axis(ax1, data['datetime'], data['Ay'], 'Y-Axis')
#     plot_axis(ax2, data['datetime'], data['Az'], 'Z-Axis')
#     plt.subplots_adjust(hspace=0.2)
#     fig.suptitle(activity)
#     plt.subplots_adjust(top=0.90)
#     plt.show()

# def plot_axis(ax, x, y, title):

#     ax.plot(x, y, 'r')
#     ax.set_title(title)
#     ax.xaxis.set_visible(False)
#     ax.set_ylim([min(y) - np.std(y), max(y) + np.std(y)])
#     ax.set_xlim([min(x), max(x)])
#     ax.grid(True)

# #plot all 3 subplots for each activity
# for activity in np.unique(df['label']):
#     subset = df[df['label'] == activity][:SECONDS*SAMPLING_RATE] 
#     plot_activity(activity, subset)

In [5]:
# Define column name of the label vector
LABEL = 'ActivityEncoded'
# Transform the labels from String to Integer via LabelEncoder
le = preprocessing.LabelEncoder()
# Add a new column to the existing DataFrame with the encoded values
df[LABEL] = le.fit_transform(df['label'].values.ravel())

# PRE PROCESSING

- Make all the elements float if they need be
- Shuffle data frame
- Split to test and train set
- Normalize to a range
- TODO segment
- TODO balance

# Get only relevant subjects

In [6]:
#These are the indexes of the relevant subjects, see FILES
indexes = [0,1,2,7,8,9,13,14,15,16,17]
subjects = [FILES[x] for x in indexes]

#new dataframe with only the horses in subjects
df = df[df['filename'].isin(subjects)]  


# Splitting

In [7]:
# Splitting
def splitBySubject(data, name):
  '''
  Function to split train and test data by subject
  data = dataframe
  name = subject to put in test subset
  '''
  test = data[data['filename'].str.contains(name)]
  train = data[~data['filename'].str.contains(name)]
  return train, test


train, test = splitBySubject(df, 'Galoway')

print(len(train['filename'].unique()))
print(len(test['filename']. unique()))


8
3


# Feature scaling

In [8]:
train_x_max = train['Ax'].max()
train_y_max = train['Ay'].max()
train_z_max = train['Az'].max()

pd.options.mode.chained_assignment = None 

#divide all 3 axis with the max value in the training set
train['Ax'] = train['Ax'] / train_x_max
train['Ay'] = train['Ay'] / train_y_max
train['Az'] = train['Az'] / train_z_max


# Segmentation

In [9]:
#segmenting based on segments column
def create_segments(data_frame):
    
    segmented = data_frame.groupby('segment')
    segments = []
    for k, v in segmented:
        segments.append(v)
    return segments


train = create_segments(train)
test = create_segments(test)

train_segments = []
for i in train:
    trainlen = len(i)
    train_segments.append(trainlen)
    
test_segments = []
for i in test:
    testlen = len(i)
    test_segments.append(testlen)
    
train[1].shape
test[0].shape

(805, 11)

# Suffle Data

In [10]:
#shuffling the whole dataframe
def shuffle(data_frame):
    '''Function to shuffle dataframe'''
    return data_frame.sample(frac=1).reset_index(drop=True)


shuffled_train = []

for t in train:
    shuffled_train.append(shuffle(t))
 
shuffled_test = []

for t in train:
    shuffled_test.append(shuffle(t))
    
    
train = shuffled_train

# Apply one-hot coding

In [28]:
# for i in train:
#     i = pd.get_dummies(i.label, prefix="ola")


print(shuffled_train[66].label.unique())

# for i in train:
#     i.drop(drop_columns, axis=1, inplace=True)
#     i = i.astype('float32')
    
# for i in test:
#     i.drop(drop_columns, axis=1, inplace=True)
#     i = i.astype('float32')
    

# train = np_utils.to_categorical(train, 18)
# train = np_utils.to_categorical(train, 18)

['walking-rider']


In [30]:
pd.get_dummies(train[77].label)

Unnamed: 0,trotting-rider
73597,1
73598,1
73599,1
73600,1
73601,1
...,...
74592,1
74593,1
74594,1
74595,1
