In [1]:
from __future__ import print_function
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from IPython.display import display, HTML

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

import glob
import os

In [2]:
# Set some standard parameters upfront
pd.options.display.float_format = '{:.1f}'.format
sns.set() # Default seaborn look and feel
plt.style.use('ggplot')
# Same labels will be reused throughout the program
LABELS = ['running_rider', 'scared', 'standing', 'trotting_rider', 'unknown', 'walking_rider', 'head_shake', 'scratch_biting', 'walking_natural', 'grazing', 'running_natural', 'trotting_natural', 'eating', 'jumping', 'shaking', 'rolling', 'fighting', 'rubbing']
# The number of steps within one time segment
TIME_PERIODS = 100
# The steps to take from one segment to the next; if this value is equal to
# TIME_PERIODS, then there is no overlap between the segments
STEP_DISTANCE = 50

In [3]:
def read_data(file_path):

    column_names = ['Ax',
                    'Ay',
                    'Az',
                    'Gx',
                    'Gy',
                    'Gz',
                    'Mx',
                    'My',
                    'Mz',
                    'A3D',
                    'G3D',
                    'M3D',
                    'datetime',
                    'label',
                    'segment',
                    'subject']
    df = pd.read_csv(file_path,
                     header=None,
                     names=column_names)
    return df
 
def show_basic_dataframe_info(dataframe):
    # Shape and how many rows and columns
    print('Number of columns in the dataframe: %i' % (dataframe.shape[1]))
    print('Number of rows in the dataframe: %i\n' % (dataframe.shape[0]))

In [4]:
#allfiles = glob.glob('Data/*.csv')
#print(allfiles)
#df = pd.concat((pd.read_data(f) for f in allfiles), ignore_index=True)

df = pd.concat((read_data(f) for f in glob.glob('Data/*')), ignore_index=True)

  op = _Concatenator(


In [5]:
# Describe the data
show_basic_dataframe_info(df)


Number of columns in the dataframe: 16
Number of rows in the dataframe: 10763585



# PRE PROCESSING

- Make all the elements float if they need be
- Shuffle data frame
- Split to test and train set
- Normalize to a range
- TODO segment
- TODO balance

In [6]:
len(df)

10763585

In [7]:
#shuffling the whole dataframe
def shuffle(data_frame):
    return data_frame.sample(frac=1).reset_index(drop=True)


In [8]:
#splitting in to training and test data
def split_test_train(data_frame, ratio):
    ratio = len(data_frame) * ratio
    ratio = int(ratio)
    train = data_frame[:ratio]
    test = data_frame[ratio:]
    return train, test

In [9]:
#im guessing we need some sort of range to adapt the normalization but idk how for now

#normalize the training set
def normalize(train, axis):
#     pd.options.mode.chained_assignment = None  # default='warn'
#     train['x-axis'] = train['x-axis'] / train['x-axis'].max()
#     train['y-axis'] = train['y-axis'] / train['y-axis'].max()
#     train['z-axis'] = train['z-axis'] / train['z-axis'].max()
    for dim in axis:
        train[dim] = train[dim] / train[dim].max()
#     return train

In [10]:
#segmenting the data
def segment(data_frame, time_step, step, label_name):
    feature_nr = 16
    segments = []
    lables = []
    
    for i in range(0, len(data_frame)):
        xs = data_frame['Ax'].values[i: i + time_step]
        ys = data_frame['Ay'].values[i: i + time_step]
        zs = data_frame['Az'].values[i: i + time_step]
        segments.append([xs,ys,zs])
        
        label = stats.mode(data_frame[label_name][i: i + time_step])[0][0] #TODO check shape
        labels.append(label)
        
    segments = np.asarray(segments, dtype= np.float32).reshape(-1, time_step, feature_nr)      
    labels = np.asarray(labels)
    return segments, labels

In [11]:
#balancing the dataframe
def balanced_data_frame(data_frame):
    return -1
    

In [12]:
df

Unnamed: 0,Ax,Ay,Az,Gx,Gy,Gz,Mx,My,Mz,A3D,G3D,M3D,datetime,label,segment,subject
0,Ax,Ay,Az,Gx,Gy,Gz,Mx,My,Mz,A3D,G3D,M3D,datetime,label,segment,
1,-0.90504052734375,5.2003916015625,-8.32733056640625,1.46484375,-2.9296875,0.8544921875,,,,9.8593917422902,3.38511343356007,,2018-04-25 16:13:03.5924,standing,82926,
2,-0.9672919921875,5.14771728515625,-8.3991591796875,1.46484375,-2.9296875,0.8544921875,,,,9.89851109873476,3.38511343356007,,2018-04-25 16:13:03.6024,standing,82926,
3,-0.9289833984375,5.1333515625,-8.427890625,1.46484375,-2.99072265625,0.79345703125,,,,9.91179342025229,3.42341398007328,,2018-04-25 16:13:03.6124,standing,82926,
4,-1.03433203125,5.08067724609375,-8.45183349609375,1.3427734375,-2.8076171875,0.91552734375,,,,9.91547343678237,3.24406305320271,,2018-04-25 16:13:03.6224,standing,82926,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10763580,12.3,0.3,2.3,2.2,-75.4,13.8,,,,12.5,76.7,,2018-06-28 21:06:08.4734,walking-natural,140908,
10763581,11.7,0.1,2.4,11.8,-94.1,11.1,,,,11.9,95.5,,2018-06-28 21:06:08.4834,walking-natural,140908,
10763582,11.1,-0.2,2.8,-8.2,-97.1,11.3,,,,11.4,98.1,,2018-06-28 21:06:08.4933,walking-natural,140908,
10763583,10.1,-1.0,3.5,-41.8,-90.9,12.0,,,,10.8,100.8,,2018-06-28 21:06:08.5033,walking-natural,140908,


In [None]:
normalize(df, ['Ax', 'Ay', 'Az'])

In [30]:
strs = []
for a in df['Az']:
    if type(a) == str:
        strs.append(a)
strs[1:60]

['-8.32733056640625',
 '-8.3991591796875',
 '-8.427890625',
 '-8.45183349609375',
 '-8.40394775390625',
 '-8.33690771484375',
 '-8.46141064453125',
 '-8.3512734375',
 '-8.3512734375',
 '-8.33690771484375',
 '-8.34648486328125',
 '-8.38479345703125',
 '-8.43267919921875',
 '-8.57633642578125',
 '-8.42310205078125',
 '-8.44225634765625',
 '-8.39437060546875',
 '-8.31296484375',
 '-8.29859912109375',
 '-8.38958203125',
 '-8.42310205078125',
 '-8.3416962890625',
 '-8.447044921875',
 '-8.3512734375',
 '-8.3416962890625',
 '-8.255501953125',
 '-8.21240478515625',
 '-8.3225419921875',
 '-8.33690771484375',
 '-8.44225634765625',
 '-8.370427734375',
 '-8.4183134765625',
 '-8.38958203125',
 '-8.4566220703125',
 '-8.40394775390625',
 '-8.427890625',
 '-8.370427734375',
 '-8.34648486328125',
 '-8.35606201171875',
 '-8.5332392578125',
 '-8.447044921875',
 '-8.34648486328125',
 '-8.27944482421875',
 '-8.45183349609375',
 '-8.41352490234375',
 '-8.49971923828125',
 '-8.37521630859375',
 '-8.356062011

In [27]:
float(df['Az'].head(1))

ValueError: could not convert string to float: 'Az'

In [32]:
ds = []
for d in df['Az']:
    try:
         ds.append(float(d))
    except ValueError:
        pass