# Data Augmentation
## The purpose of this notebook is to take both prepped datasets and combine them into a single dataset ready for model training

In [76]:
# Import required libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import datetime
import os.path

In [77]:
# Tunable settings for this run
WINDOW_SIZE = 20  # window length (in rows) handed to to_sequences
# Columns extracted from both datasets. Derivative columns can be appended when needed:
# 'back_1der', 'left_1der', 'right_1der', 'back_2der', 'left_2der', 'right_2der'
SENSORS = [
    'back_angle',
    'left_angle',
    'right_angle',
]
PLOT_ON = False  # set True to run the plotting cells

### First retrieve the patient data

In [78]:
# Load the prepared patient recording into a DataFrame, parsing 'datetime' as timestamps
# NOTE(review): absolute, machine-specific location - consider making it configurable
patient_file_path = '/Users/jamesmeyer/University of Bath/Patient Simulator FYP - General/datasets/patient/'
patient_file_name = 'patient_data.csv'
patient_csv = patient_file_path + patient_file_name
dataframe = pd.read_csv(patient_csv, parse_dates=['datetime'])
print(dataframe.head())

   action  subject  frame  index  back_angle  left_angle  right_angle  \
0     NaN      NaN    NaN    0.0    0.424307    0.572584     0.215395   
1     NaN      NaN    NaN    1.0    0.456290    0.568733     0.200416   
2     NaN      NaN    NaN    2.0    0.488273    0.564883     0.200416   
3     NaN      NaN    NaN    3.0    0.530917    0.557181     0.201110   
4     NaN      NaN    NaN    4.0    0.573561    0.541779     0.200416   

                 datetime  back_1der  left_1der  right_1der     back_2der  \
0 2011-12-01 11:04:05.000   0.028175   0.000385   -0.011288  4.568992e-03   
1 2011-12-01 11:04:05.050   0.032744  -0.003466   -0.007107  4.568992e-03   
2 2011-12-01 11:04:05.100   0.037313  -0.007316   -0.002926  4.568992e-03   
3 2011-12-01 11:04:05.150   0.038380  -0.013092    0.000083 -2.775558e-16   
4 2011-12-01 11:04:05.200   0.037313  -0.018868   -0.000083 -4.568992e-03   

   left_2der  right_2der  
0  -0.003851    0.004181  
1  -0.003851    0.004181  
2  -0.003851    0

In [79]:
# Keep only the selected sensor columns plus the timestamp (as 'Date').
# .assign() returns a new, independent frame, so the Date column is never written
# onto a slice of `dataframe` (the pattern pandas flags with SettingWithCopyWarning).
# Column order is unchanged: the SENSORS columns first, then 'Date'.
patient_df = dataframe.loc[:, SENSORS].assign(
    Date=pd.to_datetime(dataframe['datetime'])
)
patient_df.tail()

Unnamed: 0,left_angle,right_angle,Date
38182,0.276088,0.527046,2011-12-01 11:35:54.100
38183,0.276088,0.525659,2011-12-01 11:35:54.150
38184,0.276088,0.524272,2011-12-01 11:35:54.200
38185,0.276088,0.524272,2011-12-01 11:35:54.250
38186,0.276088,0.524272,2011-12-01 11:35:54.300


In [80]:
# Quick visual check of the patient data: one subplot per column, against 'Date'
if PLOT_ON:
    patient_df.plot(
        x='Date',
        subplots=True,
        figsize=(16, 16),
    )

In [81]:
# Partition the patient recording around a single cutoff timestamp:
# rows at or before the cutoff go to train, later rows go to test
split_cutoff = '2011-12-01 11:38:00'
recorded_at = patient_df['Date']
patient_train = patient_df.loc[recorded_at <= split_cutoff]
patient_test = patient_df.loc[recorded_at > split_cutoff]
print("Not really being used at this stage")

Not really being used at this stage


In [82]:
# TODO: decide whether extra normalising/scaling is needed here (e.g. for the
# derivative columns). Below is a disabled sketch that standardises 'back_angle'
# with a StandardScaler fitted on the training split only.
# NOTE(review): the sketch refers to `train`/`test`, but the split cell names the
# frames `patient_train`/`patient_test` - rename before re-enabling.
# scaler = StandardScaler()
# scaler = scaler.fit(train[['back_angle']])

# train['back_angle'] = scaler.transform(train[['back_angle']])
# print(test)
# test['back_angle'] = scaler.transform(test[['back_angle']])
print("We might need to do some additional normalising or scaling here (derivatives)")
# train

We might need to do some additional normalising or scaling here (derivatives)


### Then retrieve the control data

In [83]:
# Where the processed control data lives
# NOTE(review): absolute, machine-specific location - consider making it configurable
control_file_path = '/Users/jamesmeyer/University of Bath/Patient Simulator FYP - General/datasets/control/'
control_file_name = 'control_data.csv'

# Read the CSV into a DataFrame (index_col=False: no column is used as the index)
control_csv = control_file_path + control_file_name
control_df = pd.read_csv(control_csv, index_col=False)
control_df.head()

Unnamed: 0,action,subject,frame,back_angle,left_angle,right_angle,back_1der,left_1der,right_1der,back_2der,left_2der,right_2der
0,1,1,1,0.309028,0.052406,0.99199,-0.059143,-0.018081,0.000857,0.024875,0.006771,-1.4e-05
1,1,1,2,0.212165,0.023802,0.993044,-0.034267,-0.01131,0.000843,0.024875,0.006771,-1.4e-05
2,1,1,3,0.251407,0.033033,0.993763,-0.009392,-0.004539,0.00083,0.024875,0.006771,-1.4e-05
3,1,1,4,0.232627,0.025384,0.994412,0.006291,-0.000403,0.000699,-0.008194,-0.00249,-8.1e-05
4,1,1,5,0.251839,0.028918,0.995454,0.002175,-0.001812,0.000567,0.005269,0.001213,-0.000224


In [84]:
# Plot every subject's angles and derivatives for action 1 on a 3x3 grid
if PLOT_ON:
    subject_nos = list(range(1, 11))
    fig, axs = plt.subplots(3, 3, figsize=(16, 16))
    fig.suptitle('Action: 1 - Drinking')

    # Grid layout: one column per joint, one row per signal
    # (angle, first derivative, second derivative)
    column_titles = ['Back', 'Left', 'Right']
    row_labels = [
        'Normalised Angle',
        'RoC of Normalised Angle',
        'RoC of RoC of Normalised Angle',
    ]
    grid_columns = [
        ['back_angle', 'left_angle', 'right_angle'],
        ['back_1der', 'left_1der', 'right_1der'],
        ['back_2der', 'left_2der', 'right_2der'],
    ]

    # Titles (top row) and y-labels (left column) do not depend on the subject
    for col, title in enumerate(column_titles):
        axs[0, col].set_title(title)
    for row, row_label in enumerate(row_labels):
        axs[row, 0].set_ylabel(row_label)

    is_action = control_df['action'] == 1
    for subject in subject_nos:
        angle_df = control_df.loc[is_action & (control_df['subject'] == subject)]

        for row, column_names in enumerate(grid_columns):
            for col, column_name in enumerate(column_names):
                # Only the bottom-right axes carries labels; the shared legend reads them
                is_legend_axes = (row, col) == (2, 2)
                plot_kwargs = {'label': f'Subject {subject}'} if is_legend_axes else {}
                axs[row, col].plot(angle_df[column_name], **plot_kwargs)

    # fig.axes[-1] is the bottom-right subplot, the one holding the subject labels
    lines, labels = fig.axes[-1].get_legend_handles_labels()
    fig.legend(lines, labels, loc = 'center right')

### Now to convert both to windowed sequences and combine for model training

In [85]:
def to_sequences(x, y, seq_size=1):
    '''Slice x into overlapping windows and pair each window with the next row of y.

    Window i covers rows i .. i+seq_size-1 of x and its target is row i+seq_size
    of y, so len(x) - seq_size windows are produced (the last seq_size rows have
    no following target and therefore never start a window).

    Parameters
    ----------
    x : pandas DataFrame/Series or array-like, rows along the first axis
    y : pandas DataFrame/Series or array-like, at least as long as x
    seq_size : int, window length in rows (must be >= 1)

    Returns
    -------
    (x_values, y_values) : tuple of np.ndarray
        x_values has shape (n_windows, seq_size, *x.shape[1:]) and y_values has
        shape (n_windows, *y.shape[1:]). If x is too short to hold a single
        window, both arrays keep those trailing dimensions with n_windows == 0,
        so results can still be concatenated along axis 0.

    Raises
    ------
    ValueError : if seq_size < 1
    IndexError : if y is too short to supply a target for every window
    '''
    if seq_size < 1:
        raise ValueError(f'seq_size must be >= 1, got {seq_size}')

    x_arr = np.asarray(x)
    y_arr = np.asarray(y)

    n_windows = max(len(x_arr) - seq_size, 0)
    if n_windows and len(y_arr) < n_windows + seq_size:
        raise IndexError('y is too short to provide a target for every window')

    # Row i of window_rows holds the row numbers i .. i+seq_size-1; indexing x_arr
    # with it gathers every window at once (and yields a copy, not a view).
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_size)[None, :]
    x_values = x_arr[window_rows]

    # Targets are the rows immediately after each window; copy so the result
    # does not alias the caller's data.
    y_values = y_arr[seq_size:seq_size + n_windows].copy()

    return x_values, y_values

In [86]:
# Window the patient training rows; the targets returned alongside are discarded
patient_features = patient_train[SENSORS]
patientX, _ = to_sequences(patient_features, patient_features, WINDOW_SIZE)

In [87]:
# Prints the full array shape (windows, rows per window, columns), not just a count.
# NOTE(review): the stored output shows (..., 120, 2), which does not match
# WINDOW_SIZE = 20 and the three SENSORS columns - re-run before relying on it.
print(f'Number of patient training samples: {patientX.shape}')

Number of patient training samples: (38067, 120, 2)


In [88]:
# The control data is segmented by individual and action, so windows must never
# straddle two recordings: each (action, subject) pair is windowed on its own.

action_nos = [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 14]
subject_nos = list(range(1,11))

extract_data = []
for action in action_nos:
    rows_for_action = control_df['action'] == action
    for subject in subject_nos:
        rows_for_trial = rows_for_action & (control_df['subject'] == subject)
        trial = control_df.loc[rows_for_trial, SENSORS]
        trial_windows, _ = to_sequences(trial, trial, WINDOW_SIZE)
        # Append this trial's windows one by one
        extract_data.extend(trial_windows)

# Stack all collected windows into a single array
controlX = np.array(extract_data)

In [89]:
# Prints the full array shape of the stacked control windows, not just a count
print(f'Number of control training samples: {controlX.shape}')

Number of control training samples: (2382, 120, 2)


# IS THERE PREPROCESSING THAT NEEDS TO HAPPEN HERE?

In [90]:
# Stack patient windows first, then control windows, along the sample axis
augX = np.concatenate((patientX, controlX), axis=0)
print(f'Number of combined training samples: {augX.shape}')

Number of combined training samples: (40449, 120, 2)


In [91]:
# Destination folder and identifier used to name the saved arrays
output_file_path = '/Users/jamesmeyer/University of Bath/Patient Simulator FYP - General/datasets/'

# True as soon as any selected column name contains 'der' (a derivative column)
deriv_flag = any('der' in sensor_name for sensor_name in SENSORS)

# Identifier encodes window size, number of columns and whether derivatives are included
output_id = f'ws{WINDOW_SIZE}num{len(SENSORS)}der{deriv_flag}'

In [92]:
# Common prefix for every saved array
path = output_file_path + output_id

# Each output file paired with the array written to it
save_targets = [
    (f'{path}-patient-shaved.npy', patientX),
    (f'{path}-control.npy', controlX),
    (f'{path}-augment-shaved.npy', augX),
]
for target_file, windows in save_targets:
    np.save(target_file, windows)

print(f'Saved to: {path}')

Saved to: /Users/jamesmeyer/University of Bath/Patient Simulator FYP - General/datasets/ws120num2derFalse
