In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import os
import gc

## Configuration

In [2]:
# Global dask settings: process-based scheduler with 8 workers
dask.config.set(
    scheduler='processes',
    num_workers=8,
)

# --- Data locations ---
# Root path where the data is stored
SOURCE_PATH = '../../../data'
# Sub-directory of SOURCE_PATH holding the original data (appended to SOURCE_PATH, hence the leading '/')
ORIGINAL_DATA_DIR = '/EXOSAFE'
# Sub-directory of SOURCE_PATH holding the derived data (appended to SOURCE_PATH, hence the leading '/')
DERIVED_DATA_DIR = '/derived_data'

# --- Outputs ---
# ID of the training and test data resulting from this notebook, stored in DERIVED_DATA_DIR
RESULTS_ID = '0001_26032021'

# --- Split ---
# Fraction of the data reserved for the test set
TEST_SIZE = 0.3

# --- Hardware ---
# Number of force cells in the robotic leg
N_CELLS = 8

# --- Experiment params ---
DATE_EXPERIMENTS = '24022021'
N_EXPERIMENTS = 15


## Preprocessing

In [3]:
# Rotation applied to the force vector of each force cell to align them.
# Key: force cell number (1-based).
# Value: rotation angles in degrees, one per axis, ordered [X, Y, Z].
rotations = {
    1: [180, 90, 0],
    2: [180, 90, 0],
    3: [180, 0, -90],
    4: [0, 0, -90],
    5: [0, 0, 0],
    6: [0, 180, 0],
    7: [0, 90, 0],
    8: [0, 0, 90],
}

def rotate_vector(v, axis, angle):
    '''
    Rotate a 3D vector about one of the coordinate axes.

    The vector is right-multiplied by the standard rotation matrix of the
    selected axis (v.dot(R)); the same convention is used for the three axes.

    Args:
    - v (np.array): Vector to be rotated
    - axis (int): Axis along the rotation is performed (0: X, 1: Y, 2: Z)
    - angle (int): Rotation angle in degrees

    Returns:
    - (np.array): Rotated vector

    Raises:
    - ValueError: If axis is not 0, 1 or 2
    '''
    theta = np.radians(angle)
    c, s = np.cos(theta), np.sin(theta)

    if axis == 0:
        # X (the off-diagonal sines must have opposite signs, otherwise the
        # matrix is not orthogonal and the vector length is not preserved)
        rotation = np.array([[1, 0, 0], [0, c, -s], [0, s, c]])
    elif axis == 1:
        # Y
        rotation = np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]])
    elif axis == 2:
        # Z
        rotation = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
    else:
        raise ValueError('Invalid axis')

    return v.dot(rotation)

# NOTE(review): this function is passed to DataFrame.apply further down;
# confirm whether wrapping it with dask.delayed is actually required there.
@dask.delayed
def rotate_row(row):
    '''
    Rotate the force vector of every force cell contained in one data row.

    For each cell i in 1..N_CELLS the columns 'F<i>x', 'F<i>y', 'F<i>z' are
    rotated successively about the X, Y and Z axes by the angles stored in
    rotations[i].

    Because of the dask.delayed decorator, a direct call builds a lazy task
    instead of executing the body immediately.

    Args:
    - row (pd.Series): Row holding the force columns of all the cells

    Returns:
    - (pd.Series): Copy of the row with the rotated force vectors
    '''
    # Work on a copy: pandas does not support functions that mutate the
    # object handed over by apply
    rotated = row.copy()
    for i in range(1, N_CELLS + 1):
        cols = ['F{}x'.format(str(i)), 'F{}y'.format(str(i)), 'F{}z'.format(str(i))]
        # Chain the rotations about X (0), Y (1) and Z (2)
        for ax in range(3):
            rotated[cols] = rotate_vector(rotated[cols], ax, rotations[i][ax])

    return rotated

In [6]:
# Rotate the force vectors of every experiment and store the result as CSV files
for i in range(N_EXPERIMENTS):
    print('Processing file {}:'.format(i + 1))
    # Create the directory to save the resulting data
    # (exist_ok makes the cell safe to re-run without a separate existence check)
    save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))
    os.makedirs(save_dir, exist_ok=True)

    # Select only the relevant excel sheets
    sheets = ['ForceCells', 'H3processed']
    # Load the data (sheet_name is a list, so a dict of DataFrames keyed by sheet name is returned)
    # NOTE(review): the '0{}' prefix yields e.g. '010-' for experiments >= 10;
    # confirm this matches the actual file names.
    data_df = pd.read_excel(SOURCE_PATH + ORIGINAL_DATA_DIR + '/' + DATE_EXPERIMENTS + '/0{}-'.format(i + 1) + DATE_EXPERIMENTS + '.xlsx', sheet_name=sheets)

    forces_df = data_df[sheets[0]]
    # About 100 rows per partition; always request at least one partition so
    # sheets shorter than 100 rows do not ask dask for 0 partitions
    forces_ddf = dd.from_pandas(forces_df, npartitions=max(1, len(forces_df) // 100))

    # Apply the rotation matrix to each force vector
    forces_ddf = forces_ddf.apply(rotate_row, axis=1, meta=forces_ddf)

    with ProgressBar():
        data_df[sheets[0]] = forces_ddf.compute()

    data_df[sheets[0]].to_csv(save_dir + '/force_cells.csv', index=False)
    data_df[sheets[1]].to_csv(save_dir + '/H3_processed.csv', index=False)

    print('')


Processing file 1
[########################################] | 100% Completed | 58.5s
Processing file 2
[########################################] | 100% Completed | 59.0s
Processing file 3
[########################################] | 100% Completed | 57.4s
Processing file 4
[########################################] | 100% Completed | 58.6s
Processing file 5
[########################################] | 100% Completed | 57.7s
Processing file 6
[########################################] | 100% Completed | 59.1s
Processing file 7
[########################################] | 100% Completed | 57.6s
Processing file 8
[########################################] | 100% Completed | 58.7s
Processing file 9
[########################################] | 100% Completed | 58.8s
Processing file 10
[########################################] | 100% Completed | 58.7s
Processing file 11
[########################################] | 100% Completed | 58.0s
Processing file 12
[################################

## Features and target selection

In [7]:
# Read the derived CSV files of every experiment and stack them row-wise.
# Targets come from the force cells file, features from the H3 file.
targets_df_ls = []
features_df_ls = []
for i in tqdm(range(N_EXPERIMENTS)):
    data_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))

    targets_df_ls.append(pd.read_csv(data_dir + '/force_cells.csv'))
    features_df_ls.append(pd.read_csv(data_dir + '/H3_processed.csv'))

# The original per-file index labels are kept by the concatenation
targets_df = pd.concat(targets_df_ls)
features_df = pd.concat(features_df_ls)

100%|██████████| 15/15 [00:01<00:00, 10.41it/s]


In [8]:
# Rename the columns whose names contain typos (lower-case 'ankle')
typo_fixes = {
    'LankleTorque': 'LAnkleTorque',
    'RankleTorque': 'RAnkleTorque',
}
features_df = features_df.rename(columns=typo_fixes)

In [9]:
H3_LEG = 'L' # L|R

# Feature names: <leg><joint><magnitude>, joint-major order
joints = ['Hip', 'Knee', 'Ankle']
magnitudes = ['Pos', 'Vel', 'Acc', 'Torque']
features = ['{}{}{}'.format(H3_LEG, joint, magnitude) for joint in joints for magnitude in magnitudes]

# Target names: F<cell><axis> for every force cell (1-based) and axis
targets = ['F{}{}'.format(cell, axis) for cell in range(1, N_CELLS + 1) for axis in ['x', 'y', 'z']]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 12
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [10]:
# Keep only the selected feature and target columns
X = features_df.loc[:, features]
Y = targets_df.loc[:, targets]

print('X: {}, Y: {}'.format(X.shape, Y.shape))

X: (268576, 12), Y: (268576, 24)


## Null handling, split and normalization

In [14]:
# Drop the rows with null values in any feature.
# The mask is built from X only and then applied to both X and Y.
idx = X.notna().all(axis=1)
n_dropped = (~idx).sum()
print('Droping {} data points by null features'.format(n_dropped))

# From here on X and Y are plain numpy arrays
X = X.loc[idx].values
Y = Y.loc[idx].values
print('X: {}, Y: {}'.format(X.shape, Y.shape))

Droping 30 data points by null features
X: (268546, 12), Y: (268546, 24)


In [15]:
# Split with a fixed random_state so the partition is the same on every run
split = train_test_split(X, Y, test_size=TEST_SIZE, random_state=0)
X_train, X_test, Y_train, Y_test = split

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (187982, 12), Y: (187982, 24)
Test -> X: (80564, 12), Y: (80564, 24)


In [16]:
# Fit the scaler on the training features only, then apply it to both sets
scaler = StandardScaler()
scaler.fit(X_train)

X_train_norm, X_test_norm = (scaler.transform(part) for part in (X_train, X_test))

## Save data

In [17]:
# Directory shared by all the experiments of DATE_EXPERIMENTS
save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# File name prefix -> array; the features are saved normalized, the targets as they are
arrays_to_save = [
    ('/X_train_', X_train_norm),
    ('/X_test_', X_test_norm),
    ('/Y_train_', Y_train),
    ('/Y_test_', Y_test),
]
for prefix, array in arrays_to_save:
    np.save(save_dir + prefix + RESULTS_ID + '.npy', array)