In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import math
import json
import os
import gc

## Configuration

In [2]:
dask.config.set(num_workers=8, scheduler='processes')

# Path where the data is stored
SOURCE_PATH = '../../../data'
# Directory inside SOURCE_PATH where the original data is stored
ORIGINAL_DATA_DIR = '/EXOSAFE'
# Directory inside SOURCE_PATH where the derived data is stored
DERIVED_DATA_DIR = '/derived_data'

# ID fo the training and test data resulting from this notebook, stored in DERIVED_DATA_DIR
RESULTS_ID = '0002_31032021'

# % of the data for the test set
TEST_SIZE = 0.3

# Number of force cells in the robotic leg
N_CELLS = 8

# Experiment params
DATE_EXPERIMENTS = '24022021'
N_EXPERIMENTS = 15


## Preprocessing

In [17]:
# Rotate force vectors of each force cell to align them
rotations = {
    1: [180, 90, 0],
    2: [180, 90, 0],
    3: [180, 0, -90],
    4: [0, 0, -90],
    5: [0, 0, 0],
    6: [0, 180, 0],
    7: [0, 90, 0],
    8: [0, 0, 90],
}

def rotate_vector(v, axis, angle):
    '''
    Args:
    - v (np.array): Vector to be rotated
    - axis (int): Axis along the rotation is performed
    - angle (int): Rotation angle
    
    Returns:
    - (np.array)): Rotated vector
    '''
    if axis == 0:
        # X
        v = v.dot(np.array([[1, 0, 0], [0, np.cos(np.radians(angle)), np.sin(np.radians(angle))], [0, np.sin(np.radians(angle)), np.cos(np.radians(angle))]]))
    elif axis == 1:
        # Y
        v = v.dot(np.array([[np.cos(np.radians(angle)), 0, np.sin(np.radians(angle))], [0, 1, 0], [-np.sin(np.radians(angle)), 0, np.cos(np.radians(angle))]]))
    elif axis == 2:
        # Z
        v = v.dot(np.array([[np.cos(np.radians(angle)), -np.sin(np.radians(angle)), 0], [np.sin(np.radians(angle)), np.cos(np.radians(angle)), 0], [0, 0, 1]]))
    else:
        raise ValueError('Invalid axis')

    return v

@dask.delayed
def rotate_row(row):
    '''
    Rotate the force vectors in a row. Dask function.
    '''
    for i in range(1, N_CELLS + 1):
        cols = ['F{}x'.format(str(i)), 'F{}y'.format(str(i)), 'F{}z'.format(str(i))]
        for ax in range(3):
            row[cols] = rotate_vector(row[cols], ax, rotations[i][ax])
            
    return row

def process_parameters_sheet(params_df):
    '''
    Process the data in the given pd.DataFrame from the raw excel sheet. 
    
    Args:
    - params_df (pd.DataFrame): DataFrame of the parameters excel sheet.
    
    Returns:
    - params_dict (dict): Dictionary with all the parameter in the input DataFrame.
    '''
    params_dict = {}
    params_dict['ExoHipMissalign'] = params_df.iloc[2, 1]
    params_dict['ExoKneeMissalign'] = params_df.iloc[2, 2]
    params_dict['MarchVelocity'] = params_df.iloc[0, 11]
    params_dict['TimeShift'] = params_df.iloc[0, 12]
    params_dict['SkinConfig'] = params_df.iloc[0, 13]
    
    return params_dict

def shift_leg_data(df, time_shift, total_len, data_res=0.01):
    '''
    Shift the data from the leg replica using the known time_shift from the experiment
    parameters to match the exoskeleton data in time and lenght.
    
    Args:
    - df (pd.DataFrame): DataFrame with the data of the leg replica
    - time_shift (float): Shifting time to applied to the data.
    - total_len (int): Total desired lenght for the data.
    - data_res (float): Data resolution (in seconds).
    
    Returns:
    - (pd.DataFrame): DataFrame with the data of the leg replica shifted.
    '''
    idx_start = math.ceil(time_shift / data_res)
    idx_end = total_len + idx_start
    return df.iloc[idx_start:idx_end].reset_index(drop=True)

In [4]:
for i in range(N_EXPERIMENTS):
    print('Processing file {}:'.format(i + 1))
    # Create the directory to save the resulting data
    save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # Select only the relevant excel sheets 
    sheets = ['Parameters', 'RawForces', 'ForceCells', 'H3raw', 'H3processed', 'Leg-Replica']
    # Load the data
    data_df = pd.read_excel(SOURCE_PATH + ORIGINAL_DATA_DIR + '/' + DATE_EXPERIMENTS + '/0{}-'.format(i + 1) + DATE_EXPERIMENTS + '.xlsx', sheet_name=sheets)

    # Pre-process the data
    data_df[sheets[0]] = process_parameters_sheet(data_df[sheets[0]])
    
    # Apply the rotation matrix to each force vector
    forces_ddf = dd.from_pandas(data_df[sheets[2]], npartitions=int(len(data_df[sheets[2]]) / 100))
    forces_ddf = forces_ddf.apply(rotate_row, axis=1, meta=forces_ddf)
    with ProgressBar():
        data_df[sheets[2]] = forces_ddf.compute()
        
        
    leg_df_raw = data_df[sheets[5]].iloc[:, :3]
    # Correct the time shift between the data from the leg and the data from the exo
    leg_df_processed = shift_leg_data(data_df[sheets[5]].iloc[:, 3:], data_df[sheets[0]]['TimeShift'], len(data_df[sheets[4]]))
    
    assert(len(leg_df_processed) == len(data_df[sheets[4]]))    

    json.dump(data_df[sheets[0]], open(save_dir + '/parameters.json', 'w'))
    data_df[sheets[1]].to_csv(save_dir + '/force_cells_raw.csv', index=False)
    data_df[sheets[2]].to_csv(save_dir + '/force_cells_processed.csv', index=False)
    data_df[sheets[3]].to_csv(save_dir + '/H3_raw.csv', index=False)
    data_df[sheets[4]].to_csv(save_dir + '/H3_processed.csv', index=False)
    leg_df_raw.to_csv(save_dir + '/leg_raw.csv', index=False)
    leg_df_processed.to_csv(save_dir + '/leg_processed.csv', index=False)
    
    print('')


Processing file 1:
[########################################] | 100% Completed |  1min  2.9s

Processing file 2:
[########################################] | 100% Completed |  1min  1.3s

Processing file 3:
[########################################] | 100% Completed |  1min  1.6s

Processing file 4:
[########################################] | 100% Completed |  1min  1.1s

Processing file 5:
[########################################] | 100% Completed |  1min  0.4s

Processing file 6:
[########################################] | 100% Completed |  1min  1.8s

Processing file 7:
[########################################] | 100% Completed |  1min  0.6s

Processing file 8:
[########################################] | 100% Completed |  1min  1.7s

Processing file 9:
[########################################] | 100% Completed |  1min  1.8s

Processing file 10:
[########################################] | 100% Completed |  1min  1.4s

Processing file 11:
[######################################

## Features and target selection

In [12]:
targets_df_ls = []
features_df_ls = []
for i in tqdm(range(N_EXPERIMENTS)):
    data_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))
    
    targets_df = pd.read_csv(data_dir + '/force_cells_processed.csv')
    targets_df_ls.append(targets_df)
    
    exo_df = pd.read_csv(data_dir + '/H3_processed.csv')
    leg_df = pd.read_csv(data_dir + '/leg_processed.csv')
    print(exo_df.shape, leg_df.shape)
    features_df = pd.concat([exo_df, leg_df], axis=1)
    features_df_ls.append(features_df)

targets_df = pd.concat(targets_df_ls, axis=0)
features_df = pd.concat(features_df_ls, axis=0)

 13%|█▎        | 2/15 [00:00<00:01,  8.63it/s]

(17954, 18) (18637, 3)
(17982, 18) (18582, 3)


 27%|██▋       | 4/15 [00:00<00:01,  8.76it/s]

(17680, 18) (19529, 3)
(17959, 18) (18555, 3)


 40%|████      | 6/15 [00:00<00:01,  8.66it/s]

(17749, 18) (18606, 3)
(18027, 18) (18610, 3)


 53%|█████▎    | 8/15 [00:00<00:00,  8.65it/s]

(17770, 18) (18644, 3)
(18044, 18) (18681, 3)


 67%|██████▋   | 10/15 [00:01<00:00,  8.58it/s]

(18038, 18) (18653, 3)
(17905, 18) (19021, 3)


 80%|████████  | 12/15 [00:01<00:00,  8.34it/s]

(17805, 18) (18610, 3)
(18079, 18) (18576, 3)


 93%|█████████▎| 14/15 [00:01<00:00,  8.35it/s]

(17676, 18) (18520, 3)
(17987, 18) (18637, 3)


100%|██████████| 15/15 [00:01<00:00,  8.55it/s]

(17921, 18) (18692, 3)





In [11]:
features_df

Unnamed: 0,LHipPos,LKneePos,LAnklePos,LHipVel,LKneeVel,LAnkleVel,LHipAcc,LKneeAcc,LAnkleAcc,RHipPos,...,RAnklePos,LHipTorque,LKneeTorque,LAnkleTorque,RHipTorque,RKneeTorque,RAnkleTorque,LegKneePositionFiltered,LegKneeVelocityFiltered,LegKneeTorqueFiltered
0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,3.0,-2.0,-2.0,0.000000e+00,2.0,0.515107,-0.300000,-11.509124
1,1.299052e-07,-2.298261e-07,-5.414176e-08,-3.434135e-07,-6.993128e-07,0.000003,-0.000002,0.000006,0.000026,0.000139,...,-7.800055e-07,-1.104156e-07,3.0,-2.0,-2.0,8.712907e-09,2.0,0.674303,-0.520887,-11.483704
2,2.568636e-07,-4.596521e-07,-9.074711e-08,-7.141649e-07,-1.398625e-06,0.000006,-0.000004,0.000012,0.000057,0.000287,...,-1.558817e-06,-2.208312e-07,3.0,-2.0,-2.0,1.742581e-08,2.0,0.658745,-0.566428,-11.475699
3,3.806841e-07,-6.894781e-07,-1.074613e-07,-1.112411e-06,-2.097935e-06,0.000010,-0.000007,0.000019,0.000093,0.000444,...,-2.336275e-06,-3.312468e-07,3.0,-2.0,-2.0,2.613872e-08,2.0,0.648140,-0.570789,-11.475418
4,5.011630e-07,-9.193040e-07,-1.016134e-07,-1.538627e-06,-2.797245e-06,0.000014,-0.000010,0.000025,0.000135,0.000612,...,-3.112197e-06,-4.416624e-07,3.0,-2.0,-2.0,3.485163e-08,2.0,0.603196,-0.550283,-11.477499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18687,,,,,,,,,,,...,,,,,,,,-1.441310,-0.521679,-46.131497
18688,,,,,,,,,,,...,,,,,,,,-1.437976,-0.575845,-46.121120
18689,,,,,,,,,,,...,,,,,,,,-1.445230,-0.656832,-46.132762
18690,,,,,,,,,,,...,,,,,,,,-1.452764,-0.678228,-46.113721


In [6]:
# Rename columns to manage with some typos
features_df = features_df.rename(columns={'LankleTorque': 'LAnkleTorque', 'RankleTorque': 'RAnkleTorque'})

In [7]:
H3_LEG = 'L' # L|R

features = [H3_LEG + a + m for a in ['Hip', 'Knee', 'Ankle'] for m in ['Pos', 'Vel', 'Acc', 'Torque']] + ['LegKnee{}Filtered'.format(m) for m in ['Position', 'Velocity', 'Torque']]
targets = ['F' + str(i + 1) + ax for i in range(N_CELLS) for ax in ['x', 'y', 'z']]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 15
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque', 'LegKneePositionFiltered', 'LegKneeVelocityFiltered', 'LegKneeTorqueFiltered']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [8]:
X = features_df[features]
Y = targets_df[targets]

print('X: {}, Y: {}'.format(X.shape, Y.shape))

X: (280553, 15), Y: (268576, 24)


## Nulls handeling, split and normalization

In [14]:
# Drop null values
idx = X.notna().all(axis=1)
print('Droping {} data points by null features'.format(len(idx[idx == False])))

X = X.loc[idx].values
Y = Y.loc[idx].values
print('X: {}, Y: {}'.format(X.shape, Y.shape))

Droping 30 data points by null features
X: (268546, 12), Y: (268546, 24)


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=0)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (187982, 12), Y: (187982, 24)
Test -> X: (80564, 12), Y: (80564, 24)


In [16]:
scaler = StandardScaler().fit(X_train)

X_train_norm = scaler.transform(X_train)
X_test_norm =  scaler.transform(X_test)

## Save data

In [17]:
save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

np.save(save_dir + '/X_train_' + RESULTS_ID + '.npy', X_train_norm)    
np.save(save_dir + '/X_test_' + RESULTS_ID + '.npy', X_test_norm)    
np.save(save_dir + '/Y_train_' + RESULTS_ID + '.npy', Y_train)    
np.save(save_dir + '/Y_test_' + RESULTS_ID + '.npy', Y_test)    