In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import random
import math
import json
import os
import gc

## Configuration

In [2]:
# Run dask computations on the 'processes' scheduler with 8 workers
dask.config.set(num_workers=8, scheduler='processes')
# Seed Python's `random` module (used later to shuffle the experiments for the train/test split)
random.seed(0)

# Path where the data is stored
SOURCE_PATH = '../../../data'
# Directory inside SOURCE_PATH where the original data is stored.
# The leading '/' is needed because paths are built by plain string concatenation.
ORIGINAL_DATA_DIR = '/EXOSAFE'
# Directory inside SOURCE_PATH where the derived data is stored (same leading '/' convention)
DERIVED_DATA_DIR = '/derived_data'

# Number of force cells in the robotic leg
N_CELLS = 8

# Experiment params
# Date tag of the experiments: used as sub-directory name and as part of the excel file names
# (presumably DDMMYYYY -- TODO confirm)
DATE_EXPERIMENTS = '24022021'
# Number of experiment files to process (numbered 1..N_EXPERIMENTS)
N_EXPERIMENTS = 15


# ID of the training and test data resulting from this notebook, stored in DERIVED_DATA_DIR
RESULTS_ID = '0003_08042021'

# Fraction of the experiments held out for the test set (the split is done per experiment,
# not per data point)
TEST_SIZE = 0.3

## Preprocessing

In [3]:
# Rotate force vectors of each force cell to align them.
# Keys are the force-cell ids (1..N_CELLS). Each value holds the rotation angles, in degrees,
# that `rotate_row` applies successively about axis 0 (X), axis 1 (Y) and axis 2 (Z).
rotations = {
    1: [180, 90, 0],
    2: [180, 90, 0],
    3: [180, 0, -90],
    4: [0, 0, -90],
    5: [0, 0, 0],
    6: [0, 180, 0],
    7: [0, 90, 0],
    8: [0, 0, 90],
}

def rotate_vector(v, axis, angle):
    '''
    Rotate a 3D vector about one of the coordinate axes.

    The vector is treated as a row vector and right-multiplied by the rotation
    matrix (`v.dot(R)`), with the same matrix convention for the three axes.

    Args:
    - v (np.array): Vector to be rotated
    - axis (int): Axis along the rotation is performed (0: X, 1: Y, 2: Z)
    - angle (int): Rotation angle, in degrees

    Returns:
    - (np.array): Rotated vector

    Raises:
    - ValueError: If axis is not 0, 1 or 2
    '''
    theta = np.radians(angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    if axis == 0:
        # X -- the off-diagonal sines must have opposite signs, otherwise the
        # matrix is not orthogonal and the vector norm is not preserved
        rotation = np.array([[1, 0, 0], [0, cos_t, -sin_t], [0, sin_t, cos_t]])
    elif axis == 1:
        # Y
        rotation = np.array([[cos_t, 0, sin_t], [0, 1, 0], [-sin_t, 0, cos_t]])
    elif axis == 2:
        # Z
        rotation = np.array([[cos_t, -sin_t, 0], [sin_t, cos_t, 0], [0, 0, 1]])
    else:
        raise ValueError('Invalid axis')

    return v.dot(rotation)

@dask.delayed
def rotate_row(row):
    '''
    Align every force vector stored in a row. Dask function.

    For each force cell, its three force columns are rotated successively about
    axes 0, 1 and 2 with the angles configured in `rotations`. The row is
    updated in place and returned.
    '''
    for cell in range(1, N_CELLS + 1):
        cell_id = str(cell)
        cols = [pattern.format(cell_id) for pattern in ('F{}x', 'F{}y', 'F{}z')]
        angles = rotations[cell]
        for ax in (0, 1, 2):
            rotated = rotate_vector(row[cols], ax, angles[ax])
            row[cols] = rotated

    return row

def process_parameters_sheet(params_df):
    '''
    Extract the experiment parameters from the raw parameters excel sheet.

    Every parameter is read from a fixed (row, column) position of the sheet.

    Args:
    - params_df (pd.DataFrame): DataFrame of the parameters excel sheet.

    Returns:
    - params_dict (dict): Dictionary mapping each parameter name to the value
      found in the input DataFrame.
    '''
    # Parameter name -> (row, column) position inside the sheet
    cell_positions = (
        ('ExoHipMissalign', (2, 1)),
        ('ExoKneeMissalign', (2, 2)),
        ('MarchVelocity', (0, 11)),
        ('TimeShift', (0, 12)),
        ('SkinConfig', (0, 13)),
    )

    params_dict = {
        name: params_df.iloc[row, col]
        for name, (row, col) in cell_positions
    }
    return params_dict

def shift_leg_data(df, time_shift, total_len, data_res=0.01):
    '''
    Shift the data from the leg replica using the known time_shift from the experiment
    parameters to match the exoskeleton data in time and length.

    Args:
    - df (pd.DataFrame): DataFrame with the data of the leg replica
    - time_shift (float): Shifting time to apply to the data (in seconds).
    - total_len (int): Total desired length for the data.
    - data_res (float): Data resolution (in seconds).

    Returns:
    - (pd.DataFrame): DataFrame with the data of the leg replica shifted, re-indexed from 0.
      It holds at most total_len rows (fewer if df is too short).
    '''
    # Round the quotient before taking the ceiling: binary floating point makes e.g.
    # 0.07 / 0.01 evaluate to 7.000000000000001, which would otherwise be ceiled to 8
    # and shift the data by one sample too many.
    n_samples_shift = round(time_shift / data_res, 9)
    idx_start = math.ceil(n_samples_shift)
    idx_end = total_len + idx_start
    return df.iloc[idx_start:idx_end].reset_index(drop=True)

In [4]:
# Pre-process every experiment file: rotate the force-cell vectors, time-align the leg
# replica data and store each excel sheet as a separate file in the derived data directory.
for i in range(N_EXPERIMENTS):
    print('Processing file {}:'.format(i + 1))
    # Create the directory to save the resulting data
    # (exist_ok avoids the check-then-create race of a separate os.path.exists test)
    save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))
    os.makedirs(save_dir, exist_ok=True)

    # Select only the relevant excel sheets
    sheets = ['Parameters', 'RawForces', 'ForceCells', 'H3raw', 'H3processed', 'Leg-Replica']
    # Load the data (a dict: sheet name -> DataFrame)
    # NOTE(review): the file name always gets a literal '0' prefix, also for ids >= 10
    # (e.g. '010-...') -- confirm this matches the files on disk.
    data_df = pd.read_excel(SOURCE_PATH + ORIGINAL_DATA_DIR + '/' + DATE_EXPERIMENTS + '/0{}-'.format(i + 1) + DATE_EXPERIMENTS + '.xlsx', sheet_name=sheets)

    # Pre-process the data: the parameters sheet is replaced by a plain dict
    data_df[sheets[0]] = process_parameters_sheet(data_df[sheets[0]])

    # Apply the rotation matrix to each force vector.
    # One partition per ~100 rows, but never fewer than one: a sheet with less than
    # 100 rows would otherwise request 0 partitions.
    n_force_rows = len(data_df[sheets[2]])
    forces_ddf = dd.from_pandas(data_df[sheets[2]], npartitions=max(1, n_force_rows // 100))
    forces_ddf = forces_ddf.apply(rotate_row, axis=1, meta=forces_ddf)
    with ProgressBar():
        data_df[sheets[2]] = forces_ddf.compute()

    # The first three columns of the leg sheet are kept as raw data
    leg_df_raw = data_df[sheets[5]].iloc[:, :3]
    # Correct the time shift between the data from the leg and the data from the exo
    leg_df_processed = shift_leg_data(data_df[sheets[5]].iloc[:, 3:], data_df[sheets[0]]['TimeShift'], len(data_df[sheets[4]]))

    # Leg, exoskeleton and force-cell data must end up with the same number of samples
    assert(len(leg_df_processed) == len(data_df[sheets[4]]) == len(data_df[sheets[2]]))

    # Use a context manager so the parameters file is flushed and closed deterministically
    with open(save_dir + '/parameters.json', 'w') as params_file:
        json.dump(data_df[sheets[0]], params_file)
    data_df[sheets[1]].to_csv(save_dir + '/force_cells_raw.csv', index=False)
    data_df[sheets[2]].to_csv(save_dir + '/force_cells_processed.csv', index=False)
    data_df[sheets[3]].to_csv(save_dir + '/H3_raw.csv', index=False)
    data_df[sheets[4]].to_csv(save_dir + '/H3_processed.csv', index=False)
    leg_df_raw.to_csv(save_dir + '/leg_raw.csv', index=False)
    leg_df_processed.to_csv(save_dir + '/leg_processed.csv', index=False)

    print('')


Processing file 1:
[########################################] | 100% Completed |  1min  6.0s

Processing file 2:
[########################################] | 100% Completed |  1min  4.3s

Processing file 3:
[########################################] | 100% Completed |  1min  3.8s

Processing file 4:
[########################################] | 100% Completed |  1min  6.3s

Processing file 5:
[########################################] | 100% Completed |  1min  4.4s

Processing file 6:
[########################################] | 100% Completed |  1min 11.3s

Processing file 7:
[########################################] | 100% Completed |  1min  3.9s

Processing file 8:
[########################################] | 100% Completed |  1min 10.8s

Processing file 9:
[########################################] | 100% Completed |  1min  6.1s

Processing file 10:
[########################################] | 100% Completed |  1min  2.6s

Processing file 11:
[######################################

## Features and target selection

In [3]:
# Leg of the exoskeleton whose signals are used as features
H3_LEG = 'L'  # L|R

# One feature per (joint, magnitude) pair of the selected exoskeleton leg ...
exo_features = [
    H3_LEG + joint + magnitude
    for joint in ['Hip', 'Knee', 'Ankle']
    for magnitude in ['Pos', 'Vel', 'Acc', 'Torque']
]
# ... followed by the filtered knee signals
leg_features = [
    'LegKnee{}Filtered'.format(magnitude)
    for magnitude in ['Position', 'Velocity', 'Torque']
]
features = exo_features + leg_features

# Three force components (x, y, z) for each force cell, numbered from 1
targets = [
    'F' + str(cell) + component
    for cell in range(1, N_CELLS + 1)
    for component in ['x', 'y', 'z']
]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 15
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque', 'LegKneePositionFiltered', 'LegKneeVelocityFiltered', 'LegKneeTorqueFiltered']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [4]:
# Per-experiment arrays, keyed by experiment id (1..N_EXPERIMENTS)
targets_dict, features_dict = {}, {}
for i in range(1, N_EXPERIMENTS + 1):
    # Directory holding the pre-processed files of experiment i
    data_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i))

    # Targets: processed force-cell data
    targets_df = pd.read_csv(data_dir + '/force_cells_processed.csv')

    # Features: exoskeleton and leg columns placed side by side
    exo_df = pd.read_csv(data_dir + '/H3_processed.csv')
    leg_df = pd.read_csv(data_dir + '/leg_processed.csv')
    # The rename handles some typos in the column names
    features_df = pd.concat([exo_df, leg_df], axis=1).rename(
        columns={'LankleTorque': 'LAnkleTorque', 'RankleTorque': 'RAnkleTorque'}
    )

    # Keep only the rows in which every feature column is non-null,
    # and drop the very same rows from the targets
    idx = features_df.notna().all(axis=1)
    n_dropped = int((~idx).sum())
    features_df = features_df.loc[idx]
    targets_df = targets_df.loc[idx]
    print('Droping {} data points by null features'.format(n_dropped))

    # Store the final arrays, restricted to the selected columns
    targets_dict[i] = targets_df[targets].values
    features_dict[i] = features_df[features].values

    print('Experiment {} -> X: {}, Y: {} \n'.format(i, features_dict[i].shape, targets_dict[i].shape))

Droping 2 data points by null features
Experiment 1 -> X: (17952, 15), Y: (17952, 24) 

Droping 2 data points by null features
Experiment 2 -> X: (17980, 15), Y: (17980, 24) 

Droping 2 data points by null features
Experiment 3 -> X: (17678, 15), Y: (17678, 24) 

Droping 2 data points by null features
Experiment 4 -> X: (17957, 15), Y: (17957, 24) 

Droping 2 data points by null features
Experiment 5 -> X: (17747, 15), Y: (17747, 24) 

Droping 2 data points by null features
Experiment 6 -> X: (18025, 15), Y: (18025, 24) 

Droping 2 data points by null features
Experiment 7 -> X: (17768, 15), Y: (17768, 24) 

Droping 2 data points by null features
Experiment 8 -> X: (18042, 15), Y: (18042, 24) 

Droping 2 data points by null features
Experiment 9 -> X: (18036, 15), Y: (18036, 24) 

Droping 2 data points by null features
Experiment 10 -> X: (17903, 15), Y: (17903, 24) 

Droping 2 data points by null features
Experiment 11 -> X: (17803, 15), Y: (17803, 24) 

Droping 2 data points by null 

## Split and normalization

In [5]:
# Shuffle the experiment ids; whole experiments are assigned to either train or test
experiments = list(range(1, N_EXPERIMENTS + 1))
random.shuffle(experiments)

# The first int(N_EXPERIMENTS * TEST_SIZE) shuffled ids form the test set,
# the remaining ones the train set
n_test_experiments = int(N_EXPERIMENTS * TEST_SIZE)
test_experiments = experiments[:n_test_experiments]
train_experiments = experiments[n_test_experiments:]

print('Train experiments ids ({}): {}'.format(len(train_experiments), train_experiments))
print('Test experiments ids ({}): {}'.format(len(test_experiments), test_experiments))

# Check that no test experiment is in train
assert set(train_experiments).isdisjoint(test_experiments)

Train experiments ids (11): [12, 3, 4, 8, 9, 5, 1, 15, 13, 7, 14]
Test experiments ids (4): [2, 11, 10, 6]


In [6]:
def stack_experiments(arrays_by_experiment, experiment_ids):
    '''
    Stack the arrays of the given experiments along the first axis, in the given order.
    '''
    selected = [arrays_by_experiment[exp_id] for exp_id in experiment_ids]
    return np.concatenate(selected, axis=0)


# Train and test sets are the row-wise concatenation of their experiments
X_train = stack_experiments(features_dict, train_experiments)
Y_train = stack_experiments(targets_dict, train_experiments)
X_test = stack_experiments(features_dict, test_experiments)
Y_test = stack_experiments(targets_dict, test_experiments)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (196835, 15), Y: (196835, 24)
Test -> X: (71711, 15), Y: (71711, 24)


In [7]:
# Fit the scaler on the training features only, then apply it to both sets
scaler = StandardScaler()
scaler.fit(X_train)

X_train_norm, X_test_norm = (scaler.transform(X) for X in (X_train, X_test))

# Summary statistics over all the values of each normalized set
print('Train -> min: {}, max: {}, mean: {}, std: {}'.format(
    np.min(X_train_norm), np.max(X_train_norm), np.mean(X_train_norm), np.std(X_train_norm)))
print('Test -> min: {}, max: {}, mean: {}, std: {}'.format(
    np.min(X_test_norm), np.max(X_test_norm), np.mean(X_test_norm), np.std(X_test_norm)))

Train -> min: -4.7494235121468265, max: 9.309577254501134, mean: -9.857268086378168e-18, std: 1.0000000000000002
Test -> min: -4.681015530353451, max: 4.014120323189233, mean: -0.03396178956156562, std: 1.0124789966278167


## Save data

In [8]:
# Directory shared by all the experiments of DATE_EXPERIMENTS
save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# File-name prefix -> array to store; features are saved normalized, targets as they are
arrays_to_save = (
    ('/X_train_', X_train_norm),
    ('/X_test_', X_test_norm),
    ('/Y_train_', Y_train),
    ('/Y_test_', Y_test),
)
for prefix, array in arrays_to_save:
    np.save(save_dir + prefix + RESULTS_ID + '.npy', array)