In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import random
import math
import json
import os
import gc

## Configuration

In [2]:
dask.config.set(num_workers=8, scheduler='processes')
random.seed(0)

# Path where the data is stored
SOURCE_PATH = '../../../data'
# Directory inside SOURCE_PATH where the original data is stored
ORIGINAL_DATA_DIR = '/EXOSAFE'
# Directory inside SOURCE_PATH where the derived data is stored
DERIVED_DATA_DIR = '/derived_data'

# Number of force cells in the robotic leg
N_CELLS = 8

# Experiment params
DATE_EXPERIMENTS = '24022021'
N_EXPERIMENTS = 15

# Path where the results are stored
RESULTS_PATH = '../../../results'
# ID of the training and test data resulting from this notebook, stored in RESULTS_PATH
DATA_ID = '0003_11042021'

# Number of folds for cross-validation
CV = 5
# % of the data for the train, validation and test sets
SPLIT_PCT = (0.6, 0.2, 0.2) # (train, validation, test)

## Preprocessing

In [3]:
# Rotate force vectors of each force cell to align them
rotations = {
    1: [180, 90, 0],
    2: [180, 90, 0],
    3: [180, 0, -90],
    4: [0, 0, -90],
    5: [0, 0, 0],
    6: [0, 180, 0],
    7: [0, 90, 0],
    8: [0, 0, 90],
}

def rotate_vector(v, axis, angle):
    '''
    Args:
    - v (np.array): Vector to be rotated
    - axis (int): Axis along the rotation is performed
    - angle (int): Rotation angle
    
    Returns:
    - (np.array)): Rotated vector
    '''
    if axis == 0:
        # X
        v = v.dot(np.array([[1, 0, 0], [0, np.cos(np.radians(angle)), np.sin(np.radians(angle))], [0, np.sin(np.radians(angle)), np.cos(np.radians(angle))]]))
    elif axis == 1:
        # Y
        v = v.dot(np.array([[np.cos(np.radians(angle)), 0, np.sin(np.radians(angle))], [0, 1, 0], [-np.sin(np.radians(angle)), 0, np.cos(np.radians(angle))]]))
    elif axis == 2:
        # Z
        v = v.dot(np.array([[np.cos(np.radians(angle)), -np.sin(np.radians(angle)), 0], [np.sin(np.radians(angle)), np.cos(np.radians(angle)), 0], [0, 0, 1]]))
    else:
        raise ValueError('Invalid axis')

    return v

@dask.delayed
def rotate_row(row):
    '''
    Rotate the force vectors in a row. Dask function.
    '''
    for i in range(1, N_CELLS + 1):
        cols = ['F{}x'.format(str(i)), 'F{}y'.format(str(i)), 'F{}z'.format(str(i))]
        for ax in range(3):
            row[cols] = rotate_vector(row[cols], ax, rotations[i][ax])
            
    return row

def process_parameters_sheet(params_df):
    '''
    Process the data in the given pd.DataFrame from the raw excel sheet. 
    
    Args:
    - params_df (pd.DataFrame): DataFrame of the parameters excel sheet.
    
    Returns:
    - params_dict (dict): Dictionary with all the parameter in the input DataFrame.
    '''
    params_dict = {}
    params_dict['ExoHipMissalign'] = params_df.iloc[2, 1]
    params_dict['ExoKneeMissalign'] = params_df.iloc[2, 2]
    params_dict['MarchVelocity'] = params_df.iloc[0, 11]
    params_dict['TimeShift'] = params_df.iloc[0, 12]
    params_dict['SkinConfig'] = params_df.iloc[0, 13]
    
    return params_dict

def shift_leg_data(df, time_shift, total_len, data_res=0.01):
    '''
    Shift the data from the leg replica using the known time_shift from the experiment
    parameters to match the exoskeleton data in time and lenght.
    
    Args:
    - df (pd.DataFrame): DataFrame with the data of the leg replica
    - time_shift (float): Shifting time to applied to the data.
    - total_len (int): Total desired lenght for the data.
    - data_res (float): Data resolution (in seconds).
    
    Returns:
    - (pd.DataFrame): DataFrame with the data of the leg replica shifted.
    '''
    idx_start = math.ceil(time_shift / data_res)
    idx_end = total_len + idx_start
    return df.iloc[idx_start:idx_end].reset_index(drop=True)

In [4]:
for i in range(N_EXPERIMENTS):
    print('Processing file {}:'.format(i + 1))
    # Create the directory to save the resulting data
    save_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i + 1))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # Select only the relevant excel sheets 
    sheets = ['Parameters', 'RawForces', 'ForceCells', 'H3raw', 'H3processed', 'Leg-Replica']
    # Load the data
    data_df = pd.read_excel(SOURCE_PATH + ORIGINAL_DATA_DIR + '/' + DATE_EXPERIMENTS + '/0{}-'.format(i + 1) + DATE_EXPERIMENTS + '.xlsx', sheet_name=sheets)

    # Pre-process the data
    data_df[sheets[0]] = process_parameters_sheet(data_df[sheets[0]])
    
    # Apply the rotation matrix to each force vector
    forces_ddf = dd.from_pandas(data_df[sheets[2]], npartitions=int(len(data_df[sheets[2]]) / 100))
    forces_ddf = forces_ddf.apply(rotate_row, axis=1, meta=forces_ddf)
    with ProgressBar():
        data_df[sheets[2]] = forces_ddf.compute()
        
        
    leg_df_raw = data_df[sheets[5]].iloc[:, :3]
    # Correct the time shift between the data from the leg and the data from the exo
    leg_df_processed = shift_leg_data(data_df[sheets[5]].iloc[:, 3:], data_df[sheets[0]]['TimeShift'], len(data_df[sheets[4]]))
    
    assert(len(leg_df_processed) == len(data_df[sheets[4]]) == len(data_df[sheets[2]]))    

    json.dump(data_df[sheets[0]], open(save_dir + '/parameters.json', 'w'))
    data_df[sheets[1]].to_csv(save_dir + '/force_cells_raw.csv', index=False)
    data_df[sheets[2]].to_csv(save_dir + '/force_cells_processed.csv', index=False)
    data_df[sheets[3]].to_csv(save_dir + '/H3_raw.csv', index=False)
    data_df[sheets[4]].to_csv(save_dir + '/H3_processed.csv', index=False)
    leg_df_raw.to_csv(save_dir + '/leg_raw.csv', index=False)
    leg_df_processed.to_csv(save_dir + '/leg_processed.csv', index=False)
    
    print('')


Processing file 1:
[########################################] | 100% Completed |  1min  6.0s

Processing file 2:
[########################################] | 100% Completed |  1min  4.3s

Processing file 3:
[########################################] | 100% Completed |  1min  3.8s

Processing file 4:
[########################################] | 100% Completed |  1min  6.3s

Processing file 5:
[########################################] | 100% Completed |  1min  4.4s

Processing file 6:
[########################################] | 100% Completed |  1min 11.3s

Processing file 7:
[########################################] | 100% Completed |  1min  3.9s

Processing file 8:
[########################################] | 100% Completed |  1min 10.8s

Processing file 9:
[########################################] | 100% Completed |  1min  6.1s

Processing file 10:
[########################################] | 100% Completed |  1min  2.6s

Processing file 11:
[######################################

## Features and target selection

In [3]:
H3_LEG = 'L' # L|R

features = [H3_LEG + a + m for a in ['Hip', 'Knee', 'Ankle'] for m in ['Pos', 'Vel', 'Acc', 'Torque']] + ['LegKnee{}Filtered'.format(m) for m in ['Position', 'Velocity', 'Torque']]
targets = ['F' + str(i + 1) + ax for i in range(N_CELLS) for ax in ['x', 'y', 'z']]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 15
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque', 'LegKneePositionFiltered', 'LegKneeVelocityFiltered', 'LegKneeTorqueFiltered']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [4]:
targets_dict = {}
features_dict = {}
for i in range(1, N_EXPERIMENTS + 1):
    # Define the path to load the data
    data_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i))
    
    # Load targets
    targets_df = pd.read_csv(data_dir + '/force_cells_processed.csv')
    
    # Load features
    exo_df = pd.read_csv(data_dir + '/H3_processed.csv')
    leg_df = pd.read_csv(data_dir + '/leg_processed.csv')
    features_df = pd.concat([exo_df, leg_df], axis=1)
    
    idx_aux = targets_df.duplicated(keep='first')
    targets_df = targets_df.loc[~idx_aux]
    features_df = features_df.loc[~idx_aux]
    print('Droping {} duplicated data points'.format(len(idx_aux[idx_aux == False])))
    
    # Rename columns to manage with some typos
    features_df = features_df.rename(columns={'LankleTorque': 'LAnkleTorque', 'RankleTorque': 'RAnkleTorque'})

    # Drop null values
    idx = features_df.notna().all(axis=1)
    features_df = features_df.loc[idx]
    targets_df = targets_df.loc[idx]
    print('Droping {} data points by null features'.format(len(idx[idx == False])))

    # Store the final array
    targets_dict[i] = targets_df[targets].values
    features_dict[i] = features_df[features].values
    
    print('Experiment {} -> X: {}, Y: {} \n'.format(i, features_dict[i].shape, targets_dict[i].shape))

Droping 8722 duplicated data points
Droping 1 data points by null features
Experiment 1 -> X: (8721, 15), Y: (8721, 24) 

Droping 8736 duplicated data points
Droping 1 data points by null features
Experiment 2 -> X: (8735, 15), Y: (8735, 24) 

Droping 8589 duplicated data points
Droping 1 data points by null features
Experiment 3 -> X: (8588, 15), Y: (8588, 24) 

Droping 8726 duplicated data points
Droping 1 data points by null features
Experiment 4 -> X: (8725, 15), Y: (8725, 24) 

Droping 8624 duplicated data points
Droping 1 data points by null features
Experiment 5 -> X: (8623, 15), Y: (8623, 24) 

Droping 8760 duplicated data points
Droping 1 data points by null features
Experiment 6 -> X: (8759, 15), Y: (8759, 24) 

Droping 8639 duplicated data points
Droping 1 data points by null features
Experiment 7 -> X: (8638, 15), Y: (8638, 24) 

Droping 8773 duplicated data points
Droping 1 data points by null features
Experiment 8 -> X: (8772, 15), Y: (8772, 24) 

Droping 8769 duplicated 

## Normalization and split for cross-validation

In [5]:
experiments = list(range(1, N_EXPERIMENTS + 1))
random.shuffle(experiments)

train_experiments = experiments[:int(N_EXPERIMENTS * (SPLIT_PCT[0] + SPLIT_PCT[1]))]
test_experiments = experiments[-int(N_EXPERIMENTS * SPLIT_PCT[2]):]

print('Train experiments ids ({}): {}'.format(len(train_experiments), train_experiments))
print('Test experiments ids ({}): {}'.format(len(test_experiments), test_experiments))

# Check that no test experiment is in train
assert(not any([i in test_experiments for i in train_experiments]))

Train experiments ids (12): [2, 11, 10, 6, 12, 3, 4, 8, 9, 5, 1, 15]
Test experiments ids (3): [13, 7, 14]


In [6]:
X_train = np.concatenate([features_dict[i] for i in train_experiments], axis=0)
Y_train = np.concatenate([targets_dict[i] for i in train_experiments], axis=0)
X_test = np.concatenate([features_dict[i] for i in test_experiments], axis=0)
Y_test = np.concatenate([targets_dict[i] for i in test_experiments], axis=0)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (104561, 15), Y: (104561, 24)
Test -> X: (25985, 15), Y: (25985, 24)


In [7]:
scaler = StandardScaler().fit(X_train)

# Only transform test data because the training data has to be split in CV folds
X_test_norm =  scaler.transform(X_test)

print('Test -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_test_norm, axis=0), np.max(X_test_norm, axis=0), np.mean(X_test_norm, axis=0), np.std(X_test_norm, axis=0)))

Test -> 
 min: [-1.31007256 -1.21947194 -2.39689804 -1.98521388 -0.72154655 -2.87493359
 -4.27516027 -1.81340945 -2.38972514 -2.35581996 -1.61207976 -1.77671285
 -0.79982154 -2.54003847 -2.684084  ], 
 max: [ 1.6556621   2.07734193  2.74539279  3.07156386  2.83887633  2.19956122
  2.22465618  3.90613018  1.65454986  2.09792576  3.56101584 11.30813031
  3.02245127  3.71964724  0.77958802], 
 mean: [ 2.16157527e-03  1.70658398e-03 -8.90862413e-04  3.66093570e-03
 -1.32857969e-02 -1.00024901e-03  3.47354476e-04  8.04899012e-02
  5.92649776e-04 -1.47336199e-03 -1.86910567e-04  5.57907982e-01
 -7.28462021e-02  2.18776814e-03 -6.56859997e-01], 
 std: [0.99188675 0.99029596 0.97062917 0.98235935 0.96690402 0.9477156
 0.91826453 1.17648008 0.9993203  1.00255372 1.00280558 1.43140207
 0.96839934 0.93785194 1.07885434]



In [9]:
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

np.save(save_dir + '/X_test_' + DATA_ID + '.npy', X_test_norm)    
np.save(save_dir + '/Y_test_' + DATA_ID + '.npy', Y_test)    

In [10]:
for fold_id in range(CV):
    print('Fold {}'.format(fold_id + 1))
    
    # From the training experiment ids, a different validation set is created for each fold
    random.shuffle(train_experiments)
    train_experiments_fold = train_experiments[:int(N_EXPERIMENTS * SPLIT_PCT[0])]
    valid_experiments_fold = train_experiments[-int(N_EXPERIMENTS * SPLIT_PCT[1]):]

    print('Train experiments ids ({}): {}'.format(len(train_experiments_fold), train_experiments_fold))
    print('Validation experiments ids ({}): {}'.format(len(valid_experiments_fold), valid_experiments_fold))

    # Check that no validation experiments are in train
    assert(not any([i in valid_experiments_fold for i in train_experiments_fold]))
    # Check that no test experiments are in train or validation folds
    assert(not any([i in test_experiments for i in train_experiments_fold]))
    assert(not any([i in test_experiments for i in valid_experiments_fold]))
    
    X_train = np.concatenate([features_dict[i] for i in train_experiments_fold], axis=0)
    Y_train = np.concatenate([targets_dict[i] for i in train_experiments_fold], axis=0)
    X_valid = np.concatenate([features_dict[i] for i in valid_experiments_fold], axis=0)
    Y_valid = np.concatenate([targets_dict[i] for i in valid_experiments_fold], axis=0)

    print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
    print('Validation -> X: {}, Y: {}'.format(X_valid.shape, Y_valid.shape))
    
    # Normalize the data
    scaler = StandardScaler().fit(X_train)

    X_train_norm = scaler.transform(X_train)
    X_valid_norm =  scaler.transform(X_valid)

    print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
    print('Valid -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_valid_norm, axis=0), np.max(X_valid_norm, axis=0), np.mean(X_valid_norm, axis=0), np.std(X_valid_norm, axis=0)))
    
    save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    np.save(save_dir + '/X_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_train_norm)    
    np.save(save_dir + '/X_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_valid_norm)    
    np.save(save_dir + '/Y_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_train)    
    np.save(save_dir + '/Y_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_valid)      
    
    print('\n')

Fold 1
Train experiments ids (9): [3, 2, 6, 8, 4, 15, 5, 11, 1]
Validation experiments ids (3): [12, 10, 9]
Train -> X: (78297, 15), Y: (78297, 24)
Validation -> X: (26264, 15), Y: (26264, 24)
Train -> 
 min: [-1.33046351 -1.26041507 -3.07943234 -2.32243966 -0.72124103 -2.88740256
 -4.3262475  -1.84145692 -2.55391811 -2.36680569 -4.68163954 -2.20758705
 -0.75096241 -2.50721035 -3.28458617], 
 max: [1.66244978 2.11289163 2.8470798  3.18053987 2.82469056 2.17309058
 2.52474468 3.99678025 1.65037305 3.29353628 3.691768   7.21402248
 3.02857536 3.89603932 0.65756598], 
 mean: [-1.08899611e-18  8.07672114e-18 -5.44498054e-18 -2.90398962e-18
 -3.62998703e-18 -4.26523476e-18 -4.26523476e-18  1.08899611e-17
  4.90048249e-18 -1.54274449e-18 -3.62998703e-18  5.80797924e-17
 -1.45199481e-18  1.45199481e-18  2.38127149e-16], 
 std: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Valid -> 
 min: [-1.29641619 -1.20590834 -2.67022555 -2.08122493 -0.72108716 -2.87900821
 -4.25836556 -1.8748658  -2.552