In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.signal import savgol_filter
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import glob
import random
import math
import json
import os
import gc

## Configuration

In [2]:
dask.config.set(num_workers=8, scheduler='processes')
random.seed(0)

# Directory where the original data is stored
ORIGINAL_DATA_DIR = '../../../../EXOSAFE-DATA'
# Directory where the derived data is stored
DERIVED_DATA_DIR = '../../../data'

# Number of force cells in the robotic leg
N_CELLS = 8

## Preprocessing

In [3]:
data_ls = glob.glob(ORIGINAL_DATA_DIR + '/*/*.xlsx')

print('Files found ({}):'.format(len(data_ls)))
print([file.split('/')[-1] for file in data_ls])

Files found (101):
['01-06022021.xlsx', '02-06022021.xlsx', '03-06022021.xlsx', '07-06022021.xlsx', '08-06022021.xlsx', '10-06022021.xlsx', '03-08022021.xlsx', '04-08022021.xlsx', '02-10022021.xlsx', '03-10022021.xlsx', '04-10022021.xlsx', '05-10022021.xlsx', '01-10032021.xlsx', '01-12022021.xlsx', '02-12022021.xlsx', '03-12022021.xlsx', '04-12022021.xlsx', '05-12022021.xlsx', '06-12022021.xlsx', '07-12022021.xlsx', '02-15022021.xlsx', '03-15022021.xlsx', '04-15022021.xlsx', '01-16022021.xlsx', '02-16022021.xlsx', '03-16022021.xlsx', '04-16022021.xlsx', '05-16022021.xlsx', '06-16022021.xlsx', '02-17022021.xlsx', '03-17022021.xlsx', '04-17022021.xlsx', '01-19022021.xlsx', '010-19022021.xlsx', '011-19022021.xlsx', '012-19022021.xlsx', '013-19022021.xlsx', '014-19022021.xlsx', '015-19022021.xlsx', '016-19022021.xlsx', '017-19022021.xlsx', '018-19022021.xlsx', '02-19022021.xlsx', '03-19022021.xlsx', '04-19022021.xlsx', '05-19022021.xlsx', '06-19022021.xlsx', '07-19022021.xlsx', '08-1902202

In [4]:
# Dictionary to exclude specific experiments (date: [experiment ids])
# These experiments are excluded by lack of data
EXPERIMENTS_TO_EXCLUDE = {
    '06022021': ['01', '02', '03', '07', '08', '10'],
    '08022021': ['03', '04'],
    '10022021': ['02', '03', '04', '05'],
    '12022021': ['01', '02', '03', '04', '05', '06', '07'],
    '15022021': ['02', '03', '04'],
    '26032021': ['01', '02', '03', '04', '06', '07', '09', '011', '012'],
    '21042021': ['02', '03', '04', '05', '06', '07', '08']
}

# Exclude some experiments from the list of files to process
for exp_date, exp_ids in EXPERIMENTS_TO_EXCLUDE.items():
    for i in exp_ids:
        data_ls.remove(ORIGINAL_DATA_DIR + '/{}/{}-{}.xlsx'.format(exp_date, i, exp_date))
        
print('Files to process ({}):'.format(len(data_ls)))
print([file.split('/')[-1] for file in data_ls])

Files to process (63):
['01-10032021.xlsx', '01-16022021.xlsx', '02-16022021.xlsx', '03-16022021.xlsx', '04-16022021.xlsx', '05-16022021.xlsx', '06-16022021.xlsx', '02-17022021.xlsx', '03-17022021.xlsx', '04-17022021.xlsx', '01-19022021.xlsx', '010-19022021.xlsx', '011-19022021.xlsx', '012-19022021.xlsx', '013-19022021.xlsx', '014-19022021.xlsx', '015-19022021.xlsx', '016-19022021.xlsx', '017-19022021.xlsx', '018-19022021.xlsx', '02-19022021.xlsx', '03-19022021.xlsx', '04-19022021.xlsx', '05-19022021.xlsx', '06-19022021.xlsx', '07-19022021.xlsx', '08-19022021.xlsx', '09-19022021.xlsx', '01-22022021.xlsx', '010-22022021.xlsx', '011-22022021.xlsx', '012-22022021.xlsx', '013-22022021.xlsx', '014-22022021.xlsx', '015-22022021.xlsx', '016-22022021.xlsx', '017-22022021.xlsx', '018-22022021.xlsx', '019-22022021.xlsx', '02-22022021.xlsx', '03-22022021.xlsx', '04-22022021.xlsx', '05-22022021.xlsx', '06-22022021.xlsx', '07-22022021.xlsx', '08-22022021.xlsx', '09-22022021.xlsx', '01-24022021.xlsx

In [5]:
# Rotate force vectors of each force cell to align them
rotations = {
    1: [180, 90, 0],
    2: [180, 90, 0],
    3: [180, 0, -90],
    4: [0, 0, -90],
    5: [0, 0, 0],
    6: [0, 180, 0],
    7: [0, 90, 0],
    8: [0, 0, 90],
}

def rotate_vector(v, axis, angle):
    '''
    Args:
    - v (np.array): Vector to be rotated
    - axis (int): Axis along the rotation is performed
    - angle (int): Rotation angle
    
    Returns:
    - (np.array)): Rotated vector
    '''
    if axis == 0:
        # X
        v = v.dot(np.array([[1, 0, 0], [0, np.cos(np.radians(angle)), np.sin(np.radians(angle))], [0, np.sin(np.radians(angle)), np.cos(np.radians(angle))]]))
    elif axis == 1:
        # Y
        v = v.dot(np.array([[np.cos(np.radians(angle)), 0, np.sin(np.radians(angle))], [0, 1, 0], [-np.sin(np.radians(angle)), 0, np.cos(np.radians(angle))]]))
    elif axis == 2:
        # Z
        v = v.dot(np.array([[np.cos(np.radians(angle)), -np.sin(np.radians(angle)), 0], [np.sin(np.radians(angle)), np.cos(np.radians(angle)), 0], [0, 0, 1]]))
    else:
        raise ValueError('Invalid axis')

    return v

@dask.delayed
def rotate_row(row):
    '''
    Rotate the force vectors in a row. Dask function.
    '''
    for i in range(1, N_CELLS + 1):
        cols = ['F{}x'.format(str(i)), 'F{}y'.format(str(i)), 'F{}z'.format(str(i))]
        for ax in range(3):
            row[cols] = rotate_vector(row[cols], ax, rotations[i][ax])
            
    return row

def shift_leg_data(df, time_shift, total_len, data_res=0.01):
    '''
    Shift the data from the leg replica using the known time_shift from the experiment
    parameters to match the exoskeleton data in time and lenght.
    
    Args:
    - df (pd.DataFrame): DataFrame with the data of the leg replica
    - time_shift (float): Shifting time to applied to the data.
    - total_len (int): Total desired lenght for the data.
    - data_res (float): Data resolution (in seconds).
    
    Returns:
    - (pd.DataFrame): DataFrame with the data of the leg replica shifted.
    '''
    idx_start = math.ceil(time_shift / data_res)
    idx_end = total_len + idx_start
    return df.iloc[idx_start:idx_end].reset_index(drop=True)

def compute_time_shift(exo_knee_pos, leg_knee_pos, data_res=0.01, window_size=51, pol_order=3, threshold=30, max_time_shift=10):
    '''
    Find the time shift between the exo knee position and the robotic leg knee position based on the local maximas
    
    Args:
    - exo_knee_pos (np.array): Exoeskeleton knee position signal.
    - leg_knee_pos (np.array): Robotic leg knee position signal.
    - data_res (float) [default = 0.01]: Time resolution of the data (0.01 = 100Hz)
    - window_size (int) [default = 51]: Window size to smooth the signals with savgol filter
    - pol_order (int) [default = 3]: Polinomial degree to smooth the signals with savgol filter
    - threshold (int) [default = 30]: Threshold to filter the 0 cross of the derivates of the signals
    - max_time_shift (int) [default = 10]: Max value of seconds to generate the cost function and find the minumum
    
    Return:
    - time_shift (float): Time shift between the exoeskeleton and robotic leg signals
    '''
    # Smooth the exo and leg position signal
    exo_arr_smooth = savgol_filter(exo_knee_pos, window_size, pol_order) # window size 51, polynomial order 3
    leg_arr_smooth = savgol_filter(leg_knee_pos, window_size, pol_order) 
    
    # Compute derivative of the signal to find de local maxima
    exo_dev = np.gradient(exo_arr_smooth, data_res)
    exo_dev_smooth = savgol_filter(exo_dev, window_size, pol_order) 
    leg_dev = np.gradient(leg_arr_smooth, data_res)
    leg_dev_smooth = savgol_filter(leg_dev, window_size, pol_order)
    
    # Find the x coordinate of the maximum points
    exo_idx_max = []
    searching = False
    for i in range(1, exo_dev_smooth.shape[0]):
        # Search for a point where the gradient is decreasing before cross 0
        if exo_dev_smooth[i - 1] > threshold and exo_dev_smooth[i] < threshold:
            searching = True
        # Only if the gradient is decreasing from a point higher than threshold,
        # then search for a 0 crossing
        if searching and exo_dev_smooth[i - 1] > 0 and exo_dev_smooth[i] < 0:
            searching = False
            exo_idx_max.append(i)

    leg_idx_max = []
    searching = False
    for i in range(1, leg_dev_smooth.shape[0]):
        # Search for a point where the gradient is decreasing before cross 0
        if leg_dev_smooth[i - 1] > threshold and leg_dev_smooth[i] < threshold:
            searching = True
        # Only if the gradient is decreasing from a point higher than threshold,
        # then search for a 0 crossing
        if searching and leg_dev_smooth[i - 1] > 0 and leg_dev_smooth[i] < 0:
            searching = False
            leg_idx_max.append(i)

    # Compute the location of the max points in time scale
    exo_maxs = np.array(np.arange(0, len(exo_arr) * data_res, data_res)[exo_idx_max])
    leg_maxs = np.array(np.arange(0, len(leg_arr) * data_res, data_res)[leg_idx_max])

    min_aux = np.min([len(exo_maxs), len(leg_maxs)])

    exo_maxs = exo_maxs[:min_aux]
    leg_maxs = leg_maxs[:min_aux]
    
    time_shift_ls = np.arange(0, max_time_shift, data_res)

    MAE_ls = []
    for ts in time_shift_ls:
        MAE_ls.append(mean_absolute_error(exo_maxs, leg_maxs - ts))

    time_shift = time_shift_ls[np.argmin(MAE_ls)]
    
    return time_shift

In [6]:
for file in tqdm(data_ls):
    # print('Processing file {}'.format(file))
    exp_aux_ls = file.split('/')[-1].split('-')
    exp_date = exp_aux_ls[-1][:8]
    exp_id = int(exp_aux_ls[0])

    # Create the directory to save the resulting data
    save_dir = os.path.join(DERIVED_DATA_DIR, exp_date, str(exp_id))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    # Load the data
    data_dict = pd.read_excel(file, sheet_name=None)
    
    # Extract the time shift between the exo and the robotic leg for the experiment
    exo_arr = data_dict['H3processed'].iloc[:, 1].values
    leg_arr = data_dict['Leg-Replica'].iloc[:, 3].values
    
    time_shift = compute_time_shift(exo_arr, leg_arr)
    assert(not np.isnan(time_shift))
    
    # Fix typos
    data_dict['H3processed'].columns = (['L{}Pos'.format(a) for a in ['Hip', 'Knee', 'Ankle']] + list(data_dict['H3processed'].columns))[:-3]
    data_dict['H3processed'] = data_dict['H3processed'].rename(columns={'LankleTorque': 'LAnkleTorque', 'RankleTorque': 'RAnkleTorque'})
    
    # Extract the necessary data from the xlsx
    force_cols = ['F{}{}'.format(i + 1, ax) for i in range(N_CELLS) for ax in ['x', 'y', 'z']]
    leg_cols = ['LegKnee{}Filtered'.format(m) for m in ['Position', 'Velocity', 'Torque']]
    h3_cols = ['L{}{}'.format(a, m) for a in ['Hip', 'Knee', 'Ankle'] for m in ['Pos', 'Vel', 'Acc', 'Torque']]
    forces_df = data_dict['ForceCells'][force_cols]
    leg_df = data_dict['Leg-Replica'][leg_cols]
    h3_df = data_dict['H3processed'][h3_cols]
    
    # Apply the rotation matrix to each force vector
    forces_ddf = dd.from_pandas(forces_df, npartitions=int(len(forces_df) / 100))
    forces_ddf = forces_ddf.apply(rotate_row, axis=1, meta=forces_ddf)
    forces_df = forces_ddf.compute()
    # with ProgressBar():
    #     forces_df = forces_ddf.compute()

    # Correct the time shift between the data from the leg and the data from the exo
    forces_df = shift_leg_data(forces_df, time_shift, len(h3_df))
    leg_df = shift_leg_data(leg_df, time_shift, len(h3_df))
    
    # Cut data of different sources to get the same lenght
    max_available_data = min([len(forces_df), len(h3_df), len(leg_df)])
    forces_df = forces_df.iloc[:max_available_data]
    leg_df = leg_df.iloc[:max_available_data]
    h3_df = h3_df.iloc[:max_available_data]
    assert(len(forces_df) == len(h3_df) == len(leg_df))
    # print('Total data points: {}'.format(len(forces_df)))

    forces_df.to_csv(save_dir + '/force_cells_processed.csv', index=False)
    h3_df.to_csv(save_dir + '/H3_processed.csv', index=False)
    leg_df.to_csv(save_dir + '/leg_processed.csv', index=False)
    
    # print('\n')


100%|██████████| 63/63 [2:31:02<00:00, 143.85s/it]  
