In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import glob
import random
import math
import json
import os
import gc

## Configuration

In [2]:
# Dask settings: 'processes' scheduler with 8 workers
dask.config.set(num_workers=8, scheduler='processes')
# Seed Python's global random generator (used later to shuffle the experiment ids)
random.seed(0)

# Root directory holding the derived data
DERIVED_DATA_DIR = '../../../data'

# Robotic leg: number of force cells
N_CELLS = 8

# Root directory for the results
RESULTS_PATH = '../../../results'
# Identifier of the training/test data produced by this notebook (saved under RESULTS_PATH)
DATA_ID = '0005_19042021'

# Normalizer applied to the features; StandardScaler() is the alternative
SCALER = MinMaxScaler()

# How many experiments exist in total
N_EXPERIMENTS = 63
# How many cross-validation folds to build
CV = 6
# How many experiments go to the training set (int)
TRAIN_SIZE = 54
# How many experiments go to the test set (int)
TEST_SIZE = 9

# Train and test must cover every experiment, and the training experiments
# must divide evenly into the CV folds
assert TRAIN_SIZE + TEST_SIZE == N_EXPERIMENTS
assert TRAIN_SIZE % CV == 0

## Features and target selection

In [3]:
H3_LEG = 'L'  # one of: L | R

# Column names built as <leg side><joint><magnitude>
joint_features = [
    H3_LEG + joint + magnitude
    for joint in ('Hip', 'Knee', 'Ankle')
    for magnitude in ('Pos', 'Vel', 'Acc', 'Torque')
]
# Column names built as LegKnee<magnitude>Filtered
knee_filtered_features = [
    'LegKnee{}Filtered'.format(magnitude)
    for magnitude in ('Position', 'Velocity', 'Torque')
]
features = joint_features + knee_filtered_features

# One column per force cell (numbered from 1) and axis: F1x, F1y, F1z, F2x, ...
targets = [
    'F' + str(cell_no) + axis
    for cell_no in range(1, N_CELLS + 1)
    for axis in ('x', 'y', 'z')
]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 15
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque', 'LegKneePositionFiltered', 'LegKneeVelocityFiltered', 'LegKneeTorqueFiltered']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [4]:
# glob returns its matches in an arbitrary, filesystem-dependent order. The position
# of each path in this list becomes the experiment id that is later shuffled into the
# train/test split, so the listing is sorted to keep the ids stable across runs and machines.
experiments_dirs_path = sorted(glob.glob(DERIVED_DATA_DIR + '/*/*'))

assert len(experiments_dirs_path) == N_EXPERIMENTS, \
    'Expected {} experiments, found {}'.format(N_EXPERIMENTS, len(experiments_dirs_path))

In [5]:
# Per-experiment arrays, keyed by the position of the experiment in experiments_dirs_path
targets_dict = {}
features_dict = {}
for i, exp_path in enumerate(experiments_dirs_path):
    # Split the path with os.path so the labels are right whatever separator glob returned
    exp_parent, exp_name = os.path.split(os.path.normpath(exp_path))
    print('{} - Experiment {} from {}'.format(i, exp_name, os.path.basename(exp_parent)))

    # Load targets
    targets_df = pd.read_csv(os.path.join(exp_path, 'force_cells_processed.csv'))

    # Load features: both files are placed side by side, aligned on the row index
    exo_df = pd.read_csv(os.path.join(exp_path, 'H3_processed.csv'))
    leg_df = pd.read_csv(os.path.join(exp_path, 'leg_processed.csv'))
    features_df = pd.concat([exo_df, leg_df], axis=1)

    # NOTE: duplicated target rows are kept (no de-duplication is applied).

    # Drop first row to remove noise in the start of the data recording
    targets_df = targets_df.iloc[1:]
    features_df = features_df.iloc[1:]
    # Drop the rows where any feature column is null; the same boolean mask is
    # applied to the targets so both frames stay aligned row by row
    idx = features_df.notna().all(axis=1)
    features_df = features_df.loc[idx]
    targets_df = targets_df.loc[idx]
    print('Droping {} data points by null features'.format((~idx).sum()))

    assert len(features_df) == len(targets_df)
    # Store the final arrays, restricted to the selected columns
    targets_dict[i] = targets_df[targets].values
    features_dict[i] = features_df[features].values

    print('Experiment {} -> X: {}, Y: {} \n'.format(i, features_dict[i].shape, targets_dict[i].shape))

0 - Experiment 1 from 10032021
Droping 2 data points by null features
Experiment 0 -> X: (6072, 15), Y: (6072, 24) 

1 - Experiment 1 from 16022021
Droping 0 data points by null features
Experiment 1 -> X: (17907, 15), Y: (17907, 24) 

2 - Experiment 2 from 16022021
Droping 0 data points by null features
Experiment 2 -> X: (17875, 15), Y: (17875, 24) 

3 - Experiment 3 from 16022021
Droping 0 data points by null features
Experiment 3 -> X: (17914, 15), Y: (17914, 24) 

4 - Experiment 4 from 16022021
Droping 0 data points by null features
Experiment 4 -> X: (17960, 15), Y: (17960, 24) 

5 - Experiment 5 from 16022021
Droping 0 data points by null features
Experiment 5 -> X: (17898, 15), Y: (17898, 24) 

6 - Experiment 6 from 16022021
Droping 0 data points by null features
Experiment 6 -> X: (17841, 15), Y: (17841, 24) 

7 - Experiment 2 from 17022021
Droping 0 data points by null features
Experiment 7 -> X: (17914, 15), Y: (17914, 24) 

8 - Experiment 3 from 17022021
Droping 0 data poin

## Normalization and split for cross-validation

In [6]:
experiments = list(range(N_EXPERIMENTS))
# Shuffle with a dedicated generator rather than the global `random` state, so that
# re-running this cell always yields the same split. The seed matches random.seed(0)
# in the configuration cell, hence a fresh "Run All" produces the same permutation
# as shuffling with the freshly seeded global generator.
random.Random(0).shuffle(experiments)

# The first TRAIN_SIZE shuffled ids are used for training, the remaining ones for test
train_experiments = experiments[:TRAIN_SIZE]
test_experiments = experiments[TRAIN_SIZE:]

print('Train experiments ids ({}): {}'.format(len(train_experiments), train_experiments))
print('Test experiments ids ({}): {}'.format(len(test_experiments), test_experiments))

assert len(train_experiments) + len(test_experiments) == N_EXPERIMENTS
# Check that no test experiment is in train
assert set(train_experiments).isdisjoint(test_experiments)

Train experiments ids (54): [53, 41, 3, 60, 33, 58, 27, 5, 7, 44, 49, 28, 23, 29, 46, 12, 57, 0, 61, 1, 43, 40, 14, 15, 17, 62, 20, 36, 10, 47, 11, 35, 52, 21, 4, 42, 51, 9, 38, 34, 59, 39, 6, 45, 18, 8, 55, 13, 37, 22, 30, 19, 50, 25]
Test experiments ids (9): [31, 32, 16, 2, 26, 56, 48, 24, 54]


In [7]:
# Stack the per-experiment arrays row-wise: features first, then targets, for each split
X_train, Y_train = (
    np.concatenate([per_experiment[exp_id] for exp_id in train_experiments], axis=0)
    for per_experiment in (features_dict, targets_dict)
)
X_test, Y_test = (
    np.concatenate([per_experiment[exp_id] for exp_id in test_experiments], axis=0)
    for per_experiment in (features_dict, targets_dict)
)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (943372, 15), Y: (943372, 24)
Test -> X: (161200, 15), Y: (161200, 24)


In [8]:
# Fit the scaler on the training features only, then apply it to both splits
s = SCALER.fit(X_train)
X_train_norm, X_test_norm = (s.transform(split) for split in (X_train, X_test))

# Column-wise statistics reported for each normalized split, in the order they are printed
summary_stats = (np.min, np.max, np.mean, np.std)

print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(
    *(stat(X_train_norm, axis=0) for stat in summary_stats)))
print('Test -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(
    *(stat(X_test_norm, axis=0) for stat in summary_stats)))

Train -> 
 min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.], 
 mean: [0.44316332 0.51939702 0.50213284 0.4341789  0.2024983  0.63145772
 0.61899083 0.51941409 0.60775526 0.56178418 0.56970183 0.77066847
 0.26046828 0.42886819 0.67576674], 
 std: [0.33186054 0.11283305 0.1592397  0.16795629 0.279992   0.07667341
 0.11699678 0.1092075  0.23715171 0.06415873 0.09721195 0.06513208
 0.24029363 0.0676148  0.24251379]

Test -> 
 min: [1.66156419e-03 1.18331027e-01 1.47064973e-02 1.42088729e-02
 7.84836338e-05 4.08905180e-03 2.55045132e-02 7.77319844e-02
 4.41265436e-02 3.63196507e-01 3.62820351e-01 1.00349322e-02
 1.24428368e-02 2.54399174e-01 3.46046431e-02], 
 max: [0.99830005 1.0088766  0.98112243 0.9863905  0.99926046 0.99539474
 0.95661875 1.01847498 0.99940925 0.97938504 0.99811034 0.98918614
 0.99778234 0.69072286 0.94985511], 
 mean: [0.44285283 0.51929124 0.5011479  0.44217772 0.2034032  0.63144101
 0.61860992 0.52355774 0.607

In [9]:
# Output directory: <RESULTS_PATH>/<DATA_ID>/data
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(save_dir, exist_ok=True)

# Features are saved normalized; targets are saved as loaded
np.save(os.path.join(save_dir, 'X_train_' + DATA_ID + '.npy'), X_train_norm)
np.save(os.path.join(save_dir, 'X_test_' + DATA_ID + '.npy'), X_test_norm)
np.save(os.path.join(save_dir, 'Y_train_' + DATA_ID + '.npy'), Y_train)
np.save(os.path.join(save_dir, 'Y_test_' + DATA_ID + '.npy'), Y_test)

In [10]:
# Split the experiments of the training sets in different folds
exp_per_fold = len(train_experiments) // CV
cv_folds = [train_experiments[x:x+exp_per_fold] for x in range(0, len(train_experiments), exp_per_fold)]
print('CV folds ({}): {}\n'.format(len(cv_folds), cv_folds))

for fold_id in range(CV):
    print('Fold {}'.format(fold_id + 1))
    
    cv_folds_cp = cv_folds.copy()
    valid_experiments_fold = cv_folds_cp.pop(fold_id)
    train_experiments_fold = [item for sublist in cv_folds_cp for item in sublist]

    print('Train experiments ids ({}): {}'.format(len(train_experiments_fold), train_experiments_fold))
    print('Validation experiments ids ({}): {}'.format(len(valid_experiments_fold), valid_experiments_fold))

    assert(len(train_experiments_fold) + len(valid_experiments_fold) == len(train_experiments))
    # Check that no validation experiments are in train
    assert(not any([i in valid_experiments_fold for i in train_experiments_fold]))
    # Check that no test experiments are in train or validation folds
    assert(not any([i in test_experiments for i in train_experiments_fold]))
    assert(not any([i in test_experiments for i in valid_experiments_fold]))
    
    X_train = np.concatenate([features_dict[i] for i in train_experiments_fold], axis=0)
    Y_train = np.concatenate([targets_dict[i] for i in train_experiments_fold], axis=0)
    X_valid = np.concatenate([features_dict[i] for i in valid_experiments_fold], axis=0)
    Y_valid = np.concatenate([targets_dict[i] for i in valid_experiments_fold], axis=0)

    print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
    print('Validation -> X: {}, Y: {}'.format(X_valid.shape, Y_valid.shape))
    
    # Normalize the data
    s = SCALER.fit(X_train)

    X_train_norm = s.transform(X_train)
    X_valid_norm =  s.transform(X_valid)

    print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
    print('Valid -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}'.format(np.min(X_valid_norm, axis=0), np.max(X_valid_norm, axis=0), np.mean(X_valid_norm, axis=0), np.std(X_valid_norm, axis=0)))
    
    save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    np.save(save_dir + '/X_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_train_norm)    
    np.save(save_dir + '/X_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_valid_norm)    
    np.save(save_dir + '/Y_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_train)    
    np.save(save_dir + '/Y_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_valid)      
    
    print('\n')

CV folds (6): [[53, 41, 3, 60, 33, 58, 27, 5, 7], [44, 49, 28, 23, 29, 46, 12, 57, 0], [61, 1, 43, 40, 14, 15, 17, 62, 20], [36, 10, 47, 11, 35, 52, 21, 4, 42], [51, 9, 38, 34, 59, 39, 6, 45, 18], [8, 55, 13, 37, 22, 30, 19, 50, 25]]

Fold 1
Train experiments ids (45): [44, 49, 28, 23, 29, 46, 12, 57, 0, 61, 1, 43, 40, 14, 15, 17, 62, 20, 36, 10, 47, 11, 35, 52, 21, 4, 42, 51, 9, 38, 34, 59, 39, 6, 45, 18, 8, 55, 13, 37, 22, 30, 19, 50, 25]
Validation experiments ids (9): [53, 41, 3, 60, 33, 58, 27, 5, 7]
Train -> X: (782395, 15), Y: (782395, 24)
Validation -> X: (160977, 15), Y: (160977, 24)
Train -> 
 min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.], 
 mean: [0.44348832 0.51942255 0.50165771 0.43484463 0.20263287 0.6314615
 0.61879597 0.52085447 0.60775433 0.56177931 0.56951611 0.77066066
 0.26131274 0.4289091  0.67072041], 
 std: [0.33165048 0.11118223 0.1609837  0.16936138 0.28042794 0.07558097
 0.11822377 0.11043009 0.23711