In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import random
import math
import json
import os
import gc

## Configuration

In [2]:
dask.config.set(num_workers=8, scheduler='processes')
random.seed(0)

# Path where the data is stored
SOURCE_PATH = '../../../data'
# Directory inside SOURCE_PATH where the derived data is stored
DERIVED_DATA_DIR = '/derived_data'

# Number of force cells in the robotic leg
N_CELLS = 8

# Experiment params
DATE_EXPERIMENTS = '24022021'
N_EXPERIMENTS = 15

# Path where the results are stored
RESULTS_PATH = '../../../results'
# ID of the training and test data resulting from this notebook, stored in RESULTS_PATH
DATA_ID = '0004_15042021'

# Scaler to normalize the data
SCALER = MinMaxScaler() # StandardScaler()

# Number of folds for cross-validation
CV = 6
# Experiments for training set (int)
TRAIN_SIZE = 12
# Experiments for test set (int)
TEST_SIZE = 3

assert(TRAIN_SIZE + TEST_SIZE == N_EXPERIMENTS)
assert(TRAIN_SIZE % CV == 0)

## Features and target selection

In [3]:
H3_LEG = 'L' # L|R

features = [H3_LEG + a + m for a in ['Hip', 'Knee', 'Ankle'] for m in ['Pos', 'Vel', 'Acc', 'Torque']] + ['LegKnee{}Filtered'.format(m) for m in ['Position', 'Velocity', 'Torque']]
targets = ['F' + str(i + 1) + ax for i in range(N_CELLS) for ax in ['x', 'y', 'z']]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 15
Selected features: ['LHipPos', 'LHipVel', 'LHipAcc', 'LHipTorque', 'LKneePos', 'LKneeVel', 'LKneeAcc', 'LKneeTorque', 'LAnklePos', 'LAnkleVel', 'LAnkleAcc', 'LAnkleTorque', 'LegKneePositionFiltered', 'LegKneeVelocityFiltered', 'LegKneeTorqueFiltered']


Number of targets: 24
Selected targets: ['F1x', 'F1y', 'F1z', 'F2x', 'F2y', 'F2z', 'F3x', 'F3y', 'F3z', 'F4x', 'F4y', 'F4z', 'F5x', 'F5y', 'F5z', 'F6x', 'F6y', 'F6z', 'F7x', 'F7y', 'F7z', 'F8x', 'F8y', 'F8z']


In [4]:
targets_dict = {}
features_dict = {}
for i in range(1, N_EXPERIMENTS + 1):
    # Define the path to load the data
    data_dir = os.path.join(SOURCE_PATH + DERIVED_DATA_DIR, DATE_EXPERIMENTS, str(i))
    
    # Load targets
    targets_df = pd.read_csv(data_dir + '/force_cells_processed.csv')
    
    # Load features
    exo_df = pd.read_csv(data_dir + '/H3_processed.csv')
    leg_df = pd.read_csv(data_dir + '/leg_processed.csv')
    features_df = pd.concat([exo_df, leg_df], axis=1)
    
    idx_aux = targets_df.duplicated(keep='first')
    targets_df = targets_df.loc[~idx_aux]
    features_df = features_df.loc[~idx_aux]
    print('Droping {} duplicated data points'.format(len(idx_aux[idx_aux == False])))
    
    # Rename columns to manage with some typos
    features_df = features_df.rename(columns={'LankleTorque': 'LAnkleTorque', 'RankleTorque': 'RAnkleTorque'})

    # Drop null values
    idx = features_df.notna().all(axis=1)
    features_df = features_df.loc[idx]
    targets_df = targets_df.loc[idx]
    print('Droping {} data points by null features'.format(len(idx[idx == False])))

    # Store the final array
    targets_dict[i] = targets_df[targets].values
    features_dict[i] = features_df[features].values
    
    print('Experiment {} -> X: {}, Y: {} \n'.format(i, features_dict[i].shape, targets_dict[i].shape))

Droping 8722 duplicated data points
Droping 1 data points by null features
Experiment 1 -> X: (8721, 15), Y: (8721, 24) 

Droping 8736 duplicated data points
Droping 1 data points by null features
Experiment 2 -> X: (8735, 15), Y: (8735, 24) 

Droping 8589 duplicated data points
Droping 1 data points by null features
Experiment 3 -> X: (8588, 15), Y: (8588, 24) 

Droping 8726 duplicated data points
Droping 1 data points by null features
Experiment 4 -> X: (8725, 15), Y: (8725, 24) 

Droping 8624 duplicated data points
Droping 1 data points by null features
Experiment 5 -> X: (8623, 15), Y: (8623, 24) 

Droping 8760 duplicated data points
Droping 1 data points by null features
Experiment 6 -> X: (8759, 15), Y: (8759, 24) 

Droping 8639 duplicated data points
Droping 1 data points by null features
Experiment 7 -> X: (8638, 15), Y: (8638, 24) 

Droping 8773 duplicated data points
Droping 1 data points by null features
Experiment 8 -> X: (8772, 15), Y: (8772, 24) 

Droping 8769 duplicated 

## Normalization and split for cross-validation

In [5]:
experiments = list(range(1, N_EXPERIMENTS + 1))
random.shuffle(experiments)

train_experiments = experiments[:TRAIN_SIZE]
test_experiments = experiments[TRAIN_SIZE:]

print('Train experiments ids ({}): {}'.format(len(train_experiments), train_experiments))
print('Test experiments ids ({}): {}'.format(len(test_experiments), test_experiments))

assert(len(train_experiments) + len(test_experiments) == N_EXPERIMENTS)
# Check that no test experiment is in train
assert(not any([i in test_experiments for i in train_experiments]))

Train experiments ids (12): [2, 11, 10, 6, 12, 3, 4, 8, 9, 5, 1, 15]
Test experiments ids (3): [13, 7, 14]


In [6]:
X_train = np.concatenate([features_dict[i] for i in train_experiments], axis=0)
Y_train = np.concatenate([targets_dict[i] for i in train_experiments], axis=0)
X_test = np.concatenate([features_dict[i] for i in test_experiments], axis=0)
Y_test = np.concatenate([targets_dict[i] for i in test_experiments], axis=0)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))

Train -> X: (104561, 15), Y: (104561, 24)
Test -> X: (25985, 15), Y: (25985, 24)


In [7]:
s = SCALER.fit(X_train)

X_train_norm = s.transform(X_train)
X_test_norm = s.transform(X_test)

print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
print('Test -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_test_norm, axis=0), np.max(X_test_norm, axis=0), np.mean(X_test_norm, axis=0), np.std(X_test_norm, axis=0)))

Train -> 
 min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.], 
 mean: [0.4448378  0.37159194 0.51588258 0.4190689  0.20284009 0.56795055
 0.63143291 0.31957625 0.60723926 0.41817225 0.56123806 0.23682047
 0.23266228 0.39591114 0.76709162], 
 std: [0.33288867 0.29378422 0.16624394 0.18524527 0.2809568  0.19569883
 0.14494188 0.17550486 0.23793802 0.17673375 0.11891252 0.1018684
 0.25310102 0.15488524 0.28605986]

Test -> 
 min: [ 8.72949314e-03  1.33303228e-02  1.17412805e-01  5.13174234e-02
  1.16680663e-04  5.32940183e-03  1.17831535e-02  1.31408591e-03
  3.86328051e-02  1.81936517e-03  3.69541600e-01  5.58295788e-02
  3.02266353e-02  2.49667728e-03 -7.17077873e-04], 
 max: [0.99598895 0.98188223 0.9722875  0.98806157 1.00044171 0.99840212
 0.95387876 1.00512108 1.00091957 0.78894653 0.98468742 1.38876155
 0.99764779 0.97202958 0.99010046], 
 mean: [0.44555736 0.37209331 0.51573448 0.41974707 0.19910736 0.5677548
 0.63148326 0.3

In [8]:
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

np.save(save_dir + '/X_train_' + DATA_ID + '.npy', X_train_norm)    
np.save(save_dir + '/X_test_' + DATA_ID + '.npy', X_test_norm)    
np.save(save_dir + '/Y_train_' + DATA_ID + '.npy', Y_train)    
np.save(save_dir + '/Y_test_' + DATA_ID + '.npy', Y_test)    

In [9]:
# Split the experiments of the training sets in different folds
exp_per_fold = len(train_experiments) // CV
cv_folds = [train_experiments[x:x+exp_per_fold] for x in range(0, len(train_experiments), exp_per_fold)]
print('CV folds {}\n'.format(cv_folds))

for fold_id in range(CV):
    print('Fold {}'.format(fold_id + 1))
    
    cv_folds_cp = cv_folds.copy()
    valid_experiments_fold = cv_folds_cp.pop(fold_id)
    train_experiments_fold = [item for sublist in cv_folds_cp for item in sublist]

    print('Train experiments ids ({}): {}'.format(len(train_experiments_fold), train_experiments_fold))
    print('Validation experiments ids ({}): {}'.format(len(valid_experiments_fold), valid_experiments_fold))

    assert(len(train_experiments_fold) + len(valid_experiments_fold) == len(train_experiments))
    # Check that no validation experiments are in train
    assert(not any([i in valid_experiments_fold for i in train_experiments_fold]))
    # Check that no test experiments are in train or validation folds
    assert(not any([i in test_experiments for i in train_experiments_fold]))
    assert(not any([i in test_experiments for i in valid_experiments_fold]))
    
    X_train = np.concatenate([features_dict[i] for i in train_experiments_fold], axis=0)
    Y_train = np.concatenate([targets_dict[i] for i in train_experiments_fold], axis=0)
    X_valid = np.concatenate([features_dict[i] for i in valid_experiments_fold], axis=0)
    Y_valid = np.concatenate([targets_dict[i] for i in valid_experiments_fold], axis=0)

    print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
    print('Validation -> X: {}, Y: {}'.format(X_valid.shape, Y_valid.shape))
    
    # Normalize the data
    s = SCALER.fit(X_train)

    X_train_norm = s.transform(X_train)
    X_valid_norm =  s.transform(X_valid)

    print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
    print('Valid -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}'.format(np.min(X_valid_norm, axis=0), np.max(X_valid_norm, axis=0), np.mean(X_valid_norm, axis=0), np.std(X_valid_norm, axis=0)))
    
    save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    np.save(save_dir + '/X_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_train_norm)    
    np.save(save_dir + '/X_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_valid_norm)    
    np.save(save_dir + '/Y_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_train)    
    np.save(save_dir + '/Y_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_valid)      
    
    print('\n')

CV folds [[2, 11], [10, 6], [12, 3], [4, 8], [9, 5], [1, 15]]

Fold 1
Train experiments ids (10): [10, 6, 12, 3, 4, 8, 9, 5, 1, 15]
Validation experiments ids (2): [2, 11]
Train -> X: (87168, 15), Y: (87168, 24)
Validation -> X: (17393, 15), Y: (17393, 24)
Train -> 
 min: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.], 
 mean: [0.44445726 0.37136815 0.51588106 0.41800079 0.2019976  0.56798724
 0.63141213 0.3231011  0.60725098 0.41963963 0.56365113 0.23903505
 0.22978311 0.39586013 0.78802145], 
 std: [0.3332355  0.29434303 0.16660117 0.18290819 0.27904884 0.19376346
 0.14324514 0.18067721 0.23779791 0.17718663 0.11925173 0.10372276
 0.25141175 0.15299483 0.2753975 ]

Valid -> 
 min: [ 0.00998084 -0.00052571  0.11892211  0.05554243  0.00013737  0.0036844
  0.00157334  0.00568987 -0.00028417  0.00098134  0.00476934  0.00620614
  0.05807598  0.00729467 -0.00025077], 
 max: [0.99897912 0.95801894 0.94284543 0.9975903  0.99943121 0.9953