In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import dask
from dask import dataframe as dd
from dask.diagnostics import ProgressBar
import glob
import random
import math
import json
import os
import gc

## Configuration

In [2]:
dask.config.set(num_workers=8, scheduler='processes')
random.seed(0)

# Directory where the derived data is stored
DERIVED_DATA_DIR = '../../../data'

# Number of force cells in the robotic leg
N_CELLS = 8

# Path where the results are stored
RESULTS_PATH = '../../../results'
# ID of the training and test data resulting from this notebook, stored in RESULTS_PATH
DATA_ID = '0007_19072021'

# Scaler to normalize the data
SCALER = MinMaxScaler() # StandardScaler()

# Total number of experiments
N_EXPERIMENTS = 63
# Number of folds for cross-validation
CV = 6
# Experiments for training set (int)
TRAIN_SIZE = 54
# Experiments for test set (int)
TEST_SIZE = 9

assert(TRAIN_SIZE + TEST_SIZE == N_EXPERIMENTS)
assert(TRAIN_SIZE % CV == 0)

## Features and target selection

In [4]:
H3_LEG = 'L' # L|R

# features = [H3_LEG + a + m for a in ['Hip', 'Knee', 'Ankle'] for m in ['Pos', 'Vel', 'Acc', 'Torque']] + ['LegKnee{}Filtered'.format(m) for m in ['Position', 'Velocity', 'Torque']]
# targets = ['F' + str(i + 1) + ax for i in range(N_CELLS) for ax in ['x', 'y', 'z']]
features = ['LKneePos'] + ['F' + str(i) + 'z' for i in [3, 4, 7, 8]]
targets = ['F' + str(i) + ax for i in [3, 4, 7, 8] for ax in ['y', 'z']]

print('Number of features: {}'.format(len(features)))
print('Selected features: {}'.format(features))
print('\n')
print('Number of targets: {}'.format(len(targets)))
print('Selected targets: {}'.format(targets))

Number of features: 5
Selected features: ['LKneePos', 'F3z', 'F4z', 'F7z', 'F8z']


Number of targets: 8
Selected targets: ['F3y', 'F3z', 'F4y', 'F4z', 'F7y', 'F7z', 'F8y', 'F8z']


In [5]:
experiments_dirs_path = glob.glob(DERIVED_DATA_DIR + '/*/*')

assert(len(experiments_dirs_path) == N_EXPERIMENTS)

In [11]:
# Index to crop the data and use only this section of each experiment (start idx, end idx)
# The indexes can be defined manually defining crop_by_index=(start idx, end idx) or seleted at random setting crop_by_index=True
window_size = 200
crop_by_index = False #(1500, 1700) # True

# Sample the experiment data to use only this sample datapoints
random_sample = True
random_sample_pct = 0.05

targets_dict = {}
features_dict = {}
for i, exp_path in enumerate(experiments_dirs_path):
    print('{} - Experiment {} from {}'.format(i, exp_path.split('/')[-1], exp_path.split('/')[-2]))
    
    # Load targets
    targets_df = pd.read_csv(exp_path + '/force_cells_processed.csv')
    
    # Load features
    exo_df = pd.read_csv(exp_path + '/H3_processed.csv')
    # leg_df = pd.read_csv(exp_path + '/leg_processed.csv')
    # features_df = pd.concat([exo_df, leg_df], axis=1)
    features_df = exo_df
    
    idx_aux = targets_df.duplicated(keep='first')
    targets_df = targets_df.loc[~idx_aux]
    features_df = features_df.loc[~idx_aux]
    print('Droping {} duplicated data points'.format(len(idx_aux[idx_aux == False])))
    
    # Drop first row to remove noise in the start of the data recording
    targets_df = targets_df.iloc[1:]
    features_df = features_df.iloc[1:]
    # Drop null values
    idx = features_df.notna().all(axis=1)
    features_df = features_df.loc[idx]
    targets_df = targets_df.loc[idx]
    print('Droping {} data points by null features'.format(len(idx[idx == False])))

    assert(len(features_df) == len(targets_df))
    data_df = pd.concat([features_df, targets_df], axis=1)
    
    # Crop the data by the indicated indexes
    if crop_by_index:
        if crop_by_index == True:
            start_idx = random.randint(100, len(data_df) - window_size - 100)
            crop_by_index = (start_idx, start_idx + window_size)
            
        data_df = data_df.iloc[crop_by_index[0]:crop_by_index[1]]
        
    if random_sample:
        data_df = data_df.sample(frac=random_sample_pct, random_state=0)
        
    # Store the final array
    targets_dict[i] = data_df[targets].values
    features_dict[i] = data_df[features].values
    
    print('Experiment {} -> X: {}, Y: {} \n'.format(i, features_dict[i].shape, targets_dict[i].shape))

0 - Experiment 1 from 10032021
Droping 2917 duplicated data points
Droping 0 data points by null features
Experiment 0 -> X: (146, 5), Y: (146, 8) 

1 - Experiment 1 from 16022021
Droping 8709 duplicated data points
Droping 0 data points by null features
Experiment 1 -> X: (435, 5), Y: (435, 8) 

2 - Experiment 2 from 16022021
Droping 8696 duplicated data points
Droping 0 data points by null features
Experiment 2 -> X: (435, 5), Y: (435, 8) 

3 - Experiment 3 from 16022021
Droping 8708 duplicated data points
Droping 0 data points by null features
Experiment 3 -> X: (435, 5), Y: (435, 8) 

4 - Experiment 4 from 16022021
Droping 8736 duplicated data points
Droping 0 data points by null features
Experiment 4 -> X: (437, 5), Y: (437, 8) 

5 - Experiment 5 from 16022021
Droping 8706 duplicated data points
Droping 0 data points by null features
Experiment 5 -> X: (435, 5), Y: (435, 8) 

6 - Experiment 6 from 16022021
Droping 8680 duplicated data points
Droping 0 data points by null features


## Normalization and split for cross-validation

In [12]:
experiments = list(range(N_EXPERIMENTS))
random.shuffle(experiments)

train_experiments = experiments[:TRAIN_SIZE]
test_experiments = experiments[TRAIN_SIZE:]

print('Train experiments ids ({}): {}'.format(len(train_experiments), train_experiments))
print('Test experiments ids ({}): {}'.format(len(test_experiments), test_experiments))

assert(len(train_experiments) + len(test_experiments) == N_EXPERIMENTS)
# Check that no test experiment is in train
assert(not any([i in test_experiments for i in train_experiments]))

Train experiments ids (54): [53, 41, 3, 60, 33, 58, 27, 5, 7, 44, 49, 28, 23, 29, 46, 12, 57, 0, 61, 1, 43, 40, 14, 15, 17, 62, 20, 36, 10, 47, 11, 35, 52, 21, 4, 42, 51, 9, 38, 34, 59, 39, 6, 45, 18, 8, 55, 13, 37, 22, 30, 19, 50, 25]
Test experiments ids (9): [31, 32, 16, 2, 26, 56, 48, 24, 54]


In [13]:
X_train = np.concatenate([features_dict[i] for i in train_experiments], axis=0)
Y_train = np.concatenate([targets_dict[i] for i in train_experiments], axis=0)
X_test = np.concatenate([features_dict[i] for i in test_experiments], axis=0)
Y_test = np.concatenate([targets_dict[i] for i in test_experiments], axis=0)

print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
print('Test -> X: {}, Y: {}'.format(X_test.shape, Y_test.shape))
print('Total data points: {}'.format(X_train.shape[0] + X_test.shape[0]))

Train -> X: (22908, 5), Y: (22908, 8)
Test -> X: (3905, 5), Y: (3905, 8)
Total data points: 26813


In [14]:
s = SCALER.fit(X_train)

X_train_norm = s.transform(X_train)
X_test_norm = s.transform(X_test)

print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
print('Test -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_test_norm, axis=0), np.max(X_test_norm, axis=0), np.mean(X_test_norm, axis=0), np.std(X_test_norm, axis=0)))

Train -> 
 min: [0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1.], 
 mean: [0.20872137 0.57762517 0.14325474 0.58093556 0.26200923], 
 std: [0.28592846 0.17740653 0.11688366 0.11681586 0.14901657]

Test -> 
 min: [7.61433427e-05 1.41734393e-01 1.28950507e-02 6.59659297e-02
 3.71567044e-03], 
 max: [0.99987681 0.91890685 0.93820811 0.94642987 0.59515347], 
 mean: [0.20544116 0.56314337 0.15947602 0.57992486 0.23765596], 
 std: [0.28550568 0.13531224 0.13424705 0.12395833 0.12962729]



In [15]:
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

np.save(save_dir + '/X_train_' + DATA_ID + '.npy', X_train_norm)    
np.save(save_dir + '/X_test_' + DATA_ID + '.npy', X_test_norm)    
np.save(save_dir + '/Y_train_' + DATA_ID + '.npy', Y_train)    
np.save(save_dir + '/Y_test_' + DATA_ID + '.npy', Y_test)    

In [16]:
# Split the experiments of the training sets in different folds
exp_per_fold = len(train_experiments) // CV
cv_folds = [train_experiments[x:x+exp_per_fold] for x in range(0, len(train_experiments), exp_per_fold)]
print('CV folds ({}): {}\n'.format(len(cv_folds), cv_folds))

for fold_id in range(CV):
    print('Fold {}'.format(fold_id + 1))
    
    cv_folds_cp = cv_folds.copy()
    valid_experiments_fold = cv_folds_cp.pop(fold_id)
    train_experiments_fold = [item for sublist in cv_folds_cp for item in sublist]

    print('Train experiments ids ({}): {}'.format(len(train_experiments_fold), train_experiments_fold))
    print('Validation experiments ids ({}): {}'.format(len(valid_experiments_fold), valid_experiments_fold))

    assert(len(train_experiments_fold) + len(valid_experiments_fold) == len(train_experiments))
    # Check that no validation experiments are in train
    assert(not any([i in valid_experiments_fold for i in train_experiments_fold]))
    # Check that no test experiments are in train or validation folds
    assert(not any([i in test_experiments for i in train_experiments_fold]))
    assert(not any([i in test_experiments for i in valid_experiments_fold]))
    
    X_train = np.concatenate([features_dict[i] for i in train_experiments_fold], axis=0)
    Y_train = np.concatenate([targets_dict[i] for i in train_experiments_fold], axis=0)
    X_valid = np.concatenate([features_dict[i] for i in valid_experiments_fold], axis=0)
    Y_valid = np.concatenate([targets_dict[i] for i in valid_experiments_fold], axis=0)

    print('Train -> X: {}, Y: {}'.format(X_train.shape, Y_train.shape))
    print('Validation -> X: {}, Y: {}'.format(X_valid.shape, Y_valid.shape))
    
    # Normalize the data
    s = SCALER.fit(X_train)

    X_train_norm = s.transform(X_train)
    X_valid_norm =  s.transform(X_valid)

    print('Train -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}\n'.format(np.min(X_train_norm, axis=0), np.max(X_train_norm, axis=0), np.mean(X_train_norm, axis=0), np.std(X_train_norm, axis=0)))
    print('Valid -> \n min: {}, \n max: {}, \n mean: {}, \n std: {}'.format(np.min(X_valid_norm, axis=0), np.max(X_valid_norm, axis=0), np.mean(X_valid_norm, axis=0), np.std(X_valid_norm, axis=0)))
    
    save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    np.save(save_dir + '/X_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_train_norm)    
    np.save(save_dir + '/X_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), X_valid_norm)    
    np.save(save_dir + '/Y_train_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_train)    
    np.save(save_dir + '/Y_valid_cv{}_{}.npy'.format(fold_id + 1, DATA_ID), Y_valid)      
    
    print('\n')

CV folds (6): [[53, 41, 3, 60, 33, 58, 27, 5, 7], [44, 49, 28, 23, 29, 46, 12, 57, 0], [61, 1, 43, 40, 14, 15, 17, 62, 20], [36, 10, 47, 11, 35, 52, 21, 4, 42], [51, 9, 38, 34, 59, 39, 6, 45, 18], [8, 55, 13, 37, 22, 30, 19, 50, 25]]

Fold 1
Train experiments ids (45): [44, 49, 28, 23, 29, 46, 12, 57, 0, 61, 1, 43, 40, 14, 15, 17, 62, 20, 36, 10, 47, 11, 35, 52, 21, 4, 42, 51, 9, 38, 34, 59, 39, 6, 45, 18, 8, 55, 13, 37, 22, 30, 19, 50, 25]
Validation experiments ids (9): [53, 41, 3, 60, 33, 58, 27, 5, 7]
Train -> X: (19002, 5), Y: (19002, 8)
Validation -> X: (3906, 5), Y: (3906, 8)
Train -> 
 min: [0. 0. 0. 0. 0.], 
 max: [1. 1. 1. 1. 1.], 
 mean: [0.20691427 0.56888815 0.14473881 0.57922498 0.39438059], 
 std: [0.28515417 0.18178602 0.11243276 0.11447013 0.20835643]

Valid -> 
 min: [5.21636237e-05 1.81793478e-01 7.78920692e-03 6.59659297e-02
 5.35381750e-03], 
 max: [0.99706249 1.01422101 0.93820811 0.94773469 1.44087523], 
 mean: [0.21751257 0.66830533 0.13603503 0.58925724 0.29551