In [1]:
import numpy as np
import os
import gc
import sys
import time
import random
import string
from time import strftime
from joblib import dump, load
import json
from collections import defaultdict
from sklearn.model_selection import ParameterGrid, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle

In [2]:
from IPython.utils.io import Tee

# Send everything printed on stdout both to the terminal and to a
# timestamped log file kept under ./logs
logs_dir = './logs'
if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

# One log file per run, named after the moment this cell was executed
logfilename = '{}{}.log'.format(logs_dir, strftime('/ipython_%Y-%m-%d_%H:%M:%S'))

# stdout is rebound to the terminal device before Tee is attached to the
# 'stdout' channel (presumably so the output reaches the launching terminal
# rather than the notebook front-end -- TODO confirm)
sys.stdout = open('/dev/stdout', 'w')
Tee(logfilename, mode='w', channel='stdout')


<IPython.utils.io.Tee at 0x7fd1f8d1f7c0>

## Configuration

In [3]:
# How many force cells the robotic leg has
N_CELLS = 8

# Root directory under which the results are stored
RESULTS_PATH = '../../../../results'
# Identifier of the training/validation data set, stored under RESULTS_PATH
DATA_ID = '0003_11042021'
# How many cross-validation folds are used
CV = 6

print('Model training with data: {}'.format(DATA_ID))

## Random Forest hyperparameters search

In [None]:
# Hyperparameters search date; it names the output directory and every result file
hs_date = '12042021'

# Parameters grid: every combination of these values is trained and evaluated
param_grid = {
    'n_estimators': [100, 500, 1000, 5000, 10000],
    'max_depth': [5, 7, 10, 13, 15],
    'max_features': [0.01, 0.05, 0.1, 0.2],
    'min_samples_leaf': [0.1, 0.01, 0.001],
    'min_samples_split': [0.01, 0.001, 0.0001]
}


def _load_fold_arrays(fold_number):
    """Load the arrays of one cross-validation fold from disk.

    Args:
        fold_number: 1-based fold index, as embedded in the file names.

    Returns:
        Tuple ``(X_train, X_valid, Y_train, Y_valid)`` of numpy arrays read
        from ``RESULTS_PATH/DATA_ID/data``.
    """
    data_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
    return tuple(
        np.load(os.path.join(data_dir, '{}_cv{}_{}.npy'.format(prefix, fold_number, DATA_ID)))
        for prefix in ('X_train', 'X_valid', 'Y_train', 'Y_valid')
    )


def _per_output_scores(y_true, y_pred):
    """Return MAE, MSE and R2, each computed separately for every output column."""
    return {
        'MAE': mean_absolute_error(y_true, y_pred, multioutput='raw_values'),
        'MSE': mean_squared_error(y_true, y_pred, multioutput='raw_values'),
        'R2': r2_score(y_true, y_pred, multioutput='raw_values')
    }


# The combinations are visited in random order (presumably so that a run
# stopped early still covers the grid evenly -- TODO confirm intent)
param_grid_ls = list(ParameterGrid(param_grid))
random.shuffle(param_grid_ls)
param_grid_len = len(param_grid_ls)
print('Number of parameters combinations: {}'.format(param_grid_len))

# All result files of this search share one directory; create it once up
# front instead of re-checking it on every iteration
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'RF_{}'.format(hs_date))
os.makedirs(save_dir, exist_ok=True)

for idx, params in enumerate(param_grid_ls):
    print(strftime('%Y-%m-%d %H:%M:%S'))
    # Random 10-character ID used to name this combination's result file
    params_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    print('Parameters ({}) {}/{}'.format(params_id, idx + 1, param_grid_len))
    print(params)

    # Train the model with cross-validation; every key collects one value per fold
    cv_results = defaultdict(list)
    for fold_id in range(CV):
        print('Fold {}'.format(fold_id + 1))

        # Load data (re-read for every fold so only one fold is held in memory)
        X_train, X_valid, Y_train, Y_valid = _load_fold_arrays(fold_id + 1)

        # Setup the model
        model = RandomForestRegressor(**params, random_state=0, n_jobs=-1, verbose=0)

        # Train the model; perf_counter is monotonic, so the measured duration
        # is not affected by system clock adjustments during long fits
        t_start = time.perf_counter()
        model.fit(X_train, Y_train)
        t_end = time.perf_counter()

        cv_results['fit_time'].append(t_end - t_start)

        # Get the per-output scores on both subsets
        results = {
            'Train': _per_output_scores(Y_train, model.predict(X_train)),
            'Valid': _per_output_scores(Y_valid, model.predict(X_valid))
        }

        # Output column 3 * cell + f is treated as force component f of that
        # cell; each score is summarised as mean and std across the N_CELLS cells
        for subset in ['Train', 'Valid']:
            for f, force in enumerate(['Fx', 'Fy', 'Fz']):
                for loss in ['MAE', 'MSE', 'R2']:
                    scores = [results[subset][loss][i + f] for i in range(0, N_CELLS * 3, 3)]
                    cv_results['_'.join([subset, force, loss, 'mean'])].append(np.mean(scores))
                    cv_results['_'.join([subset, force, loss, 'std'])].append(np.std(scores))

    # Save the obtained results and its parameters into a JSON file
    rd = {
        'id': params_id,
        'parameters': params,
        'cv_results': dict(cv_results)
    }

    with open(os.path.join(save_dir, 'RF_{}_{}.json'.format(hs_date, params_id)), 'w') as fp:
        json.dump(rd, fp)

    print('\n\n')
    # Drop the fitted forest and score containers before the next combination
    del model, results, cv_results, rd
    gc.collect()