In [None]:
import numpy as np
import os
import gc
import sys
import time
import random
import string
from time import strftime
import json
from collections import defaultdict
from sklearn.model_selection import ParameterGrid, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

In [None]:
from IPython.utils.io import Tee

# Send everything written to stdout both to the terminal and to a
# timestamped log file stored under `logs_dir`.
logs_dir = './logs'
logfilename = '{}{}.log'.format(logs_dir, strftime('/ipython_%Y-%m-%d_%H:%M:%S'))
if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

# Rebind stdout to the terminal device first; Tee then installs itself on the
# 'stdout' channel and copies whatever is written there into the log file.
sys.stdout = open('/dev/stdout', 'w')
Tee(logfilename, mode='w', channel='stdout')


## Configuration

In [None]:
# How many force cells the robotic leg has
N_CELLS = 8

# Root folder in which the results are stored
RESULTS_PATH = '../../../../results'
# Identifier of the training/validation data set kept under RESULTS_PATH
DATA_ID = '0005_19042021'
# How many folds the cross-validation uses
CV = 6

print('Model training with data: {}'.format(DATA_ID))

## XGBoost hyperparameter search

In [None]:
# Hyperparameters search date; it tags the output folder and the result file names
hs_date = '20042021'

# Parameters grid: ParameterGrid expands it into every combination of the listed values
param_grid = {
    'objective': ['reg:squarederror'],
    'booster': ['gbtree'],
    'eta': [0.3, 0.4, 0.5],
    'gamma': [0.05, 0.01, 0.005],
    'subsample': [0.5, 0.75, 1],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'lambda': [0, 1, 2],
    'nthread': [8],
    'seed': [0]
}

# Boosting rounds per model. This is the documented default of xgb.train, now
# spelled out instead of being relied on implicitly.
# NOTE(review): with only 10 rounds, an early-stopping patience of 5 leaves very
# little room to act -- confirm that this budget is intended.
NUM_BOOST_ROUND = 10
# Rounds without improvement on the validation set before training stops
EARLY_STOPPING_ROUNDS = 5

# Force components. The aggregation below treats target column 3 * c + f as
# component FORCES[f] of the c-th group of three columns (presumably one group
# per force cell -- verify against the data-generation notebook).
FORCES = ['Fx', 'Fy', 'Fz']
n_expected_targets = N_CELLS * len(FORCES)

# Loop-invariant locations of the input folds and of the output JSON files
data_dir = os.path.join(RESULTS_PATH, DATA_ID, 'data')
save_dir = os.path.join(RESULTS_PATH, DATA_ID, 'XGB_{}'.format(hs_date))
# Created up front so an unwritable results path fails before any training
os.makedirs(save_dir, exist_ok=True)


def _load_fold_array(name, fold_number):
    """Load one array (e.g. 'X_train') of the given 1-based cross-validation fold."""
    filename = '{}_cv{}_{}.npy'.format(name, fold_number, DATA_ID)
    return np.load(os.path.join(data_dir, filename))


param_grid_ls = list(ParameterGrid(param_grid))
# The evaluation order is randomised (unseeded), so an interrupted search has
# still covered a random sample of the grid.
random.shuffle(param_grid_ls)
param_grid_len = len(param_grid_ls)
print('Number of parameters combinations: {}'.format(param_grid_len))

for idx, params in enumerate(param_grid_ls):
    # Random 10-character identifier used to name this combination's result file
    params_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
    print('Parameters ({}) {}/{}  -  {}'.format(params_id, idx + 1, param_grid_len, strftime('%Y-%m-%d %H:%M:%S')))
    print(params)

    # Train the model with cross-validation
    cv_results = defaultdict(list)
    for fold_id in range(CV):
        print('Fold {}'.format(fold_id + 1))

        # Load data (fold files are numbered from 1)
        X_train = _load_fold_array('X_train', fold_id + 1)
        X_valid = _load_fold_array('X_valid', fold_id + 1)
        Y_train = _load_fold_array('Y_train', fold_id + 1)
        Y_valid = _load_fold_array('Y_valid', fold_id + 1)

        # Fail fast: the per-force aggregation needs N_CELLS * 3 target columns.
        # Without this check the mismatch only surfaces as an IndexError after
        # every target of the fold has already been trained.
        if Y_train.shape[1] < n_expected_targets:
            raise ValueError(
                'Fold {}: Y_train has {} target columns but N_CELLS * {} = {} are required'.format(
                    fold_id + 1, Y_train.shape[1], len(FORCES), n_expected_targets))

        # One independent single-output model is trained per target column
        results = defaultdict(list)
        tr_time = []
        for target in range(Y_train.shape[1]):

            dtrain = xgb.DMatrix(data=X_train, label=Y_train[:, target])
            dvalid = xgb.DMatrix(data=X_valid, label=Y_valid[:, target])

            # Early stopping monitors the 'rmse' metric on the evaluation set
            # (registered below under the label 'rmse').
            callbacks = [xgb.callback.EarlyStopping(rounds=EARLY_STOPPING_ROUNDS, metric_name='rmse',
                                                    maximize=False, save_best=True)]

            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments
            t_start = time.perf_counter()
            model = xgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND, evals=[(dvalid, 'rmse')],
                              callbacks=callbacks, verbose_eval=False)
            tr_time.append(time.perf_counter() - t_start)

            train_preds = model.predict(dtrain)
            valid_preds = model.predict(dvalid)

            results['Train_MAE'].append(mean_absolute_error(Y_train[:, target], train_preds))
            results['Train_MSE'].append(mean_squared_error(Y_train[:, target], train_preds))
            results['Train_R2'].append(r2_score(Y_train[:, target], train_preds))
            results['Valid_MAE'].append(mean_absolute_error(Y_valid[:, target], valid_preds))
            results['Valid_MSE'].append(mean_squared_error(Y_valid[:, target], valid_preds))
            results['Valid_R2'].append(r2_score(Y_valid[:, target], valid_preds))

        # Fit time of a fold is the summed training time of all its per-target models
        cv_results['fit_time'].append(sum(tr_time))
        print('Training time: {:.4f}'.format(cv_results['fit_time'][-1]))

        # Aggregate per force component: mean and std over every third target,
        # starting at the component's offset, within the first N_CELLS * 3 targets
        for subset in ['Train', 'Valid']:
            for f, force in enumerate(FORCES):
                for loss in ['MAE', 'MSE', 'R2']:
                    scores = results['_'.join([subset, loss])][f:n_expected_targets:len(FORCES)]
                    cv_results['_'.join([subset, force, loss, 'mean'])].append(np.mean(scores))
                    cv_results['_'.join([subset, force, loss, 'std'])].append(np.std(scores))

    # Save the obtained results and its parameters into a JSON file
    rd = {}
    rd['id'] = params_id
    rd['parameters'] = params
    rd['cv_results'] = dict(cv_results)

    with open(os.path.join(save_dir, 'XGB_{}_{}.json'.format(hs_date, params_id)), 'w') as fp:
        json.dump(rd, fp)

    print('\n\n')
    # Drop the per-combination objects before the next iteration to limit memory use
    del model, results, cv_results, rd
    gc.collect()