In [1]:
import numpy as np
import os
import gc

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor

In [2]:
# Root data folder, relative to the notebook's working directory
SOURCE_PATH = '../data'
# Sub-folder with the derived arrays. It is appended to SOURCE_PATH by plain
# string concatenation, so the leading '/' is required.
DERIVED_DATA_DIR = '/derived_data'

# Experiment params
# Identifier embedded in the names of the .npy files that get loaded
DATA_ID = '0001_23032021'

In [3]:
# Folder with the derived arrays (DERIVED_DATA_DIR already starts with '/').
derived_dir = SOURCE_PATH + DERIVED_DATA_DIR

# Load the four arrays in a fixed order; each file is named
# <prefix><DATA_ID>.npy inside derived_dir.
X_train, X_test, Y_train, Y_test = (
    np.load(os.path.join(derived_dir, prefix + DATA_ID + '.npy'))
    for prefix in ('X_train_', 'X_test_', 'Y_train_', 'Y_test_')
)

In [4]:
# Number of cells; every cell contributes three targets (x, y, z)
N_CELLS = 8
# Target labels grouped cell by cell: F1x, F1y, F1z, F2x, ...
targets_names = [
    'F' + str(cell) + axis
    for cell in range(1, N_CELLS + 1)
    for axis in ('x', 'y', 'z')
]

In [None]:
# Hyper-parameter grid for the random forest: 3 * 2 * 4 = 24 candidates.
param_grid = {
    'n_estimators': [10, 100, 1000],
    # scikit-learn 1.0 renamed the 'mae' / 'mse' criteria to
    # 'absolute_error' / 'squared_error' and 1.2 removed the old spellings,
    # which makes every candidate fit fail there. Requires scikit-learn >= 1.0.
    'criterion': ['absolute_error', 'squared_error'],
    'max_depth': [2, 5, 10, None],
}

# Fixed seed so repeated runs build the same forests; verbose=10 makes the
# forest log its own progress during fit and predict.
rf = RandomForestRegressor(random_state=0, verbose=10)

# 5-fold cross-validated search scored by negative MSE on all available
# cores (n_jobs=-1). Y_train is passed as-is, without a per-target split.
clf = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)
clf.fit(X_train, Y_train)

print('Best params: {}'.format(clf.best_params_))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
# Predictions of the refitted best model on both splits.
best_model = clf.best_estimator_
train_preds = best_model.predict(X_train)
test_preds = best_model.predict(X_test)

# Metric label -> scoring function; multioutput='raw_values' keeps one score
# per target column instead of averaging them.
metric_fns = [('MAE', mean_absolute_error), ('MSE', mean_squared_error), ('R2', r2_score)]
splits = [('Train', Y_train, train_preds), ('Test', Y_test, test_preds)]

results = {
    split: {
        name: fn(y_true, y_pred, multioutput='raw_values')
        for name, fn in metric_fns
    }
    for split, y_true, y_pred in splits
}

# Report mean ± std over the cells for each force component.
# Assumes target columns are ordered x, y, z within each cell, so component
# `offset` of cell `c` is column 3 * c + offset.
for split, _, _ in splits:
    for offset, component in enumerate(('Fx', 'Fy', 'Fz')):
        for name, _ in metric_fns:
            per_cell = [results[split][name][3 * c + offset] for c in range(N_CELLS)]
            label = ' '.join([split, component, name])
            print(label + ': {:.4f} ± {:.4f}'.format(np.mean(per_cell), np.std(per_cell)))

In [None]:
# Disabled experiment: the whole cell body is a single triple-quoted string,
# so none of the code below executes. It sketches a per-target search that
# fits one GridSearchCV per column of Y_train (with an empty param_grid) and
# prints test MAE / MSE / R2 for each target.
# NOTE(review): the commented-out train metrics inside reference X_train_norm,
# which is not defined in the visible cells. Fix that before re-enabling, or
# delete this cell if the experiment is abandoned.
'''

param_grid = {}
#     'n_estimators': [10, 20, 50, 100, 500],
#     'criterion': ['mse', 'mae'],
#     'max_depth': [1, 2, 5, 10],

# }

for t in range(Y_train.shape[1]):
    target = targets_names[t]
    rf = RandomForestRegressor(random_state=0)

    clf = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=8)
    clf.fit(X_train, Y_train[:, t])

    #print('Best params: {}'.format(clf.best_params_))


    test_preds = clf.best_estimator_.predict(X_test)

    # print('{} train MAE: {}'.format(target, mean_absolute_error(Y_train[:, t], clf.best_estimator_.predict(X_train_norm))))
    # print('{} train MSE: {}'.format(target, mean_squared_error(Y_train[:, t], clf.best_estimator_.predict(X_train_norm))))
    # print('{} train R2: {}'.format(target, r2_score(Y_train[:, t], clf.best_estimator_.predict(X_train_norm))))

    print('{} test MAE: {}'.format(target, mean_absolute_error(Y_test[:, t], test_preds)))
    print('{} test MSE: {}'.format(target, mean_squared_error(Y_test[:, t], test_preds)))
    print('{} test R2: {}'.format(target, r2_score(Y_test[:, t], test_preds)))
    
    print()
    print('///////////////////////////////////////////////////')
    print()
    gc.collect()
    
'''