In [1]:
import openturns as ot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Set up the reference model used throughout the notebook: the Ishigami
# test function shipped with the `shapley` package.
from shapley.tests import Ishigami

model = Ishigami()
# Input dimension as reported by the model; reused below to size the copula
# and the hyper-parameter search ranges.
dim = model.dim
# Attach a Gaussian copula of matching dimension to the model inputs.
# NOTE(review): NormalCopula(dim) presumably defaults to an identity
# correlation (independent inputs) -- confirm against the OpenTURNS docs.
model.copula = ot.NormalCopula(dim)
# NOTE(review): `theta` is not used in the visible cells; presumably a
# dependence/correlation parameter consumed elsewhere -- verify.
theta = 0.

In [3]:
# Build a random-forest surrogate of `model` from a fixed-size design.
from shapley.forest import RandomForestModel
# Seed both the OpenTURNS and the NumPy global generators *before* sampling
# so the design generated below is repeatable.
ot.RandomGenerator.SetSeed(0)
np.random.seed(0)

model_budget = 300   # passed as n_sample: size of the training design
n_realization = 500  # passed as n_estimators to build()

model_rf = RandomForestModel(model=model, input_distribution=model.input_distribution)
# sampling='lhs' -- presumably Latin hypercube sampling; confirm in shapley.forest.
model_rf.generate_sample(n_sample=model_budget, sampling='lhs')
model_rf.build(n_estimators=n_realization)

In [4]:
from scipy.stats import randint as sp_randint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search over the random-forest surrogate.
#
# scipy.stats.randint(low, high) draws integers from [low, high): `high` is
# EXCLUDED. The upper bound for max_features therefore has to be dim + 1,
# otherwise the search can never try max_features == dim (every input
# considered at each split).
param_dist = {"max_features": sp_randint(1, dim + 1),     # 1..dim
              "min_samples_split": sp_randint(2, 11),     # 2..10
              "min_samples_leaf": sp_randint(1, 11)}      # 1..10

n_iter_search = 50  # number of candidate settings sampled
n_fold = 5          # number of cross-validation folds

random_search = RandomizedSearchCV(model_rf.reg_rf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=n_fold, n_jobs=7)

# Training data: the design and responses stored on the surrogate wrapper.
X = model_rf.input_sample
y = model_rf.output_sample

# Wall-clock timing of the whole search (fit of every candidate on every fold).
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

RandomizedSearchCV took 31.51 seconds for 50 candidates parameter settings.


In [5]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a short summary of the best-ranked search candidates.

    Parameters
    ----------
    results : mapping
        A ``cv_results_``-style mapping; the keys read are
        ``'rank_test_score'``, ``'mean_test_score'``, ``'std_test_score'``
        and ``'params'``.
    n_top : int, optional
        Highest rank to report (ranks 1..n_top). Every candidate sharing a
        rank is printed, so ties produce more than ``n_top`` entries.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Indices of all candidates holding this rank (several when tied).
        matching = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matching:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            entry = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    mean_score, std_score),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line between entries
            ]
            print("\n".join(entry))
            

# Print the top-ranked candidates (default n_top=3) from the cross-validation
# results of the search currently bound to `random_search`.
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.763 (std: 0.028)
Parameters: {'max_depth': None, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 5}

Model with rank: 2
Mean validation score: 0.740 (std: 0.029)
Parameters: {'max_depth': None, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 9}

Model with rank: 3
Mean validation score: 0.735 (std: 0.040)
Parameters: {'max_depth': None, 'max_features': 2, 'min_samples_leaf': 4, 'min_samples_split': 8}



In [8]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [18]:
# Bayesian hyper-parameter search (scikit-optimize) over the same estimator.
# skopt's Integer(low, high) includes BOTH bounds, so max_features spans 1..dim.
search_spaces = {
    "max_features": Integer(1, dim),
    "min_samples_split": Integer(2, 11),
    "min_samples_leaf": Integer(1, 11)
}
# NOTE: the result is deliberately still bound to `random_search` because the
# following cells read `random_search.best_params_` / `.best_estimator_`;
# this overwrites the RandomizedSearchCV object fitted earlier.
# `n_iter_search`, `n_fold` and `time` come from the randomized-search cell.
random_search = BayesSearchCV(model_rf.reg_rf, search_spaces=search_spaces, optimizer_kwargs={'n_initial_points': 20},
                                   n_iter=n_iter_search, cv=n_fold, n_jobs=7)

# Training data: the design and responses stored on the surrogate wrapper.
X = model_rf.input_sample
y = model_rf.output_sample

start = time()
random_search.fit(X, y)
# Report the class that actually ran (the message previously said
# "RandomizedSearchCV", a copy-paste leftover from the cell above).
print("BayesSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))



RandomizedSearchCV took 72.45 seconds for 50 candidates parameter settings.


In [14]:
# Display the best hyper-parameters found by the search bound to `random_search`.
# NOTE(review): the recorded output lists 'max_depth', which is not part of the
# search space defined in the visible Bayes-search cell, and this cell's
# execution count (14) precedes that cell's (18) -- the output looks stale;
# re-run the notebook top to bottom to refresh it.
random_search.best_params_

{'max_depth': 92,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [16]:
    # Display the estimator exposed as `best_estimator_` by the fitted search
    # (per the scikit-learn API, the model refit with the best-found parameters).
    # NOTE(review): the recorded repr shows max_depth=92, which the visible
    # search space does not tune -- output presumably from an earlier run.
    random_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=92,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)