# Validation curves (seleção de parâmetros para tuning)

In [1]:
from sklearn.model_selection import validation_curve, RandomizedSearchCV

In [None]:
class HyperParameterPpcEvaluator:
    """Explore regressor hyperparameters for predicting ``PrimePathCoverage``.

    Two tools are offered: per-parameter validation curves (plotted with
    matplotlib) and a randomized hyperparameter search.
    """

    def __init__(self, dataset, regressor):
        """Store the data frame and the estimator used for validation curves.

        Args:
            dataset: Object exposing ``get_dataframe()``; the returned frame is
                indexed by column name and must contain a
                ``'PrimePathCoverage'`` column.
            regressor: Estimator handed to ``validation_curve``.
        """
        self.__dataset = dataset.get_dataframe()
        self.__regressor = regressor

    def evaluate_validation_curve(self, metrics):
        """Plot one validation curve per swept hyperparameter.

        Args:
            metrics: Column name(s) of the data frame used as input features.
        """
        # NOTE(review): 'mse'/'mae' and max_features='auto' are spellings from
        # older scikit-learn releases - confirm against the installed version.
        sweeps = [
            ('n_estimators', [50, 100, 150, 200, 250, 300]),
            ('criterion', ['mse', 'mae']),
            ('max_depth', list(np.linspace(10, 110, num=10, dtype=int))),
            ('min_samples_split', [2, 5, 10]),
            ('min_samples_leaf', [1, 2, 4]),
            ('min_weight_fraction_leaf', list(np.linspace(0.0, 0.49, 5, dtype=float))),
            ('max_features', ['auto', 'sqrt', 'log2']),
            ('max_leaf_nodes', list(np.linspace(10, 110, num=11, dtype=int))),
            ('min_impurity_decrease', list(np.linspace(0.0, 5.0, 20))),
            ('bootstrap', [True, False]),
        ]
        for param_name, param_range in sweeps:
            self.__evaluate_validation_curve(param_name, param_range, metrics)

    def __evaluate_validation_curve(self, param_name, param_range, metrics):
        """Run a 10-fold validation curve for one parameter and plot it."""
        train_scores, test_scores = validation_curve(
            self.__regressor,
            X=self.__dataset[metrics].values,
            y=self.__dataset['PrimePathCoverage'].values,
            param_name=param_name,
            param_range=param_range,
            cv=10,
        )

        self.__plot_validation_curve(train_scores, test_scores, param_range, param_name)

    @staticmethod
    def __axis_positions(param_range):
        """Return ``(x_positions, tick_labels)`` for plotting ``param_range``.

        Numeric ranges are plotted at their own values (``tick_labels`` is
        ``None``). Any range containing a non-numeric or boolean value is
        plotted at ``0..n-1`` and labelled with ``str(value)``.
        """
        values = list(param_range)
        is_numeric = all(
            isinstance(v, (int, float, np.integer, np.floating))
            and not isinstance(v, (bool, np.bool_))
            for v in values
        )
        if is_numeric:
            return values, None
        return list(range(len(values))), [str(v) for v in values]

    def __plot_validation_curve(self, train_score, test_score, param_range, param_name):
        """Plot mean +/- one standard deviation of train and CV scores.

        Args:
            train_score: Training scores; statistics are taken along axis 1.
            test_score: Cross-validation scores, same layout as ``train_score``.
            param_range: Values of the swept parameter (x axis).
            param_name: Name of the swept parameter (title and x label).
        """
        train_scores_mean = np.mean(train_score, axis=1)
        train_scores_std = np.std(train_score, axis=1)
        test_scores_mean = np.mean(test_score, axis=1)
        test_scores_std = np.std(test_score, axis=1)

        positions, tick_labels = self.__axis_positions(param_range)

        plt.title("Validation Curve - " + param_name)
        # Label the axis with the parameter actually being swept.
        plt.xlabel(param_name)
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        lw = 2
        # Linear axis: several ranges start at 0 or are categorical, which a
        # logarithmic axis cannot represent.
        plt.plot(positions, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
        plt.fill_between(positions, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2,
                         color="darkorange", lw=lw)
        plt.plot(positions, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
        plt.fill_between(positions, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2,
                         color="navy", lw=lw)
        if tick_labels is not None:
            plt.xticks(positions, tick_labels)
        plt.legend(loc="best")
        plt.show()

    def evaluate_hyperparam(self, metrics, hyperparam_space):
        """Run a randomized search (100 draws, 10-fold CV) and display the best parameters.

        Args:
            metrics: Column name(s) of the data frame used as input features.
            hyperparam_space: Parameter distributions passed to
                ``RandomizedSearchCV`` as ``param_distributions``.
        """
        # NOTE(review): the search uses a fresh RandomForestRegressor() rather
        # than the regressor given to __init__ - confirm this is intended.
        random_search = RandomizedSearchCV(
            estimator=RandomForestRegressor(),
            param_distributions=hyperparam_space,
            n_iter=100,
            cv=10,
            random_state=0,
        )

        random_search.fit(
            self.__dataset[metrics].values,
            self.__dataset['PrimePathCoverage'].values,
        )

        display(random_search.best_params_)
        