# Import

In [1]:
try:
    %run ../dataset/dataset.ipynb
except:
    pass

from sklearn.model_selection import validation_curve, RandomizedSearchCV

# HyperParameterPpcEvaluator

In [None]:
class HyperParameterPpcEvaluator:
    """
    Responsible for performing a search in order to discover the set of 
    specific model configuration arguments that result in the best performance 
    of the model on a test coverage dataset.

    Two kinds of analysis are offered:

    * ``evaluate_validation_curve`` draws one validation-curve chart per 
      hyperparameter, using the regressor given to the constructor.
    * ``evaluate_hyperparam`` runs a randomized search over a hyperparameter 
      space and displays the best combination found.
    """
    
    # -------------------------------------------------------------------------
    #           Constructor
    # -------------------------------------------------------------------------
    def __init__(self, dataset: CoverageDataset, regressor: MlPpcEvaluator):
        """
        Performs a search in order to discover the set of specific model 
        configuration arguments that result in the best performance of the model
        on a test coverage dataset.
        
        :param      dataset: Dataset to be evaluated
        :param      regressor: Machine learning algorithm
        """
        self.__dataset = dataset.get_dataframe()
        self.__regressor = regressor
        # Filled in by __build_validation_curve with the scores of the last
        # hyperparameter that was swept.
        self.__train_score = 0
        self.__test_score = 0
        
        
    # -------------------------------------------------------------------------
    #           Methods
    # -------------------------------------------------------------------------
    def evaluate_validation_curve(self, metrics):
        """
        Draws one validation-curve chart for each hyperparameter listed below.
        
        :param      metrics: Dataset columns used as input features
        """
        self.__evaluate_validation_curve('n_estimators', [50, 100, 150, 200, 250, 300], metrics)
        # NOTE(review): 'mse' / 'mae' (and 'auto' for max_features below) are
        # believed to have been renamed or removed in recent scikit-learn
        # releases - confirm against the installed version.
        self.__evaluate_validation_curve('criterion', ['mse', 'mae'], metrics)
        self.__evaluate_validation_curve('max_depth', list(np.linspace(10, 110, num=10, dtype=int)), metrics)
        self.__evaluate_validation_curve('min_samples_split', [2, 5, 10], metrics)
        self.__evaluate_validation_curve('min_samples_leaf', [1, 2, 4], metrics)
        self.__evaluate_validation_curve('min_weight_fraction_leaf', list(np.linspace(0.0, 0.49, 5, dtype=float)), metrics)
        self.__evaluate_validation_curve('max_features', ['auto', 'sqrt', 'log2'], metrics)
        self.__evaluate_validation_curve('max_leaf_nodes', list(np.linspace(10, 110, num=11, dtype=int)), metrics)
        self.__evaluate_validation_curve('min_impurity_decrease', list(np.linspace(0.0, 5.0, 20)), metrics)
        self.__evaluate_validation_curve('bootstrap', [True, False], metrics)
        
    def __evaluate_validation_curve(self, param_name, param_range, metrics):
        """
        Computes, draws and shows the validation curve of one hyperparameter.
        
        :param      param_name: Name of the hyperparameter being swept
        :param      param_range: Candidate values for the hyperparameter
        :param      metrics: Dataset columns used as input features
        """
        self.__build_validation_curve(param_name, param_range, metrics)
        self.__build_validation_curve_chart(param_range, param_name)
        self.__display_current_chart()
        
    def __build_validation_curve(self, param_name, param_range, metrics):
        """
        Runs a 10-fold cross-validated sweep of one hyperparameter and stores 
        the resulting training and test scores on the instance.
        
        :param      param_name: Name of the hyperparameter being swept
        :param      param_range: Candidate values for the hyperparameter
        :param      metrics: Dataset columns used as input features
        """
        self.__train_score, self.__test_score = validation_curve(
                self.__regressor,
                X = self.__dataset[metrics].values, 
                y = self.__dataset['PrimePathCoverage'].values, 
                param_name = param_name, 
                param_range = param_range, cv = 10
        )
        
    def __build_validation_curve_chart(self, param_range, param_name):
        """
        Builds the chart for the scores stored by the last sweep.
        
        :param      param_range: Candidate values that were swept
        :param      param_name: Name of the hyperparameter that was swept
        """
        # The candidate values are drawn at evenly spaced positions and the
        # real values are shown as tick labels. A logarithmic axis cannot
        # represent several of the swept ranges (they contain 0.0, booleans
        # or strings).
        x_positions = list(range(len(param_range)))
        
        self.__build_chart_title(param_name)
        self.__build_chart_axis(param_name, param_range, x_positions)
        self.__build_chart_training_score_data(x_positions)
        self.__build_chart_cross_validation_score_data(x_positions)
        # The legend is built last: it collects the labelled lines that are
        # already on the chart, so it must come after the data was plotted.
        self.__build_chart_legend()
        
    def __build_chart_title(self, param_name):
        """
        Sets the chart title.
        
        :param      param_name: Name of the hyperparameter that was swept
        """
        plt.title("Validation Curve - " + param_name)
        
    def __build_chart_axis(self, param_name, param_range, x_positions):
        """
        Sets the axis labels, the score limits and the x tick labels.
        
        :param      param_name: Name of the hyperparameter that was swept
        :param      param_range: Candidate values that were swept
        :param      x_positions: X coordinate of each candidate value
        """
        plt.xlabel(param_name)
        plt.ylabel("Score")
        plt.ylim(0.0, 1.1)
        plt.xticks(
            x_positions, 
            [self.__format_tick_label(value) for value in param_range]
        )
        
    @staticmethod
    def __format_tick_label(value):
        """
        Converts a candidate value into a short tick label.
        
        :param      value: Candidate value of a hyperparameter
        
        :return     Text to be shown on the x axis for this value
        """
        # Floats are shortened so that values produced by np.linspace do not
        # clutter the axis with long decimal expansions.
        if isinstance(value, float):
            return "{:.3g}".format(value)
        
        return str(value)
        
    def __build_chart_legend(self):
        """
        Adds the legend to the current chart.
        """
        plt.legend(loc="best")
        
    def __build_chart_training_score_data(self, x_positions):
        """
        Plots the mean training score and a band of one standard deviation 
        around it.
        
        :param      x_positions: X coordinate of each candidate value
        """
        # axis=1 aggregates the scores stored for each candidate value.
        train_scores_mean = np.mean(self.__train_score, axis=1)
        train_scores_std = np.std(self.__train_score, axis=1)
        
        plt.plot(x_positions, train_scores_mean, label="Training score",
                 color="darkorange", lw=2)
        plt.fill_between(x_positions, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2,
                         color="darkorange", lw=2)
        
    def __build_chart_cross_validation_score_data(self, x_positions):
        """
        Plots the mean cross-validation score and a band of one standard 
        deviation around it.
        
        :param      x_positions: X coordinate of each candidate value
        """
        # axis=1 aggregates the scores stored for each candidate value.
        test_scores_mean = np.mean(self.__test_score, axis=1)
        test_scores_std = np.std(self.__test_score, axis=1)
        
        plt.plot(x_positions, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=2)
        plt.fill_between(x_positions, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2,
                         color="navy", lw=2)
        
    def __display_current_chart(self):
        """
        Shows the chart that was built.
        """
        plt.show()
        
    def evaluate_hyperparam(self, metrics, hyperparam_space):
        """
        Runs a randomized search over a hyperparameter space and displays the 
        best combination found.
        
        :param      metrics: Dataset columns used as input features
        :param      hyperparam_space: Candidate values (or distributions) for 
        each hyperparameter, passed to the search as its parameter distributions
        """
        hyperparam_table = self.__build_hyperparam_table(metrics, hyperparam_space)
        self.__display_hyperparam_table(hyperparam_table)
        
    def __build_hyperparam_table(self, metrics, hyperparam_space):
        """
        Fits a randomized search (100 iterations, 10-fold cross-validation, 
        fixed random state) on the dataset.
        
        :param      metrics: Dataset columns used as input features
        :param      hyperparam_space: Parameter distributions to sample from
        
        :return     Fitted randomized search
        """
        # NOTE(review): the search always uses a fresh RandomForestRegressor
        # and ignores the regressor given to the constructor - confirm this
        # is intentional.
        hyperparam_table = RandomizedSearchCV(
            estimator=RandomForestRegressor(), 
            param_distributions=hyperparam_space,
            n_iter=100,
            cv=10,
            random_state=0
        )

        hyperparam_table.fit(self.__dataset[metrics].values, self.__dataset['PrimePathCoverage'].values)
        
        return hyperparam_table
        
    def __display_hyperparam_table(self, hyperparam_table):
        """
        Displays the best hyperparameter combination found by the search.
        
        :param      hyperparam_table: Fitted randomized search
        """
        display(hyperparam_table.best_params_)