# Imports

In [None]:
import pandas as pd

# RandomForestFeatureImportance

In [None]:
class RandomForestFeatureImportance:
    """Rank metrics by cross-validated random-forest feature importance.

    The regression target is the 'PrimePathCoverage' column of the dataset.
    For a chosen list of metric columns, a ``RandomForestRegressor`` is
    cross-validated and the ``feature_importances_`` of the per-fold
    estimators are averaged. The computation is done twice: once on
    min-max scaled inputs and once on the raw inputs. Each run displays a
    table and a horizontal bar chart and stores the ranked result.
    """

    def __init__(self, dataset, k=10):
        """Store the data frame, the target column and the fold setting.

        Args:
            dataset: object exposing ``get_dataframe()``; the returned frame
                must contain a 'PrimePathCoverage' column.
            k: value forwarded as ``cv`` to ``cross_validate`` and shown in
                the chart title.
        """
        self.__k = k
        self.__dataset = dataset.get_dataframe()
        self.__ppc = self.__dataset['PrimePathCoverage'].values
        self.__independent_variables = None
        # Scorers passed to cross_validate; the resulting scores are not
        # read by this class, only the fitted estimators are.
        self.__error_metrics = [
            'r2', 'max_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'neg_root_mean_squared_error',
            'neg_mean_squared_log_error', 'neg_median_absolute_error'
        ]
        self.__feature_importance_with_scaling = None
        self.__feature_importance_without_scaling = None

    def evaluate(self, metrics):
        """Compute, display and store feature importance for ``metrics``.

        Args:
            metrics: list of column names of the dataset to use as
                independent variables.
        """
        self.__extract_independent_variables_using_metrics(metrics)
        self.__evaluate_feature_importance_with_scaling(metrics)
        self.__evaluate_feature_importance_without_scaling(metrics)

    def __extract_independent_variables_using_metrics(self, metrics):
        """Cache the values of the selected metric columns."""
        self.__independent_variables = self.__dataset[metrics].values

    def __evaluate_feature_importance_without_scaling(self, metrics):
        """Rank, display and store importance computed on the raw inputs."""
        importance_dataset = self.__compute_feature_importance_without_scaling(metrics)
        importance_dataset = importance_dataset['mean'].sort_values(ascending=False)

        self.__display_feature_importance_table(importance_dataset, 'Without scaling')
        self.__display_feature_importance_chart(importance_dataset, 'Without scaling', metrics)
        self.__feature_importance_without_scaling = importance_dataset

    def __compute_feature_importance_without_scaling(self, metrics):
        """Return the importance table for the unscaled inputs."""
        return self.__compute_feature_importance_using_independent_variables(
            metrics,
            self.__independent_variables
        )

    def __compute_feature_importance_using_independent_variables(self, metrics, independent_variables):
        """Cross-validate a random forest and average its importances.

        Args:
            metrics: names of the features, one per column of
                ``independent_variables``.
            independent_variables: 2-D array of feature values.

        Returns:
            DataFrame indexed by ``metrics`` with an 'importance' column
            (sum over folds) and a 'mean' column (average over folds).
        """
        cv_results = cross_validate(
            RandomForestRegressor(),
            independent_variables,
            self.__ppc,
            cv=self.__k,
            scoring=self.__error_metrics,
            return_estimator=True
        )
        fold_importances = [
            estimator.feature_importances_
            for estimator in cv_results['estimator']
        ]
        return self.__average_fold_importances(fold_importances, metrics)

    @staticmethod
    def __average_fold_importances(fold_importances, metrics):
        """Sum and average per-fold importance vectors.

        Args:
            fold_importances: list with one importance vector per fold,
                each ordered like ``metrics``.
            metrics: feature names used as the index of the result.

        Returns:
            DataFrame indexed by ``metrics`` with columns 'importance'
            (sum over folds) and 'mean' (sum divided by the fold count).
        """
        importance_dataset = pd.DataFrame(
            [0.0] * len(metrics),
            index=metrics,
            columns=['importance']
        )
        for importances in fold_importances:
            importance_dataset += pd.DataFrame(
                importances,
                index=metrics,
                columns=['importance']
            )

        # Divide by the number of folds actually produced rather than by
        # the configured k, so the mean stays correct whatever form the
        # cv argument takes.
        importance_dataset['mean'] = (
            importance_dataset['importance'] / len(fold_importances)
        )
        return importance_dataset

    def __display_feature_importance_table(self, importance_dataset, title):
        """Display the ranked importances as a captioned two-column table."""
        d = pd.DataFrame(columns=['Metrics', 'Feature importance'])
        d['Metrics'] = importance_dataset.index
        d['Feature importance'] = importance_dataset.values
        self.__display_dataframe_using_title(d, title)

    def __display_dataframe_using_title(self, dataframe, title):
        """Display ``dataframe`` with ``title`` as a styled caption."""
        styled_dataframe = dataframe.style.set_caption(title).set_table_styles([{
            'selector': 'caption',
            'props': [
                ('color', 'black'),
                ('font-size', '16px')
            ]
        }])
        display(styled_dataframe)

    @staticmethod
    def __chart_labels_and_values(importance_dataset):
        """Return matching (labels, values) lists for the bar chart.

        Labels come from the series' own index so that each bar keeps the
        name of the metric its value belongs to, whatever the ordering of
        the series.
        """
        return importance_dataset.index.tolist(), importance_dataset.tolist()

    def __display_feature_importance_chart(self, importance_dataset, title, metrics):
        """Plot the importances as a horizontal bar chart.

        Args:
            importance_dataset: Series of importances indexed by metric.
            title: suffix appended to the chart title.
            metrics: kept for signature compatibility; labels are read from
                ``importance_dataset`` because the series has been sorted
                and no longer follows the order of ``metrics``.
        """
        labels, values = self.__chart_labels_and_values(importance_dataset)
        plt.figure(figsize=(10,9))
        plt.title("Feature importance - Random forest - K = " + str(self.__k) + " - " + title)
        plt.barh(labels, values)
        plt.show()

    def __evaluate_feature_importance_with_scaling(self, metrics):
        """Rank, display and store importance computed on scaled inputs."""
        importance_dataset = self.__compute_feature_importance_with_scaling(metrics)
        importance_dataset = importance_dataset['mean'].sort_values(ascending=False)

        self.__display_feature_importance_table(importance_dataset, 'With scaling')
        self.__display_feature_importance_chart(importance_dataset, 'With scaling', metrics)
        self.__feature_importance_with_scaling = importance_dataset

    def __compute_feature_importance_with_scaling(self, metrics):
        """Return the importance table for the min-max scaled inputs."""
        return self.__compute_feature_importance_using_independent_variables(
            metrics,
            self.__scale(self.__independent_variables)
        )

    def __scale(self, data):
        """Return ``data`` transformed by a freshly fitted MinMaxScaler."""
        scaler = MinMaxScaler()
        return scaler.fit_transform(data)

    def get_feature_importance_with_scaling(self):
        """Return the ranked importances from the scaled run, or None if
        ``evaluate`` has not been called."""
        return self.__feature_importance_with_scaling

    def get_feature_importance_without_scaling(self):
        """Return the ranked importances from the unscaled run, or None if
        ``evaluate`` has not been called."""
        return self.__feature_importance_without_scaling