In [None]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
%matplotlib inline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler

In [5]:
class PpcEvaluator:
    """Cross-validate a regressor that predicts ``PrimePathCoverage``.

    The evaluator runs k-fold cross-validation (k = 10) twice: once on the raw
    feature columns and once on the same columns after scaling. For each run
    it reports the error metrics of the best fold (lowest RMSE) side by side.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn style regressor handed to ``cross_validate``.
    dataset : object
        Any object exposing ``get_dataframe()``; the returned frame must
        contain a ``PrimePathCoverage`` column plus the requested metrics.

    Notes
    -----
    NOTE(review): the scaler is fitted on the full feature matrix *before*
    the folds are split, so the held-out folds influence the scaling range of
    the "scaled" run. Wrapping scaler and regressor in a pipeline would avoid
    this, but it changes the type of the returned estimators, so it is left
    as is here.
    """

    # Error-table row label -> key of the (negated) score in cross_validate's
    # result dict. R2 is handled separately because it is not negated.
    _ERROR_ROWS = {
        'Mean Abs Error': 'test_neg_mean_absolute_error',
        'Mean Sqr Error': 'test_neg_mean_squared_error',
        'Mean Sqr Log Error': 'test_neg_mean_squared_log_error',
        'Mean Median Error': 'test_neg_median_absolute_error',
    }

    def __init__(self, regressor, dataset):
        # Stored behind a callable; cross_validate clones the estimator per fold.
        self.__regressor = (lambda: regressor)
        self.__dataset = dataset.get_dataframe()
        self.__scaler = MinMaxScaler()
        self.__k = 10
        self.__error_metrics = [
            'r2', 'max_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'neg_root_mean_squared_error',
            'neg_mean_squared_log_error', 'neg_median_absolute_error'
        ]
        self.__error_metrics_table = None

    def set_scaler(self, scaler):
        """Replace the default ``MinMaxScaler`` with another scaler object."""
        self.__scaler = scaler

    def __scale(self, dados):
        """Fit the scaler on ``dados`` and return the transformed values."""
        return self.__scaler.fit_transform(dados)

    def __cross_validate(self, features, ppc):
        """Run k-fold cross-validation and keep the fitted estimator of each fold."""
        return cross_validate(
            self.__regressor(),
            features,
            ppc,
            cv=self.__k,
            scoring=self.__error_metrics,
            return_estimator=True
        )

    def evaluate(self, metrics, only_scaled=False, display_prediction=False, display_feature_importance=False):
        """Cross-validate on ``metrics`` and return the error table.

        Parameters
        ----------
        metrics : list of str
            Names of the dataset columns used as predictors.
        only_scaled : bool
            When True the unscaled run is skipped and the ``no_scaled``
            column of the result stays empty (NaN).
        display_prediction : bool
            Accepted for compatibility; currently unused.
        display_feature_importance : bool
            When True, also shows a table and bar chart of the per-feature
            importance averaged over the folds of the scaled run.

        Returns
        -------
        pandas.DataFrame
            Rows are the error metrics, columns are ``no_scaled`` and ``scaled``.
        """
        ppc = self.__dataset['PrimePathCoverage'].values
        previsores_content = self.__dataset[metrics].values

        self.__error_metrics_table = pd.DataFrame(
            index=list(self._ERROR_ROWS) + ['R2 Score'],
            columns=['no_scaled', 'scaled']
        )

        if not only_scaled:
            resultados = self.__cross_validate(previsores_content, ppc)
            self.__build_error_metrics_table(previsores_content, ppc, resultados, 'no_scaled')

        # Scale once and reuse the result for the CV run, the table and the chart.
        previsores_scaled = self.__scale(previsores_content)
        resultados_escalonados = self.__cross_validate(previsores_scaled, ppc)
        self.__build_error_metrics_table(previsores_scaled, ppc, resultados_escalonados, 'scaled')

        if display_feature_importance:
            self.__display_feature_importance(
                metrics, resultados_escalonados['estimator'], previsores_scaled, ppc
            )

        return self.__error_metrics_table

    def __display_feature_importance(self, metrics, estimators, previsores_scaled, ppc):
        """Show the fold-averaged feature importance as a table and a bar chart."""
        feature_names = self.__dataset[metrics].columns
        per_fold = pd.DataFrame(columns=list(range(self.__k)), index=feature_names)

        for fold, estimator in enumerate(estimators):
            # NOTE(review): feature_importance_of is not defined in this class;
            # it is expected to be provided by another notebook cell.
            per_fold[fold] = feature_importance_of(
                estimator, previsores_scaled, ppc, feature_names
            )

        # Row-wise mean over the folds, computed in one vectorised step
        # instead of element-wise chained assignment.
        mean_importance = per_fold.astype(float).mean(axis=1)

        fi_final = pd.DataFrame({
            'Metrics': feature_names.values,
            'Importance': mean_importance.values,
        })
        fi_final = fi_final.sort_values(ascending=False, by='Importance')
        display(fi_final)

        plt.figure(figsize=(12,8))
        plt.title("Feature importance - K = " + str(self.__k) + " - Mean")
        plt.axis([0, fi_final['Importance'].values.max(), 0, len(fi_final.index.values)])
        sns.barplot(y=fi_final['Metrics'].values, x=fi_final['Importance'].values, orient='h')
        plt.show()

    def __build_error_metrics_table(self, previsores_content, ppc, cv_results, label):
        """Fill column ``label`` of the error table from the best fold.

        The best fold is the one with the highest ``neg_root_mean_squared_error``
        (i.e. the lowest RMSE). Error scores come back negated from
        scikit-learn, so their magnitude is stored. R2 is stored unchanged:
        it may legitimately be negative, and taking its absolute value would
        make a model that is worse than a constant predictor look good.

        ``previsores_content`` and ``ppc`` are accepted for compatibility and
        are not used.
        """
        idx_best_fold = cv_results['test_neg_root_mean_squared_error'].argmax()

        # .loc writes into the table itself; chained ``table[label][row] = ...``
        # writes into a temporary when pandas Copy-on-Write is active.
        for row, score_key in self._ERROR_ROWS.items():
            self.__error_metrics_table.loc[row, label] = abs(cv_results[score_key][idx_best_fold])
        self.__error_metrics_table.loc['R2 Score', label] = cv_results['test_r2'][idx_best_fold]

    def __mean_feature_importances(self, estimators, metrics):
        """Sum ``feature_importances_`` over the fitted estimators and divide by k."""
        total = np.sum([estimator.feature_importances_ for estimator in estimators], axis=0)
        return pd.Series(total / self.__k, index=metrics, name='mean')

    def __plot_mean_importances(self, metrics, mean_importances):
        """Draw a horizontal bar chart with one bar per metric."""
        plt.figure(figsize=(10,9))
        plt.title("Feature importance - Random forest - K = 10 - Mean")
        plt.barh(metrics, mean_importances.values)
        plt.show()

    def feature_importance(self, metrics):
        """Plot fold-averaged ``feature_importances_`` with and without scaling.

        Requires a regressor whose fitted instances expose
        ``feature_importances_``. The averaged values of the scaled run are
        also printed in descending order.
        """
        ppc = self.__dataset['PrimePathCoverage'].values
        previsores_content = self.__dataset[metrics].values

        resultados = self.__cross_validate(previsores_content, ppc)
        resultados_escalonados = self.__cross_validate(self.__scale(previsores_content), ppc)

        print('-----< SEM ESCALONAMENTO >-----')
        mean_no_scaled = self.__mean_feature_importances(resultados['estimator'], metrics)
        self.__plot_mean_importances(metrics, mean_no_scaled)

        print('\n')

        print('-----< COM ESCALONAMENTO >-----')
        mean_scaled = self.__mean_feature_importances(resultados_escalonados['estimator'], metrics)
        print(mean_scaled.sort_values(ascending=False))
        self.__plot_mean_importances(metrics, mean_scaled)
    

In [16]:
class MlPpcEvaluator(ABC):
    """Template for evaluating one family of regressors on a dataset.

    ``evaluate`` prepares two empty result tables (with and without scaling),
    delegates the actual work to the subclass hook ``evaluate_metrics`` and
    optionally displays the tables through the viewer.

    Parameters
    ----------
    dataset : object
        Dataset wrapper; must expose ``get_dataframe()``.
    tot_seeds : int
        Highest seed to evaluate; the result tables get one column per seed
        in ``0..tot_seeds``.
    auto_display : bool
        When True, ``evaluate`` shows the results once it is done.
    """

    def __init__(self, dataset, tot_seeds=0, auto_display=True):
        self.error_noscaled_metrics_table = None
        self.error_scaled_metrics_table = None
        self.__dataset = dataset
        self.__tot_seeds = tot_seeds
        # MlPpcViewer is expected to be defined in another notebook cell.
        self.__viewer = MlPpcViewer(self)
        self.__auto_display = auto_display

    def evaluate(self, metrics):
        """Reset both result tables, run the subclass evaluation, then display."""
        self.error_noscaled_metrics_table = self.__build_dataframe('Without scaling')
        self.error_scaled_metrics_table = self.__build_dataframe('With scaling')
        self.evaluate_metrics(metrics)

        if not self.__auto_display:
            return
        self.display_results()

    def __build_dataframe(self, caption):
        """Create an empty error table with one column per seed.

        A trailing ``Mean`` column is added only when ``tot_seeds`` is greater
        than one. ``caption`` is not used by this method.
        """
        seed_columns = list(range(self.__tot_seeds + 1))
        if self.__tot_seeds > 1:
            seed_columns = seed_columns + ['Mean']

        error_rows = [
            'Mean Abs Error',
            'Mean Sqr Error',
            'Mean Sqr Log Error',
            'Mean Median Error',
            'R2 Score',
        ]
        table = pd.DataFrame(index=error_rows, columns=seed_columns)
        table.index.name = 'Error Metrics'
        table.columns.name = 'Seed'
        return table

    @abstractmethod
    def evaluate_metrics(self, metrics):
        """Subclass hook: fill the two result tables for ``metrics``."""
        pass

    def display_results(self):
        """Show the scaled table first, then the unscaled one."""
        viewer = self.__viewer
        viewer.display_scaled_evaluation()
        viewer.display_noscaled_evaluation()

    def get_noscaled_metrics_table(self):
        """Return the error table of the run without scaling."""
        return self.error_noscaled_metrics_table

    def get_scaled_metrics_table(self):
        """Return the error table of the run with scaling."""
        return self.error_scaled_metrics_table

    def get_total_seeds(self):
        """Return the ``tot_seeds`` value given at construction."""
        return self.__tot_seeds

    def get_dataset(self):
        """Return the dataset wrapper given at construction."""
        return self.__dataset

    def get_dataframe(self):
        """Return the dataframe held by the dataset wrapper."""
        return self.get_dataset().get_dataframe()

## Linear Regression

In [12]:
class LinearRegressionMlPpcEvaluator(MlPpcEvaluator):
    """Evaluates a linear regression separately for every requested metric.

    Each metric is used as the single predictor of its own model; the result
    tables are overwritten and displayed after every metric.
    """

    def __init__(self, dataset):
        # auto_display is off because evaluate_metrics displays after each metric.
        super(LinearRegressionMlPpcEvaluator, self).__init__(dataset, auto_display=False)

    def evaluate_metrics(self, metrics):
        """Evaluate and display one single-predictor model per entry of ``metrics``."""
        only_ec = self.__is_only_ec(metrics)

        for metric in metrics:
            self.__evaluate_metric(metric, only_ec)
            self.display_results(metric)

    def __is_only_ec(self, metrics):
        """Return True when ``metrics`` is exactly ``['EdgeCoverage']``."""
        return (len(metrics) == 1) and (metrics[0] == 'EdgeCoverage')

    def __evaluate_metric(self, metric, only_ec=False):
        """Cross-validate on ``metric`` alone and store the result in column 0."""
        evaluator = PpcEvaluator(self.__get_regressor(only_ec), self.get_dataset())
        error_metrics = evaluator.evaluate([metric])
        self.error_noscaled_metrics_table[0] = error_metrics['no_scaled']
        self.error_scaled_metrics_table[0] = error_metrics['scaled']

    def __get_regressor(self, only_ec):
        """Build the regressor: no intercept for the EdgeCoverage-only case,
        otherwise a model constrained to positive coefficients."""
        return LinearRegression(fit_intercept=False) if only_ec else LinearRegression(positive=True)

    def display_results(self, metric=None):
        """Display the result tables, preceded by the metric name when given.

        ``metric`` is optional so that the method stays callable with the
        argument-less signature of the base class (as used by
        ``MlPpcEvaluator.evaluate`` when auto-display is on).
        """
        if metric is not None:
            print('Metric:', metric, end='')
        super().display_results()

## SVR

In [9]:
class SvrMlPpcEvaluator(MlPpcEvaluator):
    """Evaluates a support-vector regressor (``epsilon=0.3``) on the dataset."""

    def __init__(self, dataset):
        super().__init__(dataset)

    def evaluate_metrics(self, metrics):
        """Cross-validate the SVR on ``metrics`` and store both runs in column 0."""
        svr_errors = PpcEvaluator(SVR(epsilon=0.3), self.get_dataset()).evaluate(metrics)

        destinations = (
            (self.error_noscaled_metrics_table, 'no_scaled'),
            (self.error_scaled_metrics_table, 'scaled'),
        )
        for result_table, run_label in destinations:
            result_table[0] = svr_errors[run_label]

## Random Forest

In [10]:
class RandomForestMlPpcEvaluator(MlPpcEvaluator):
    """Evaluates a random forest once per seed in ``0..tot_seeds``.

    Column ``i`` of each result table holds the errors obtained with
    ``random_state=i``. When ``tot_seeds`` is greater than one, the ``Mean``
    column is filled with the row-wise average over all seeds.
    """

    def __init__(self, dataset, tot_seeds=0, auto_display=True):
        super().__init__(dataset, tot_seeds, auto_display=auto_display)

    def evaluate_metrics(self, metrics):
        """Cross-validate one forest per seed and store the errors per column."""
        total_seeds = self.get_total_seeds()

        for seed in range(total_seeds + 1):
            evaluator = PpcEvaluator(self.__get_regressor_using_seed(seed), self.get_dataset())
            error_metrics = evaluator.evaluate(metrics)
            self.error_noscaled_metrics_table[seed] = error_metrics['no_scaled']
            self.error_scaled_metrics_table[seed] = error_metrics['scaled']

        # The base class only creates the 'Mean' column when tot_seeds > 1.
        if total_seeds > 1:
            self.__compute_mean_error()

    def __get_regressor_using_seed(self, seed):
        """Return a random forest whose randomness is fixed by ``seed``."""
        return RandomForestRegressor(random_state=seed)

    def __compute_mean_error(self):
        """Fill the ``Mean`` column of both result tables."""
        self.__compute_mean_error_of_dataframe(self.error_noscaled_metrics_table)
        self.__compute_mean_error_of_dataframe(self.error_scaled_metrics_table)

    @staticmethod
    def __compute_mean_error_of_dataframe(dataframe):
        """Set ``dataframe['Mean']`` to the row-wise mean of the seed columns.

        The whole column is assigned in one step. Element-wise chained
        assignment (``dataframe['Mean'][row] = ...``) writes to a temporary
        copy when pandas Copy-on-Write is active, leaving the column empty.
        """
        per_seed = dataframe.drop(columns='Mean').astype(float)
        dataframe['Mean'] = per_seed.mean(axis=1)

In [None]:
class RandomForestFeatureImportance():
    """Fold-averaged random-forest feature importance, with and without scaling.

    Parameters
    ----------
    dataset : object
        Dataset wrapper exposing ``get_dataframe()``; the frame must contain
        a ``PrimePathCoverage`` column.
    k : int
        Number of cross-validation folds.
    random_state : int or None
        Seed forwarded to ``RandomForestRegressor``. The default ``None``
        keeps the previous unseeded behaviour; pass an integer to make the
        reported importances reproducible between runs.
    """

    def __init__(self, dataset, k=10, random_state=None):
        self.__k = k
        self.__random_state = random_state
        self.__dataset = dataset.get_dataframe()
        self.__ppc = self.__dataset['PrimePathCoverage'].values
        self.__independent_variables = None
        self.__error_metrics = [
            'r2', 'max_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'neg_root_mean_squared_error',
            'neg_mean_squared_log_error', 'neg_median_absolute_error'
        ]

    def evaluate(self, metrics):
        """Show importance tables and charts for ``metrics`` (scaled run first)."""
        self.__extract_independent_variables_using_metrics(metrics)
        self.__evaluate_feature_importance_with_scaling(metrics)
        self.__evaluate_feature_importance_without_scaling(metrics)

    def __extract_independent_variables_using_metrics(self, metrics):
        """Cache the predictor columns as a plain array."""
        self.__independent_variables = self.__dataset[metrics].values

    def __evaluate_feature_importance_without_scaling(self, metrics):
        """Compute and display the importance on the raw predictors."""
        importance_dataset = self.__compute_feature_importance_without_scaling(metrics)
        self.__display_feature_importance_table(importance_dataset, 'Without scaling')
        self.__display_feature_importance_chart(importance_dataset, 'Without scaling', metrics)

    def __compute_feature_importance_without_scaling(self, metrics):
        return self.__compute_feature_importance_using_independent_variables(
            metrics,
            self.__independent_variables
        )

    def __compute_feature_importance_using_independent_variables(self, metrics, independent_variables):
        """Cross-validate a forest and average ``feature_importances_`` over folds.

        Returns a frame indexed by ``metrics`` with the columns ``importance``
        (sum over the fitted estimators) and ``mean`` (that sum divided by the
        number of fitted estimators).
        """
        resultados = cross_validate(
                RandomForestRegressor(random_state=self.__random_state),
                independent_variables,
                self.__ppc,
                cv=self.__k,
                scoring=self.__error_metrics,
                return_estimator=True
        )
        estimators = resultados['estimator']

        # One vectorised sum over the folds instead of accumulating frames.
        total_importance = np.sum(
            [estimator.feature_importances_ for estimator in estimators], axis=0
        )
        importance_dataset = pd.DataFrame({'importance': total_importance}, index=metrics)
        importance_dataset['mean'] = importance_dataset['importance'] / len(estimators)

        return importance_dataset

    def __display_feature_importance_table(self, importance_dataset, title):
        """Display the mean importances sorted from most to least important."""
        feature_importance_series = importance_dataset['mean'].sort_values(ascending=False)
        ranking = pd.DataFrame({
            'Metrics': feature_importance_series.index,
            'Feature importance': feature_importance_series.values,
        })
        self.__display_dataframe_using_title(ranking, title)

    def __display_dataframe_using_title(self, dataframe, title):
        """Display ``dataframe`` with ``title`` rendered as a styled caption."""
        styled_dataframe = dataframe.style.set_caption(title).set_table_styles([{
            'selector': 'caption',
            'props': [
                ('color', 'black'),
                ('font-size', '16px')
            ]
        }])
        display(styled_dataframe)

    def __display_feature_importance_chart(self, importance_dataset, title, metrics):
        """Draw a horizontal bar chart of the mean importances (in input order)."""
        plt.figure(figsize=(10,9))
        plt.title("Feature importance - Random forest - K = " + str(self.__k) + " - " + title)
        plt.barh(metrics, importance_dataset['mean'].values)
        plt.show()

    def __evaluate_feature_importance_with_scaling(self, metrics):
        """Compute and display the importance on the scaled predictors."""
        importance_dataset = self.__compute_feature_importance_with_scaling(metrics)
        self.__display_feature_importance_table(importance_dataset, 'With scaling')
        self.__display_feature_importance_chart(importance_dataset, 'With scaling', metrics)

    def __compute_feature_importance_with_scaling(self, metrics):
        return self.__compute_feature_importance_using_independent_variables(
            metrics,
            self.__scale(self.__independent_variables)
        )

    def __scale(self, data):
        """Return ``data`` transformed by a freshly fitted ``MinMaxScaler``."""
        scaler = MinMaxScaler()
        return scaler.fit_transform(data)

## K-Neighbors

In [15]:
class KNeighborsMlPpcEvaluator(MlPpcEvaluator):
    """Evaluates a k-nearest-neighbours regressor with default settings."""

    def __init__(self, dataset):
        # A single run: only seed column 0 exists in the result tables.
        super().__init__(dataset, 0)

    def evaluate_metrics(self, metrics):
        """Cross-validate the regressor on ``metrics`` and fill column 0."""
        knn_regressor = self.__get_regressor()
        knn_errors = PpcEvaluator(knn_regressor, self.get_dataset()).evaluate(metrics)

        unscaled_errors = knn_errors['no_scaled']
        scaled_errors = knn_errors['scaled']
        self.error_noscaled_metrics_table[0] = unscaled_errors
        self.error_scaled_metrics_table[0] = scaled_errors

    def __get_regressor(self):
        """Return a fresh ``KNeighborsRegressor``."""
        return KNeighborsRegressor()

## Neural network

In [None]:
class NeuralNetworkMlPpcEvaluator(MlPpcEvaluator):
    """Evaluates a multi-layer perceptron once per seed in ``0..tot_seeds``.

    Column ``i`` of each result table holds the errors obtained with
    ``random_state=i``. When ``tot_seeds`` is greater than one, the ``Mean``
    column is filled with the row-wise average over all seeds.

    Parameters
    ----------
    dataset : object
        Dataset wrapper passed on to the base class.
    tot_seeds : int
        Highest seed to evaluate.
    epsilon : float
        Forwarded unchanged to ``MLPRegressor(epsilon=...)``.
        NOTE(review): in scikit-learn this is the numerical-stability term of
        the Adam solver (library default 1e-8), not an error tolerance like
        SVR's epsilon; confirm that 0.5 is really intended.
    """

    def __init__(self, dataset, tot_seeds=0, epsilon=0.5):
        super().__init__(dataset, tot_seeds)
        self.__epsilon = epsilon

    def evaluate_metrics(self, metrics):
        """Cross-validate one network per seed and store the errors per column."""
        total_seeds = self.get_total_seeds()

        for seed in range(total_seeds + 1):
            evaluator = PpcEvaluator(self.__get_regressor_using_seed(seed), self.get_dataset())
            error_metrics = evaluator.evaluate(metrics)
            self.error_noscaled_metrics_table[seed] = error_metrics['no_scaled']
            self.error_scaled_metrics_table[seed] = error_metrics['scaled']

        # The base class only creates the 'Mean' column when tot_seeds > 1.
        if total_seeds > 1:
            self.__compute_mean_error()

    def __get_regressor_using_seed(self, seed):
        """Return an ``MLPRegressor`` whose randomness is fixed by ``seed``."""
        return MLPRegressor(epsilon=self.__epsilon, random_state=seed)

    def __compute_mean_error(self):
        """Fill the ``Mean`` column of both result tables."""
        self.__compute_mean_error_of_dataframe(self.error_noscaled_metrics_table)
        self.__compute_mean_error_of_dataframe(self.error_scaled_metrics_table)

    @staticmethod
    def __compute_mean_error_of_dataframe(dataframe):
        """Set ``dataframe['Mean']`` to the row-wise mean of the seed columns.

        The whole column is assigned in one step. Element-wise chained
        assignment (``dataframe['Mean'][row] = ...``) writes to a temporary
        copy when pandas Copy-on-Write is active, leaving the column empty.
        """
        per_seed = dataframe.drop(columns='Mean').astype(float)
        dataframe['Mean'] = per_seed.mean(axis=1)

## Custom

In [None]:
class CustomPpcEvaluator(MlPpcEvaluator):
    """Evaluates a caller-supplied regressor with the standard procedure."""

    def __init__(self, dataset, regressor):
        super().__init__(dataset)
        self.__regressor = regressor

    def evaluate_metrics(self, metrics):
        """Cross-validate the supplied regressor on ``metrics`` and fill column 0."""
        custom_errors = PpcEvaluator(self.__regressor, self.get_dataset()).evaluate(metrics)

        self.error_noscaled_metrics_table[0] = custom_errors['no_scaled']
        self.error_scaled_metrics_table[0] = custom_errors['scaled']