In [502]:
class ExternalValidation:
    """Evaluate random-forest PrimePathCoverage predictors on an external dataset.

    Random forests are trained with k-fold cross-validation on the internal
    dataset (one fitted estimator per fold is kept) and every fold estimator
    is then used to predict ``PrimePathCoverage`` (PPC) for the external
    dataset. The evaluation is run twice: once with min-max scaled
    predictors and once with the raw predictors.

    Both dataset arguments must expose ``get_dataframe()`` returning a pandas
    DataFrame containing at least the columns ``PrimePathCoverage``,
    ``Cyclomatic`` and ``EdgeCoverage`` plus the predictor columns passed to
    :meth:`evaluate`.

    The class relies on names provided by the surrounding notebook:
    ``MinMaxScaler``, ``cross_validate``, ``RandomForestRegressor``, the
    scikit-learn metric functions, ``metricas_erro`` and IPython's ``display``.
    """

    # Column labels of the per-fold and mean error tables (order matters).
    _METRIC_COLUMNS = [
        'Mean Abs Error',
        'Mean Sqr Error',
        'Mean Sqr Log Error',
        'Mean Median Error',
        'R2 Score',
    ]

    # Column labels of the per-sample prediction table.
    _PREDICT_COLUMNS = ['Cyclomatic', 'EC correct', 'PPC predict', 'PPC correct', 'Error']

    def __init__(self, dataset_internal_validation, dataset_external_validation, k=10):
        """Store both dataframes and the number of cross-validation folds.

        Args:
            dataset_internal_validation: object whose ``get_dataframe()``
                returns the training (internal) DataFrame.
            dataset_external_validation: object whose ``get_dataframe()``
                returns the DataFrame the trained estimators are tested on.
            k: number of cross-validation folds (one estimator per fold).
        """
        self.__dataset_internal_validation = dataset_internal_validation.get_dataframe()
        self.__dataset_external_validation = dataset_external_validation.get_dataframe()
        self.__k = k
        # Ground-truth PPC of the external dataset, compared against predictions.
        self.__ppc = self.__dataset_external_validation['PrimePathCoverage'].values
        self.__predict_error_table_scaled = None
        self.__predict_error_table_noscaled = None
        self.__mean_error_scaled_table = None
        self.__mean_error_noscaled_table = None

    def __scale(self, data):
        """Return ``data`` min-max scaled by a scaler fitted on ``data`` itself.

        NOTE(review): a fresh scaler is fitted on every call, so the internal
        and the external predictors are scaled independently of each other.
        Confirm this is intended; reusing the scaler fitted on the training
        data is the usual practice.
        """
        scaler = MinMaxScaler()
        return scaler.fit_transform(data)

    def evaluate(self, metrics):
        """Run the scaled and unscaled evaluations and display all tables.

        Args:
            metrics: list of column names used as independent variables
                (predictors) in both dataframes.
        """
        self.__evaluate_scaled(metrics)
        self.__evaluate_noscaled(metrics)
        self.display_mean_error_table_scaled()
        self.display_mean_error_table_noscaled()

    def __evaluate_scaled(self, independent_variables):
        """Evaluate with min-max scaled predictors and store the result tables."""
        independent_variables_values = self.__scale(
            self.__dataset_external_validation[independent_variables].values
        )
        estimators = self.__get_cross_validate_estimators_scaled(independent_variables)
        fold_errors = self.__build_predict_error_table(estimators, independent_variables_values)
        self.__mean_error_scaled_table = self.__build_mean_error_table(fold_errors)
        self.__predict_error_table_scaled = self.__build_full_predict_error_table(
            estimators, independent_variables_values
        )

    def __evaluate_noscaled(self, independent_variables):
        """Evaluate with the raw predictors and store the result tables."""
        independent_variables_values = self.__dataset_external_validation[independent_variables].values
        estimators = self.__get_cross_validate_estimators_noscaled(independent_variables)
        fold_errors = self.__build_predict_error_table(estimators, independent_variables_values)
        self.__mean_error_noscaled_table = self.__build_mean_error_table(fold_errors)
        self.__predict_error_table_noscaled = self.__build_full_predict_error_table(
            estimators, independent_variables_values
        )

    def __build_predict_error_table(self, estimators, previsores_content):
        """Return one row of error metrics per estimator.

        The table has as many rows as there are estimators (it used to be
        fixed at 10 rows, which broke for ``k > 10``).

        Args:
            estimators: fitted estimators exposing ``predict``.
            previsores_content: predictor matrix of the external dataset.

        Returns:
            DataFrame with the columns listed in ``_METRIC_COLUMNS``.
        """
        rows = []
        for estimator in estimators:
            ppc_predict = estimator.predict(previsores_content)
            ppc_correct = self.__ppc[:len(ppc_predict)]
            rows.append([
                mean_absolute_error(ppc_correct, ppc_predict),
                mean_squared_error(ppc_correct, ppc_predict),
                mean_squared_log_error(ppc_correct, ppc_predict),
                median_absolute_error(ppc_correct, ppc_predict),
                r2_score(ppc_correct, ppc_predict),
            ])

        return pd.DataFrame(rows, columns=self._METRIC_COLUMNS)

    def __build_full_predict_error_table(self, estimators, previsores_content):
        """Return the per-sample predictions of the best estimator.

        The best estimator is the one with the lowest mean absolute error on
        the external dataset; on ties the earlier estimator wins.

        Args:
            estimators: fitted estimators exposing ``predict``.
            previsores_content: predictor matrix of the external dataset.

        Returns:
            DataFrame with the columns in ``_PREDICT_COLUMNS``, sorted by
            ``Error`` in descending order. Empty when ``estimators`` is empty.
        """
        external = self.__dataset_external_validation
        cyclomatic = external['Cyclomatic'].values
        ec = external['EdgeCoverage'].values

        best_table = None
        best_error = None

        for estimator in estimators:
            ppc_predict = np.asarray(estimator.predict(previsores_content))
            total = len(ppc_predict)
            ppc_correct = self.__ppc[:total]
            current_table = pd.DataFrame(
                {
                    'Cyclomatic': cyclomatic[:total],
                    'EC correct': ec[:total],
                    'PPC predict': ppc_predict,
                    'PPC correct': ppc_correct,
                    'Error': np.abs(ppc_correct - ppc_predict),
                },
                columns=self._PREDICT_COLUMNS,
            )
            current_error = current_table['Error'].mean()

            # A NaN best error is always replaced, mirroring the first-iteration case.
            if best_error is None or np.isnan(best_error) or current_error < best_error:
                best_table = current_table
                best_error = current_error

        if best_table is None:
            best_table = pd.DataFrame(columns=self._PREDICT_COLUMNS)

        return best_table.sort_values(by='Error', ascending=False)

    def __build_mean_error_table(self, predict_table):
        """Return a one-row table with the mean of each metric column."""
        means = {column: predict_table[column].mean() for column in self._METRIC_COLUMNS}
        return pd.DataFrame(means, index=[0], columns=self._METRIC_COLUMNS)

    def __cross_validate_estimators(self, previsores, scale):
        """Cross-validate a random forest on the internal dataset.

        Args:
            previsores: list of predictor column names.
            scale: when true, the predictors are min-max scaled first.

        Returns:
            The fitted estimators, one per fold (``self.__k`` folds).
        """
        internal = self.__dataset_internal_validation
        ppc = internal['PrimePathCoverage'].values
        previsores_content = internal[previsores].values
        if scale:
            previsores_content = self.__scale(previsores_content)

        resultados = cross_validate(
            RandomForestRegressor(random_state=0),
            previsores_content,
            ppc,
            cv=self.__k,  # was a bare global `k` / a hard-coded 10
            scoring=metricas_erro,
            return_estimator=True,
        )

        return resultados['estimator']

    def __get_cross_validate_estimators_scaled(self, previsores):
        """Return the fold estimators trained on min-max scaled predictors."""
        return self.__cross_validate_estimators(previsores, scale=True)

    def __get_cross_validate_estimators_noscaled(self, previsores):
        """Return the fold estimators trained on the raw predictors."""
        return self.__cross_validate_estimators(previsores, scale=False)

    def display_mean_error_table_scaled(self):
        """Display every result table of the scaled evaluation."""
        self.__display_dataframe_using_title(self.__mean_error_scaled_table, 'With scaling')
        self.__display_error_mean_table(self.__predict_error_table_scaled, 'With scaling')
        self.__display_interval_error_table(self.__predict_error_table_scaled, 'With scaling')

    def __display_dataframe_using_title(self, dataframe, title):
        """Display ``dataframe`` with ``title`` rendered as a styled caption."""
        styled_dataframe = dataframe.style.set_caption(title).set_table_styles([{
            'selector': 'caption',
            'props': [
                ('color', 'black'),
                ('font-size', '16px')
            ]
        }])
        display(styled_dataframe)

    def display_mean_error_table_noscaled(self):
        """Display every result table of the unscaled evaluation."""
        self.__display_dataframe_using_title(self.__mean_error_noscaled_table, 'Without scaling')
        self.__display_error_mean_table(self.__predict_error_table_noscaled, 'Without scaling')
        self.__display_interval_error_table(self.__predict_error_table_noscaled, 'Without scaling')

    @staticmethod
    def __summarize_error_by_cyclomatic(dataset):
        """Return mean error and sample count per cyclomatic-complexity bin.

        The bins are disjoint: the fourth bin used to be ``[8;10]``, which
        counted samples with ``Cyclomatic == 8`` in two bins.
        """
        cyclomatic = dataset['Cyclomatic']
        bins = [
            ('[0;2]', (cyclomatic >= 0) & (cyclomatic <= 2)),
            ('[3;5]', (cyclomatic >= 3) & (cyclomatic <= 5)),
            ('[6;8]', (cyclomatic >= 6) & (cyclomatic <= 8)),
            ('[9;10]', (cyclomatic >= 9) & (cyclomatic <= 10)),
            ('> 10', cyclomatic > 10),
        ]
        rows = [
            [label, dataset.loc[mask, 'Error'].mean(), int(mask.sum())]
            for label, mask in bins
        ]
        return pd.DataFrame(rows, columns=['Cyclomatic', 'Average error', 'Total'])

    def __display_error_mean_table(self, dataset, title):
        """Display the per-cyclomatic-bin error summary of ``dataset``."""
        error_mean_table = self.__summarize_error_by_cyclomatic(dataset)
        self.__display_dataframe_using_title(error_mean_table, title)

    @staticmethod
    def __summarize_error_intervals(dataset):
        """Return how many samples fall into each absolute-error interval."""
        error = dataset['Error']
        intervals = [
            ('Error == 0', error == 0),
            ('0 < Error < 0.3', (error > 0) & (error < 0.3)),
            ('0.3 <= Error < 0.5', (error >= 0.3) & (error < 0.5)),
            ('0.5 <= Error < 0.7', (error >= 0.5) & (error < 0.7)),
            ('Error >= 0.7', error >= 0.7),
        ]
        rows = [[label, int(mask.sum())] for label, mask in intervals]
        return pd.DataFrame(rows, columns=['Interval', 'Total'])

    def __display_interval_error_table(self, dataset, title):
        """Display the first rows of ``dataset`` and its error-interval counts.

        Side effect: sets the global pandas option ``display.max_rows``.
        """
        MAX_ROWS = 50
        pd.set_option('display.max_rows', MAX_ROWS)
        error_table = self.__summarize_error_intervals(dataset)

        self.__display_dataframe_using_title(dataset.iloc[0:MAX_ROWS, :], title)
        self.__display_dataframe_using_title(error_table, title)


In [1245]:
# Disabled cell: everything below is wrapped in a string literal, so none of
# these calls execute. They call module-level functions
# `get_cross_validate_estimators_noscaled` / `_scaled`, which are not defined
# in the visible part of this file.
# NOTE(review): presumably superseded by the ExternalValidation class - confirm
# before deleting.
'''
estimators_scm_best_noscaled = get_cross_validate_estimators_noscaled(['CountPath', 'CountOutput.1', 'CountInput', 'CountStmt', 'CountStmtDecl.1', 'CountOutput', 'CountStmt.1'])
estimators_scm_best_scaled = get_cross_validate_estimators_scaled(['CountPath', 'CountOutput.1', 'CountInput', 'CountStmt', 'CountStmtDecl.1', 'CountOutput', 'CountStmt.1'])
estimators_scm_all_noscaled = get_cross_validate_estimators_noscaled(previsores_sem_ec)
estimators_scm_all_scaled = get_cross_validate_estimators_scaled(previsores_sem_ec)

estimators_scmec_best_noscaled = get_cross_validate_estimators_noscaled(['MaxNesting', 'CountInput', 'CountStmt', 'CountOutput', 'Cyclomatic', 'Knots', 'EdgeCoverage'])
estimators_scmec_best_scaled = get_cross_validate_estimators_scaled(['MaxNesting', 'CountInput', 'CountStmt', 'CountOutput', 'Cyclomatic', 'Knots', 'EdgeCoverage'])
estimators_scmec_all_noscaled = get_cross_validate_estimators_noscaled(previsores)
estimators_scmec_all_scaled = get_cross_validate_estimators_scaled(previsores)

estimators_ec_noscaled = get_cross_validate_estimators_noscaled(['EdgeCoverage'])
estimators_ec_scaled = get_cross_validate_estimators_scaled(['EdgeCoverage'])
'''

In [33]:
# Disabled cell: the function below lives inside a string literal and is never
# defined at runtime. It is a function-style version of the per-estimator
# metric computation: for each estimator it predicts PPC for `dataset_externo`,
# fills one row of five error metrics, and returns a one-row table of their
# means.
# NOTE(review): looks like the predecessor of
# ExternalValidation.__build_predict_error_table / __build_mean_error_table -
# confirm before deleting.
'''
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score

def evaluate_rf_external_validation(estimadores_dataset_treinado, dataset_externo, previsores, 
                                    display_prediction = False, scale=False):
    ppc = dataset_externo['PrimePathCoverage'].values
    previsores_content = dataset_externo[previsores].values
    
    if scale:
        previsores_content = escalonador(previsores_content)
    
    results = pd.DataFrame(
            index=[i for i in range(10)], 
            columns=['Mean Abs Error', 'Mean Sqr Error', 'Mean Sqr Log Error', 'Mean Median Error', 'R2 Score']
    )
    results_mean = pd.DataFrame(
            index=[0], 
            columns=['Mean Abs Error', 'Mean Sqr Error', 'Mean Sqr Log Error', 'Mean Median Error', 'R2 Score']
    )
    
    k = 0
    for estimator in estimadores_dataset_treinado:
        ppc_predict = estimator.predict(previsores_content)
        predict_table = pd.DataFrame(index=[i for i in range(len(ppc_predict))],columns=['PPC correct', 'PPC predict'])

        for i in range(len(ppc_predict)):
            predict_table['PPC correct'][i] = ppc[i]
            predict_table['PPC predict'][i] = ppc_predict[i]
        
        results.iloc[k,0] = mean_absolute_error(predict_table['PPC correct'].values, predict_table['PPC predict'].values)
        results.iloc[k,1] = mean_squared_error(predict_table['PPC correct'].values, predict_table['PPC predict'].values)
        results.iloc[k,2] = mean_squared_log_error(predict_table['PPC correct'].values, predict_table['PPC predict'].values)
        results.iloc[k,3] = median_absolute_error(predict_table['PPC correct'].values, predict_table['PPC predict'].values)
        results.iloc[k,4] = r2_score(predict_table['PPC correct'].values, predict_table['PPC predict'].values)
    
        k += 1
        
    results_mean['Mean Abs Error'] = results['Mean Abs Error'].mean()
    results_mean['Mean Sqr Error'] = results['Mean Sqr Error'].mean()
    results_mean['Mean Sqr Log Error'] = results['Mean Sqr Log Error'].mean()
    results_mean['Mean Median Error'] = results['Mean Median Error'].mean()
    results_mean['R2 Score'] = results['R2 Score'].mean()
    
    return results_mean
'''

In [747]:
# Disabled cell: these five wrapper functions live inside a string literal and
# are never defined at runtime. Each one calls
# `evaluate_rf_external_validation` twice (scale=True and scale=False) for one
# fixed predictor set and displays both result tables. The display banners are
# Portuguese: "SEM ESCALONAMENTO" = without scaling, "COM ESCALONAMENTO" = with
# scaling.
# NOTE(review): the function names are spelled "evalutate"; keep that spelling
# in mind if this code is ever re-enabled, since callers would need to match.
'''
def evalutate_rf_external_validation_scm_best(dataset_validation):
    previsores = ['CountPath', 'CountOutput.1', 'CountInput', 'CountStmt', 'CountStmtDecl.1', 'CountOutput', 'CountStmt.1']
    results_scaled = evaluate_rf_external_validation(
        estimators_scm_best_scaled, dataset_validation, previsores, scale=True
    )
    results_noscaled = evaluate_rf_external_validation(estimators_scm_best_noscaled, dataset_validation, previsores, scale=False)
    
    display('-----< SEM ESCALONAMENTO >-----')
    display(results_noscaled)
    display('-----< COM ESCALONAMENTO >-----')
    display(results_scaled)
    
def evalutate_rf_external_validation_scm_all(dataset_validation):
    previsores = previsores_sem_ec
    results_scaled = evaluate_rf_external_validation(estimators_scm_all_scaled, dataset_validation, previsores, scale=True)
    results_noscaled = evaluate_rf_external_validation(estimators_scm_all_noscaled, dataset_validation, previsores, scale=False)
    
    display('-----< SEM ESCALONAMENTO >-----')
    display(results_noscaled)
    display('-----< COM ESCALONAMENTO >-----')
    display(results_scaled)
    
def evalutate_rf_external_validation_scmec_best(dataset_validation):
    previsores = ['MaxNesting', 'CountInput', 'CountStmt', 'CountOutput', 'Cyclomatic', 'Knots', 'EdgeCoverage']
    results_scaled = evaluate_rf_external_validation(estimators_scmec_best_scaled, dataset_validation, previsores, scale=True)
    results_noscaled = evaluate_rf_external_validation(estimators_scmec_best_noscaled, dataset_validation, previsores, scale=False)
    
    display('-----< SEM ESCALONAMENTO >-----')
    display(results_noscaled)
    display('-----< COM ESCALONAMENTO >-----')
    display(results_scaled)
    
def evalutate_rf_external_validation_scmec_all(dataset_validation):
    results_scaled = evaluate_rf_external_validation(estimators_scmec_all_scaled, dataset_validation, previsores, scale=True)
    results_noscaled = evaluate_rf_external_validation(estimators_scmec_all_noscaled, dataset_validation, previsores, scale=False)
    
    display('-----< SEM ESCALONAMENTO >-----')
    display(results_noscaled)
    display('-----< COM ESCALONAMENTO >-----')
    display(results_scaled)
    
def evalutate_rf_external_validation_ec(dataset_validation):
    previsores = ['EdgeCoverage']
    results_scaled = evaluate_rf_external_validation(estimators_ec_scaled, dataset_validation, previsores, scale=True)
    results_noscaled = evaluate_rf_external_validation(estimators_ec_noscaled, dataset_validation, previsores, scale=False)
    
    display('-----< SEM ESCALONAMENTO >-----')
    display(results_noscaled)
    display('-----< COM ESCALONAMENTO >-----')
    display(results_scaled)
'''

In [None]:
# Disabled call: wrapped in a string literal, so the function is not invoked.
'''evalutate_rf_external_validation_scm_best(dataset)'''