# 1 - Software Efort Estimation Using Stacked Ensemble Technique and Hybrid Principal Component Regression and Multivariate Adaptive Regression Splines

# 2 - Estimating Software Development Efforts Using a Random Forest-Based Stacked Ensemble Approach

In [5]:
import os
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils.validation import check_X_y
import numpy as np

# Função para carregar datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Diretório '{directory}' não encontrado.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        try:
            if ext == '.txt':
                with open(filepath, 'r') as f:
                    first_line = f.readline()
                    delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
                datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
                print(f"{filename}: Delimitador detectado - '{delimiter}'")
            else:
                print(f"Formato não suportado: {file}")
        except Exception as e:
            print(f"Erro ao carregar {file}: {e}")

    print(f"{len(datasets)} arquivos carregados com sucesso de {directory}.")
    return datasets

# Função para treinar e testar modelos de regressão
def test_regression_models(data, output_directory, n_splits=5, n_runs=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
        'RandomForest': RandomForestRegressor(),
        'GradientBoosting': GradientBoostingRegressor()
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        'Lasso': {'alpha': [0.1, 0.5, 1]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
        'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
    }

    results = {}

    for name, df in data.items():
        df = df.dropna()

        if df.shape[0] < 2 or df.shape[1] < 2:
            print(f"{name}: Dataset insuficiente para treino. Tamanho: {df.shape}")
            continue

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        try:
            X, y = check_X_y(X, y)
        except ValueError as e:
            print(f"{name}: Erro de validação dos dados - {e}")
            continue

        metrics_agg = {
            'mae': [],
            'rmse': [],
            'r2': []
        }

        for _ in range(n_runs):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=np.random.randint(0, 10000))

            for model_name, model in models.items():
                metrics = {
                    'mae': [],
                    'rmse': [],
                    'r2': []
                }

                grid = GridSearchCV(model, param_grid[model_name], cv=kf, scoring='neg_mean_squared_error', error_score='raise')

                try:
                    grid.fit(X, y)
                    best_model = grid.best_estimator_

                    # Previsões com validação cruzada
                    y_pred = cross_val_predict(best_model, X, y, cv=kf)

                    # Cálculo das métricas
                    metrics['mae'].append(mean_absolute_error(y, y_pred))
                    metrics['rmse'].append(np.sqrt(mean_squared_error(y, y_pred)))
                    metrics['r2'].append(r2_score(y, y_pred))

                except Exception as e:
                    print(f"{name} - {model_name}: Falha no treinamento - {e}")
                    continue

                # Agregar métricas
                metrics_agg['mae'].append(np.mean(metrics['mae']))
                metrics_agg['rmse'].append(np.mean(metrics['rmse']))
                metrics_agg['r2'].append(np.mean(metrics['r2']))

                results[f'{name}_{model_name}'] = {
                    'mae': np.mean(metrics_agg['mae']),
                    'rmse': np.mean(metrics_agg['rmse']),
                    'r2': np.mean(metrics_agg['r2']),
                    'best_params': grid.best_params_
                }

                print(f"{name} - {model_name}: MAE = {np.mean(metrics_agg['mae']):.4f}, RMSE = {np.mean(metrics_agg['rmse']):.4f}, R² = {np.mean(metrics_agg['r2']):.4f}")

    results_df = pd.DataFrame.from_dict(results, orient='index')
    output_path = os.path.join(output_directory, 'artigo2-abordagem2.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Resultados salvos em '{output_path}'.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")


# Processamento de múltiplos diretórios
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'  # Especifique a pasta de saída

directories = [
    r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\6-MIMIC-50',
]

for directory in directories:
    data = load_datasets(directory)
    if data:
        test_regression_models(data, output_directory)


tratamento_china: Delimitador detectado - ','
tratamento_cocomo81: Delimitador detectado - ','
tratamento_desharnais: Delimitador detectado - ','
tratamento_maxwell: Delimitador detectado - ','
4 arquivos carregados com sucesso de C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\6-MIMIC-50.
tratamento_china - LinearRegression: MAE = 0.0007, RMSE = 0.0010, R² = 0.9963
tratamento_china - Ridge: MAE = 0.0017, RMSE = 0.0029, R² = 0.9550
tratamento_china - Lasso: MAE = 0.0045, RMSE = 0.0074, R² = 0.6351
tratamento_china - RandomForest: MAE = 0.0037, RMSE = 0.0064, R² = 0.7162
tratamento_china - GradientBoosting: MAE = 0.0032, RMSE = 0.0056, R² = 0.7676
tratamento_china - LinearRegression: MAE = 0.0028, RMSE = 0.0049, R² = 0.8058
tratamento_china - Ridge: MAE = 0.0028, RMSE = 0.0048, R² = 0.8226
tratamento_china - Lasso: MAE = 0.0037, RMSE = 0.0063, R² = 0.7196
tratamento_china - RandomForest: MAE = 0

# 3 - Software Effort Estimation using Ensemble Learning.

In [7]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils.validation import check_X_y
from joblib import Parallel, delayed
import numpy as np

# Função para calcular RAE
def calculate_rae(y_true, y_pred):
    return np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(y_true - np.mean(y_true)))

# Função para calcular RRSE
def calculate_rrse(y_true, y_pred):
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))

# Função para carregar datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Diretório '{directory}' não encontrado.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        try:
            if ext == '.txt':
                with open(filepath, 'r') as f:
                    first_line = f.readline()
                    delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
                datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
                print(f"{filename}: Delimitador detectado - '{delimiter}'")
            else:
                print(f"Formato não suportado: {file}")
        except Exception as e:
            print(f"Erro ao carregar {file}: {e}")

    print(f"{len(datasets)} arquivos carregados com sucesso de {directory}.")
    return datasets

# Função para processar um único modelo
def process_model(X_train, X_test, y_train, y_test, model_name, model, param_grid, repetitions):
    metrics = {'mae': [], 'rmse': [], 'rae': [], 'rrse': []}

    for _ in range(repetitions):
        grid = GridSearchCV(model, param_grid[model_name], cv=5, scoring='neg_mean_absolute_error', error_score='raise')

        try:
            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            # Previsões no conjunto de teste
            y_pred = best_model.predict(X_test)

            # Cálculo das métricas
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rae = calculate_rae(y_test, y_pred)
            rrse = calculate_rrse(y_test, y_pred)

            metrics['mae'].append(mae)
            metrics['rmse'].append(rmse)
            metrics['rae'].append(rae)
            metrics['rrse'].append(rrse)

        except Exception as e:
            print(f"{model_name}: Falha no treinamento - {e}")
            continue

    # Calcular médias das métricas
    return {
        'mae': np.mean(metrics['mae']),
        'rmse': np.mean(metrics['rmse']),
        'rae': np.mean(metrics['rae']),
        'rrse': np.mean(metrics['rrse']),
        'best_params': grid.best_params_ if 'grid' in locals() else {}
    }

# Função para treinar e testar modelos de regressão
def test_regression_models(data, directory, output_directory, repetitions=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
        'RandomForest': RandomForestRegressor(),
        'GradientBoosting': GradientBoostingRegressor()
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        'Lasso': {'alpha': [0.1, 0.5, 1]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
        'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
    }

    results = {}

    for name, df in data.items():
        df = df.dropna()

        if df.shape[0] < 2 or df.shape[1] < 2:
            print(f"{name}: Dataset insuficiente para treino. Tamanho: {df.shape}")
            continue

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        try:
            X, y = check_X_y(X, y)
        except ValueError as e:
            print(f"{name}: Erro de validação dos dados - {e}")
            continue

        # Divisão dos dados em treinamento (80%) e teste (20%)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Processar cada modelo em paralelo
        model_results = Parallel(n_jobs=-1)(
            delayed(process_model)(X_train, X_test, y_train, y_test, model_name, model, param_grid, repetitions)
            for model_name, model in models.items()
        )

        # Armazenar resultados
        for model_name, metrics in zip(models.keys(), model_results):
            results[f'{name}_{model_name}'] = metrics
            print(f"{name} - {model_name}: MAE = {metrics['mae']:.4f}, RMSE = {metrics['rmse']:.4f}, "
                  f"RAE = {metrics['rae']:.4f}, RRSE = {metrics['rrse']:.4f}")

    results_df = pd.DataFrame.from_dict(results, orient='index')
    output_path = os.path.join(output_directory, 'artigo3-abordagem2.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Resultados salvos em '{output_path}'.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")

# Processamento de múltiplos diretórios
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'
directories = [
    r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\6-MIMIC-50',
]

for directory in directories:
    data = load_datasets(directory)
    if data:
        test_regression_models(data, directory, output_directory, repetitions=30)


tratamento_china: Delimitador detectado - ','
tratamento_cocomo81: Delimitador detectado - ','
tratamento_desharnais: Delimitador detectado - ','
tratamento_maxwell: Delimitador detectado - ','
4 arquivos carregados com sucesso de C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\6-MIMIC-50.
tratamento_china - LinearRegression: MAE = 0.0007, RMSE = 0.0010, RAE = 0.0633, RRSE = 0.0525
tratamento_china - Ridge: MAE = 0.0029, RMSE = 0.0055, RAE = 0.2760, RRSE = 0.2819
tratamento_china - Lasso: MAE = 0.0106, RMSE = 0.0197, RAE = 1.0011, RRSE = 1.0000
tratamento_china - RandomForest: MAE = 0.0014, RMSE = 0.0047, RAE = 0.1351, RRSE = 0.2409
tratamento_china - GradientBoosting: MAE = 0.0012, RMSE = 0.0037, RAE = 0.1129, RRSE = 0.1885
tratamento_cocomo81 - LinearRegression: MAE = 0.0090, RMSE = 0.0117, RAE = 0.3090, RRSE = 0.2869
tratamento_cocomo81 - Ridge: MAE = 0.0086, RMSE = 0.0113, RAE = 0.2961, RRS

# 4 - artigo

# 5 - artigo

# 6 - artigo