# 1 - Software Efort Estimation Using Stacked Ensemble Technique and Hybrid Principal Component Regression and Multivariate Adaptive Regression Splines

Divisão: Cross_validation e kfold.

In [18]:
import os
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils.validation import check_X_y
import numpy as np

# Function to load datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Directory '{directory}' not found.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        try:
            if ext == '.txt':
                with open(filepath, 'r') as f:
                    first_line = f.readline()
                    delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
                datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
                print(f"{filename}: Detected delimiter - '{delimiter}'")
            else:
                print(f"Unsupported format: {file}")
        except Exception as e:
            print(f"Error loading {file}: {e}")

    print(f"{len(datasets)} files successfully loaded from {directory}.")
    return datasets

# Function to process a single model
def process_model(X, y, model_name, model, param_grid, kf):
    metrics = {
        'rmse': [],
        'mse': [],
        'mae': [],
        'r2': [],
    }

    grid = GridSearchCV(model, param_grid[model_name], cv=kf, scoring='neg_mean_squared_error', error_score='raise')

    try:
        grid.fit(X, y)
        best_model = grid.best_estimator_

        # Predictions with cross-validation
        y_pred = cross_val_predict(best_model, X, y, cv=kf)

        # Calculate metrics
        metrics['rmse'].append(np.sqrt(mean_squared_error(y, y_pred)))
        metrics['mse'].append(mean_squared_error(y, y_pred))
        metrics['mae'].append(mean_absolute_error(y, y_pred))
        metrics['r2'].append(r2_score(y, y_pred))

        return {
            'rmse': np.mean(metrics['rmse']),
            'mse': np.mean(metrics['mse']),
            'mae': np.mean(metrics['mae']),
            'r2': np.mean(metrics['r2']),
            'best_params': grid.best_params_
        }

    except Exception as e:
        print(f"{model_name}: Training failure - {e}")
        return None

# Function to train and test regression models
def test_regression_models(data, output_directory, n_splits=5, num_runs=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'RandomForest': RandomForestRegressor(),
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
    }

    aggregated_results = {}

    for name, df in data.items():
        df = df.dropna()

        if df.shape[0] < 2 or df.shape[1] < 2:
            print(f"{name}: Dataset too small for training. Size: {df.shape}")
            continue

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        try:
            X, y = check_X_y(X, y)
        except ValueError as e:
            print(f"{name}: Data validation error - {e}")
            continue

        aggregated_results[name] = {model_name: {
            'rmse': [], 'mse': [], 'mae': [], 'r2': []
        } for model_name in models.keys()}

        for run in range(num_runs):
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

            for model_name, model in models.items():
                metrics = process_model(X, y, model_name, model, param_grid, kf)
                if metrics:
                    for metric, value in metrics.items():
                        if metric != 'best_params':
                            aggregated_results[name][model_name][metric].append(value)

        # Calculate averages
        for model_name in models.keys():
            for metric in ['rmse', 'mse', 'mae', 'r2']:
                aggregated_results[name][model_name][metric] = np.mean(aggregated_results[name][model_name][metric])

    # Save results to Excel
    results_df = pd.DataFrame.from_dict({
        f'{dataset}_{model}': metrics
        for dataset, models in aggregated_results.items()
        for model, metrics in models.items()
    }, orient='index')

    output_path = os.path.join(output_directory, 'artigo1-abordagem2-teste-30runs.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Results saved at '{output_path}'.")
    except Exception as e:
        print(f"Error saving results: {e}")

# Process multiple directories
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'

directories = [
    r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50',
]

for directory in directories:
    data = load_datasets(directory)
    if data:
        test_regression_models(data, output_directory)


tratamento_china: Detected delimiter - ','
tratamento_cocomo81: Detected delimiter - ','
tratamento_desharnais: Detected delimiter - ','
tratamento_maxwell: Detected delimiter - ','
4 files successfully loaded from C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50.


KeyboardInterrupt: 

# 2 - Estimating Software Development Efforts Using a Random Forest-Based Stacked Ensemble Approach

Divisão: cross_validation e kfold. 

In [13]:
import os
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils.validation import check_X_y
from joblib import Parallel, delayed
import numpy as np

# Função para carregar datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Diretório '{directory}' não encontrado.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        try:
            if ext == '.txt':
                with open(filepath, 'r') as f:
                    first_line = f.readline()
                    delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
                datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
                print(f"{filename}: Delimitador detectado - '{delimiter}'")
            else:
                print(f"Formato não suportado: {file}")
        except Exception as e:
            print(f"Erro ao carregar {file}: {e}")

    print(f"{len(datasets)} arquivos carregados com sucesso de {directory}.")
    return datasets

# Função para processar um único modelo
def process_model(X, y, model_name, model, param_grid, kf):
    metrics = {
        'mae': [],
        'rmse': [],
        'r2': []
    }

    grid = GridSearchCV(model, param_grid[model_name], cv=kf, scoring='neg_mean_squared_error', error_score='raise')

    try:
        grid.fit(X, y)
        best_model = grid.best_estimator_

        # Previsões com validação cruzada
        y_pred = cross_val_predict(best_model, X, y, cv=kf)

        # Cálculo das métricas
        metrics['mae'].append(mean_absolute_error(y, y_pred))
        metrics['rmse'].append(np.sqrt(mean_squared_error(y, y_pred)))
        metrics['r2'].append(r2_score(y, y_pred))

        return {
            'mae': np.mean(metrics['mae']),
            'rmse': np.mean(metrics['rmse']),
            'r2': np.mean(metrics['r2']),
            'best_params': grid.best_params_
        }

    except Exception as e:
        print(f"{model_name}: Falha no treinamento - {e}")
        return None

# Função para treinar e testar modelos de regressão
def test_regression_models(data, output_directory, n_splits=5, n_runs=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        # 'Lasso': Lasso(),
        'RandomForest': RandomForestRegressor(),
        # 'GradientBoosting': GradientBoostingRegressor()
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        # 'Lasso': {'alpha': [0.1, 0.5, 1]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
        # 'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
    }

    results = {}

    for name, df in data.items():
        df = df.dropna()

        if df.shape[0] < 2 or df.shape[1] < 2:
            print(f"{name}: Dataset insuficiente para treino. Tamanho: {df.shape}")
            continue

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        try:
            X, y = check_X_y(X, y)
        except ValueError as e:
            print(f"{name}: Erro de validação dos dados - {e}")
            continue

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=None)

        # Processar modelos em paralelo
        model_results = Parallel(n_jobs=-1)(
            delayed(process_model)(X, y, model_name, model, param_grid, kf)
            for model_name, model in models.items()
        )

        for model_name, metrics in zip(models.keys(), model_results):
            if metrics:
                results[f'{name}_{model_name}'] = metrics
                print(f"{name} - {model_name}: MAE = {metrics['mae']:.4f}, RMSE = {metrics['rmse']:.4f}, R² = {metrics['r2']:.4f}")

    results_df = pd.DataFrame.from_dict(results, orient='index')
    output_path = os.path.join(output_directory, 'artigo2-abordagem2-teste.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Resultados salvos em '{output_path}'.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")

# Processamento de múltiplos diretórios
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'  # Especifique a pasta de saída

directories = [
    r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50',
]

for directory in directories:
    data = load_datasets(directory)
    if data:
        test_regression_models(data, output_directory)


tratamento_china: Delimitador detectado - ','
tratamento_cocomo81: Delimitador detectado - ','
tratamento_desharnais: Delimitador detectado - ','
tratamento_maxwell: Delimitador detectado - ','
4 arquivos carregados com sucesso de C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50.
tratamento_china - LinearRegression: MAE = 0.0036, RMSE = 0.0050, R² = 0.9964
tratamento_china - Ridge: MAE = 0.0039, RMSE = 0.0053, R² = 0.9959
tratamento_china - RandomForest: MAE = 0.0042, RMSE = 0.0087, R² = 0.9891
tratamento_cocomo81 - LinearRegression: MAE = 0.0190, RMSE = 0.0249, R² = 0.9554
tratamento_cocomo81 - Ridge: MAE = 0.0191, RMSE = 0.0253, R² = 0.9538
tratamento_cocomo81 - RandomForest: MAE = 0.0183, RMSE = 0.0266, R² = 0.9491
tratamento_desharnais - LinearRegression: MAE = 0.0334, RMSE = 0.0431, R² = 0.9049
tratamento_desharnais - Ridge: MAE = 0.0332, RMSE = 0.0431, R² = 0.9051
tratamento_desha

# 3 - Software Effort Estimation using Ensemble Learning.

Divisão: 80 treino e 20 teste.

In [10]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils.validation import check_X_y
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

# Função para calcular RAE
def calculate_rae(y_true, y_pred):
    return np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(y_true - np.mean(y_true)))

# Função para calcular RRSE
def calculate_rrse(y_true, y_pred):
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))

# Função para carregar datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Diretório '{directory}' não encontrado.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        try:
            if ext == '.txt':
                with open(filepath, 'r') as f:
                    first_line = f.readline()
                    delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
                datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
                print(f"{filename}: Delimitador detectado - '{delimiter}'")
            else:
                print(f"Formato não suportado: {file}")
        except Exception as e:
            print(f"Erro ao carregar {file}: {e}")

    print(f"{len(datasets)} arquivos carregados com sucesso de {directory}.")
    return datasets

# Função para processar um único modelo
def process_model(X_train, X_test, y_train, y_test, model_name, model, param_grid, repetitions):
    metrics = {'mae': [], 'rmse': [], 'rae': [], 'rrse': []}

    for _ in range(repetitions):
        grid = GridSearchCV(model, param_grid[model_name], cv=5, scoring='neg_mean_absolute_error', error_score='raise')

        try:
            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            # Previsões no conjunto de teste
            y_pred = best_model.predict(X_test)

            # Cálculo das métricas
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rae = calculate_rae(y_test, y_pred)
            rrse = calculate_rrse(y_test, y_pred)

            metrics['mae'].append(mae)
            metrics['rmse'].append(rmse)
            metrics['rae'].append(rae)
            metrics['rrse'].append(rrse)

        except Exception as e:
            print(f"{model_name}: Falha no treinamento - {e}")
            continue

    # Calcular médias das métricas
    return {
        'mae': np.mean(metrics['mae']),
        'rmse': np.mean(metrics['rmse']),
        'rae': np.mean(metrics['rae']),
        'rrse': np.mean(metrics['rrse']),
        'best_params': grid.best_params_ if 'grid' in locals() else {}
    }

# Função para treinar e testar modelos de regressão
def test_regression_models(data, directory, output_directory, repetitions=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        #'Lasso': Lasso(),
        'RandomForest': RandomForestRegressor(),
        #'GradientBoosting': GradientBoostingRegressor()
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        #'Lasso': {'alpha': [0.1, 0.5, 1]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
        #'GradientBoosting': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2]}
    }

    results = {}

    for name, df in tqdm(data.items(), desc="Processando datasets"):
        df = df.dropna()

        if df.shape[0] < 2 or df.shape[1] < 2:
            print(f"{name}: Dataset insuficiente para treino. Tamanho: {df.shape}")
            continue

        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        try:
            X, y = check_X_y(X, y)
        except ValueError as e:
            print(f"{name}: Erro de validação dos dados - {e}")
            continue

        # Divisão dos dados em treinamento (80%) e teste (20%)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Processar cada modelo em paralelo
        model_results = Parallel(n_jobs=-1)(
            delayed(process_model)(X_train, X_test, y_train, y_test, model_name, model, param_grid, repetitions)
            for model_name, model in models.items()
        )

        # Armazenar resultados
        for model_name, metrics in zip(models.keys(), model_results):
            results[f'{name}_{model_name}'] = metrics
            print(f"{name} - {model_name}: MAE = {metrics['mae']:.4f}, RMSE = {metrics['rmse']:.4f}, "
                  f"RAE = {metrics['rae']:.4f}, RRSE = {metrics['rrse']:.4f}")

    results_df = pd.DataFrame.from_dict(results, orient='index')
    output_path = os.path.join(output_directory, 'artigo3-abordagem2-teste.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Resultados salvos em '{output_path}'.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")

# Processamento de múltiplos diretórios
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'
directories = [
    r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50',
]

for directory in tqdm(directories, desc="Processando diretórios"):
    data = load_datasets(directory)
    if data:
        test_regression_models(data, directory, output_directory, repetitions=30)


Processando diretórios:   0%|          | 0/1 [00:00<?, ?it/s]

tratamento_china: Delimitador detectado - ','
tratamento_cocomo81: Delimitador detectado - ','
tratamento_desharnais: Delimitador detectado - ','
tratamento_maxwell: Delimitador detectado - ','
4 arquivos carregados com sucesso de C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50.




tratamento_china - LinearRegression: MAE = 0.0035, RMSE = 0.0048, RAE = 0.0655, RRSE = 0.0619
tratamento_china - Ridge: MAE = 0.0038, RMSE = 0.0051, RAE = 0.0699, RRSE = 0.0661
tratamento_china - Lasso: MAE = 0.0541, RMSE = 0.0774, RAE = 1.0046, RRSE = 1.0001
tratamento_china - RandomForest: MAE = 0.0041, RMSE = 0.0062, RAE = 0.0769, RRSE = 0.0801
tratamento_china - GradientBoosting: MAE = 0.0037, RMSE = 0.0068, RAE = 0.0682, RRSE = 0.0879




tratamento_cocomo81 - LinearRegression: MAE = 0.0177, RMSE = 0.0234, RAE = 0.2118, RRSE = 0.2107
tratamento_cocomo81 - Ridge: MAE = 0.0178, RMSE = 0.0239, RAE = 0.2121, RRSE = 0.2151
tratamento_cocomo81 - Lasso: MAE = 0.0849, RMSE = 0.1112, RAE = 1.0141, RRSE = 1.0007
tratamento_cocomo81 - RandomForest: MAE = 0.0178, RMSE = 0.0254, RAE = 0.2130, RRSE = 0.2281
tratamento_cocomo81 - GradientBoosting: MAE = 0.0168, RMSE = 0.0237, RAE = 0.2001, RRSE = 0.2128




tratamento_desharnais - LinearRegression: MAE = 0.0339, RMSE = 0.0430, RAE = 0.3360, RRSE = 0.3170
tratamento_desharnais - Ridge: MAE = 0.0338, RMSE = 0.0430, RAE = 0.3351, RRSE = 0.3167
tratamento_desharnais - Lasso: MAE = 0.1009, RMSE = 0.1358, RAE = 1.0004, RRSE = 1.0000
tratamento_desharnais - RandomForest: MAE = 0.0212, RMSE = 0.0274, RAE = 0.2101, RRSE = 0.2016
tratamento_desharnais - GradientBoosting: MAE = 0.0211, RMSE = 0.0270, RAE = 0.2096, RRSE = 0.1991


Processando datasets: 100%|██████████| 4/4 [1:16:34<00:00, 1148.65s/it]
Processando diretórios: 100%|██████████| 1/1 [1:16:34<00:00, 4594.79s/it]

tratamento_maxwell - LinearRegression: MAE = 0.0087, RMSE = 0.0115, RAE = 0.0733, RRSE = 0.0764
tratamento_maxwell - Ridge: MAE = 0.0099, RMSE = 0.0132, RAE = 0.0835, RRSE = 0.0879
tratamento_maxwell - Lasso: MAE = 0.1164, RMSE = 0.1505, RAE = 0.9859, RRSE = 1.0013
tratamento_maxwell - RandomForest: MAE = 0.0139, RMSE = 0.0191, RAE = 0.1178, RRSE = 0.1271
tratamento_maxwell - GradientBoosting: MAE = 0.0127, RMSE = 0.0179, RAE = 0.1074, RRSE = 0.1194
Resultados salvos em 'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados\artigo3-abordagem2-teste.xlsx'.





In [9]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils.validation import check_X_y
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

# Funções de métricas

def calculate_rae(y_true, y_pred):
    return np.sum(np.abs(y_pred - y_true)) / np.sum(np.abs(y_true - np.mean(y_true)))

def calculate_rrse(y_true, y_pred):
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))

# Função para carregar datasets
def load_datasets(directory):
    datasets = {}
    if not os.path.exists(directory):
        print(f"Diretório '{directory}' não encontrado.")
        return datasets

    for file in os.listdir(directory):
        filepath = os.path.join(directory, file)
        filename, ext = os.path.splitext(file)

        if ext != '.txt':
            print(f"Ignorando arquivo não .txt: {file}")
            continue

        try:
            with open(filepath, 'r') as f:
                first_line = f.readline()
                delimiter = ',' if ',' in first_line else ('\t' if '\t' in first_line else ' ')
            datasets[filename] = pd.read_csv(filepath, delimiter=delimiter)
            print(f"{filename}: Delimitador detectado - '{delimiter}'")
        except Exception as e:
            print(f"Erro ao carregar {file}: {e}")

    print(f"{len(datasets)} arquivos carregados com sucesso de {directory}.")
    return datasets

# Função para processar um único modelo
def process_model(X_train, X_test, y_train, y_test, model_name, model, param_grid, repetitions):
    metrics = {'mae': [], 'rmse': [], 'rae': [], 'rrse': []}

    for _ in range(repetitions):
        grid = GridSearchCV(model, param_grid[model_name], cv=5, scoring='neg_mean_absolute_error', error_score='raise')

        try:
            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            y_pred = best_model.predict(X_test)

            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rae = calculate_rae(y_test, y_pred)
            rrse = calculate_rrse(y_test, y_pred)

            metrics['mae'].append(mae)
            metrics['rmse'].append(rmse)
            metrics['rae'].append(rae)
            metrics['rrse'].append(rrse)

        except Exception as e:
            print(f"{model_name}: Falha no treinamento - {e}")
            continue

    return {
        'mae': np.mean(metrics['mae']),
        'rmse': np.mean(metrics['rmse']),
        'rae': np.mean(metrics['rae']),
        'rrse': np.mean(metrics['rrse']),
        'best_params': grid.best_params_ if 'grid' in locals() else {}
    }

# Função para treinar e testar modelos
def test_regression_models(data_original, data_synthetic, output_directory, repetitions=30):
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'RandomForest': RandomForestRegressor(),
    }

    param_grid = {
        'LinearRegression': {},
        'Ridge': {'alpha': [0.1, 1, 10]},
        'RandomForest': {'n_estimators': [50, 100, 200]},
    }

    results = {}

    for name, df_original in tqdm(data_original.items(), desc="Processando datasets originais"):
        if name not in data_synthetic:
            print(f"{name}: Dados sintéticos correspondentes não encontrados.")
            continue

        df_synthetic = data_synthetic[name]

        df_original = df_original.dropna()
        df_synthetic = df_synthetic.dropna()

        if df_original.shape[0] < 2 or df_original.shape[1] < 2:
            print(f"{name}: Dataset original insuficiente. Tamanho: {df_original.shape}")
            continue

        if df_synthetic.shape[0] < 2 or df_synthetic.shape[1] < 2:
            print(f"{name}: Dataset sintético insuficiente. Tamanho: {df_synthetic.shape}")
            continue

        # Dados originais
        X_original = df_original.iloc[:, :-1]
        y_original = df_original.iloc[:, -1]

        # Dados combinados
        combined_df = pd.concat([df_original, df_synthetic], ignore_index=True)
        X_combined = combined_df.iloc[:, :-1]
        y_combined = combined_df.iloc[:, -1]

        # Validação dos dados
        try:
            X_original, y_original = check_X_y(X_original, y_original)
            X_combined, y_combined = check_X_y(X_combined, y_combined)
        except ValueError as e:
            print(f"{name}: Erro de validação dos dados - {e}")
            continue

        # Divisão de treino e teste
        X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_original, y_original, test_size=0.2, random_state=42)
        X_train_comb, X_test_comb, y_train_comb, y_test_comb = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

        # Treino e teste para cada configuração
        model_results_original = Parallel(n_jobs=-1)(
            delayed(process_model)(X_train_orig, X_test_orig, y_train_orig, y_test_orig, model_name, model, param_grid, repetitions)
            for model_name, model in models.items()
        )

        model_results_combined = Parallel(n_jobs=-1)(
            delayed(process_model)(X_train_comb, X_test_comb, y_train_comb, y_test_comb, model_name, model, param_grid, repetitions)
            for model_name, model in models.items()
        )

        for model_name, metrics_original, metrics_combined in zip(models.keys(), model_results_original, model_results_combined):
            results[f'{name}_{model_name}_original'] = metrics_original
            results[f'{name}_{model_name}_combined'] = metrics_combined

            print(f"{name} - {model_name} (Original): MAE = {metrics_original['mae']:.4f}, RMSE = {metrics_original['rmse']:.4f}")
            print(f"{name} - {model_name} (Combined): MAE = {metrics_combined['mae']:.4f}, RMSE = {metrics_combined['rmse']:.4f}")

    results_df = pd.DataFrame.from_dict(results, orient='index')
    output_path = os.path.join(output_directory, 'abordagem3-processamento_alisson.xlsx')

    try:
        os.makedirs(output_directory, exist_ok=True)
        results_df.to_excel(output_path)
        print(f"Resultados salvos em '{output_path}'.")
    except Exception as e:
        print(f"Erro ao salvar os resultados: {e}")

# Configuração dos diretórios
output_directory = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados'
data_original_dir = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\2-geracao-variaveis\MIMIC-50'
data_synthetic_dir = r'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\3-simulacao\MIMIC-50'

# Carregamento dos dados
data_original = load_datasets(data_original_dir)
data_synthetic = load_datasets(data_synthetic_dir)

if data_original and data_synthetic:
    test_regression_models(data_original, data_synthetic, output_directory, repetitions=30)


tratamento_china: Delimitador detectado - ','
Ignorando arquivo não .txt: tratamento_china_resultados_processamento_simples.xlsx
tratamento_cocomo81: Delimitador detectado - ','
Ignorando arquivo não .txt: tratamento_cocomo81_resultados_processamento_simples.xlsx
tratamento_desharnais: Delimitador detectado - ','
Ignorando arquivo não .txt: tratamento_desharnais_resultados_processamento_simples.xlsx
tratamento_maxwell: Delimitador detectado - ','
Ignorando arquivo não .txt: tratamento_maxwell_resultados_processamento_simples.xlsx
4 arquivos carregados com sucesso de C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\abordagem2\saida\4-escolha\selectKbest\50\2-geracao-variaveis\MIMIC-50.
tratamento_china: Delimitador detectado - ','
Ignorando arquivo não .txt: tratamento_china.txt_resultados_alisson.xlsx
Ignorando arquivo não .txt: tratamento_china_resultados_processamento_simples.xlsx
tratamento_cocomo81: Delimitador detectado - ','
Ignorando arquivo

Processando datasets originais: 100%|██████████| 4/4 [00:00<00:00, 280.61it/s]

tratamento_china: Erro de validação dos dados - Input X contains NaN.
tratamento_cocomo81: Erro de validação dos dados - Input X contains NaN.
tratamento_desharnais: Erro de validação dos dados - Input X contains NaN.
tratamento_maxwell: Erro de validação dos dados - Input X contains NaN.
Resultados salvos em 'C:\Users\CALEO\OneDrive - Hexagon\Documents\GitHub\Software_effort_estimation\proposal\processamento\resultados\abordagem3-processamento_alisson.xlsx'.





# 4 - Software effort estimation modeling and fully connected artificial neural network optimization using soft computing techniques

Divisão: 70 treino e 30 teste

# 5 - Recommendation of Machine Learning Techniques for Software Effort Estimation using Multi-Criteria Decision Making

Divisão: 