In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import arff
import matplotlib.pyplot as plt
import numpy as np
import sys

# ARFF datasets to process (the desharnais entry is commented out).
file_paths = [
    '../datasets_2/albrecht.arff',
    '../datasets_2/kemerer.arff',
    '../datasets_2/cocomo81.arff',
    #'../datasets_2/desharnais.arff',
    '../datasets_2/china.arff',
]

# Load every ARFF file and print a short summary of what it contains.
for file_path in file_paths:
    print(f"\n\n\nAnalizando o arquivo: {file_path}")
    with open(file_path, 'r') as f:
        data = arff.load(f)

    # Each attribute entry is indexable; its first element is the name.
    attribute_names = [spec[0] for spec in data['attributes']]
    print("Atributos:", attribute_names)

    # Instance count plus the first and last rows as a quick sanity check.
    instances = data['data']
    print("Número de instâncias:", len(instances))
    print("Primeira instância:", instances[0])
    print("Última instância:", instances[-1])
    print()





Analizando o arquivo: ../datasets_2/albrecht.arff
Atributos: ['Input', 'Output', 'Inquiry', 'File', 'FPAdj', 'RawFPcounts', 'AdjFP', 'Effort']
Número de instâncias: 24
Primeira instância: [25.0, 150.0, 75.0, 60.0, 1.0, 1750.0, 1750.0, 102.4]
Última instância: [12.0, 15.0, 0.0, 15.0, 0.95, 273.68, 260.0, 6.1]




Analizando o arquivo: ../datasets_2/kemerer.arff
Atributos: ['ID', 'Language', 'Hardware', 'Duration', 'KSLOC', 'AdjFP', 'RAWFP', 'EffortMM']
Número de instâncias: 15
Primeira instância: [1.0, 1.0, 1.0, 17.0, 253.6, 1217.1, 1010.0, 287.0]
Última instância: [15.0, 3.0, 1.0, 14.0, 60.2, 1044.3, 976.0, 69.9]




Analizando o arquivo: ../datasets_2/cocomo81.arff
Atributos: ['rely', 'data', 'cplx', 'time', 'stor', 'virt', 'turn', 'acap', 'aexp', 'pcap', 'vexp', 'lexp', 'modp', 'tool', 'sced', 'loc', 'actual']
Número de instâncias: 63
Primeira instância: [0.88, 1.16, 0.7, 1.0, 1.06, 1.15, 1.07, 1.19, 1.13, 1.17, 1.1, 1.0, 1.24, 1.1, 1.04, 113.0, 2040.0]
Última instância: [1.0, 0.9

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate

def calculate_metrics(true_values, predicted_values):
    """Compute four regression error metrics for a set of predictions.

    Parameters
    ----------
    true_values : array-like
        Observed target values.
    predicted_values : array-like
        Predictions aligned with ``true_values``.

    Returns
    -------
    tuple
        ``(mae, medae, rmse, r2)``: mean absolute error, median absolute
        error, root mean squared error and R2 score, in that order.
    """
    scorers = (mean_absolute_error, median_absolute_error, mean_squared_error, r2_score)
    mae, medae, mse, r2 = (scorer(true_values, predicted_values) for scorer in scorers)

    # RMSE is derived from the mean squared error.
    return mae, medae, np.sqrt(mse), r2

def load_arff(file_path):
    """Load an ARFF file and split its instances into features and target.

    The last attribute of every instance is used as the target; all the
    preceding attributes are returned as features.

    Parameters
    ----------
    file_path : str
        Path of the ARFF file to read.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the last (dtype as
        inferred by NumPy) and ``y`` is the last column cast to ``float``.
    """
    with open(file_path, 'r') as f:
        dataset = arff.load(f)

    # Convert the instance list once and slice it, rather than building the
    # same array twice.
    rows = np.array(dataset['data'])
    return rows[:, :-1], rows[:, -1].astype(float)

from contextlib import redirect_stderr, redirect_stdout

output_file = "../results/mlp/sem-pre/output_mlp.txt"

hidden_layer_sizes = (100, 100, 100)  # Sizes of the hidden layers
activation = 'relu'  # Activation function
solver = 'adam'  # Optimization algorithm
max_iter_values = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]  # Maximum number of iterations
learning_rate_init_values = [0.002, 0.003, 0.004, 0.005, 0.006]  # Initial learning rate

metrics_data = []

num_runs = 30

# stdout and stderr share ONE file handle: opening the same path twice ("w"
# and "a") gives two independent buffers/offsets that can overwrite each
# other. The context managers also restore the original streams when the
# block ends, even on error, so later cells can still print.
with open(output_file, "w") as results_stream:
    with redirect_stdout(results_stream), redirect_stderr(results_stream):
        # Parse every ARFF file once instead of once per run.
        loaded_datasets = {path: load_arff(path) for path in file_paths}

        for run in range(num_runs):
            for file_path in file_paths:
                X, y = loaded_datasets[file_path]
                # Seeding with the run index keeps the runs different from
                # each other while making the whole experiment repeatable.
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2, random_state=run)
                for max_iter in max_iter_values:
                    for learning_rate_init in learning_rate_init_values:
                        mlp = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, activation=activation,
                                           solver=solver, max_iter=max_iter,
                                           learning_rate_init=learning_rate_init, random_state=run)
                        mlp.fit(X_train, y_train)
                        y_pred = mlp.predict(X_test)
                        mae, medae, rmse, r2 = calculate_metrics(y_test, y_pred)
                        # One table row per metric, tagged with the grid point.
                        metrics_data.append([file_path, "MAE", mae, max_iter, learning_rate_init])
                        metrics_data.append([file_path, "Median Absolute Error", medae, max_iter, learning_rate_init])
                        metrics_data.append([file_path, "RMSE", rmse, max_iter, learning_rate_init])
                        metrics_data.append([file_path, "R2 Score", r2, max_iter, learning_rate_init])

        # Columns: dataset path, metric name, metric value, max_iter (t),
        # learning_rate_init (l).
        headers = ["Dataset", "Metric", "metric valor", "t", "l"]
        print(tabulate(metrics_data, headers=headers))
        print('\n')

In [None]:
import re
import numpy as np
import sys

# Redirect stdout and stderr to the analysis report file.
output_file = "../results/mlp/sem-pre/output_mlp_analises_t_l_best.txt"
# Both streams share ONE handle: opening the same path twice ("w" and "a")
# creates two independent buffers/offsets that can overwrite each other.
_report_stream = open(output_file, "w")
sys.stdout = _report_stream
sys.stderr = _report_stream

# Number of metric values stored by extract_metrics_values.
cont = 0

# Parsed metric values, nested as datasets[dataset][t][l][metric] -> list of values.
datasets = {
    "albrecht": {},
    "kemerer": {},
    "cocomo81": {},
    "china": {}
}

def extract_metrics_values(filename):
    """Parse a tabulated results file into the global ``datasets`` mapping.

    A data row is expected to hold, separated by whitespace: the dataset
    path, the metric name (which may contain single spaces), the metric
    value, ``t`` (an integer) and ``l`` (a number). Blank lines, lines that
    start with ``-`` and lines without this shape are skipped.

    Every parsed value is appended to
    ``datasets[dataset_key][t][l][metric]`` and the global counter ``cont``
    is incremented once per stored value. Rows whose dataset path matches no
    key of ``datasets`` are reported on stderr and skipped.

    Parameters
    ----------
    filename : str
        Path of the text file to parse.
    """
    global cont

    # Signed decimal with optional exponent, so negative scores (e.g. R2) and
    # values printed in scientific notation are no longer dropped.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    # ``l`` is captured as a full number: a bare ``\d+`` stops at the integer
    # part of a fractional value such as 0.002 and merges every ``l`` into 0.
    # The end-of-line anchor rejects partial matches.
    row_pattern = re.compile(
        rf'\s*(\S+)\s+(.*?)\s+({number})\s+(\d+)\s+({number})\s*$'
    )

    with open(filename, 'r') as file:
        for line in file:
            if not line.strip() or line.startswith('-'):
                continue

            match = row_pattern.match(line)
            if not match:
                continue

            dataset, metric, value, t, l = match.groups()
            value = float(value)
            t = int(t)
            # Keep integer-looking values as int; parse anything else as float.
            l = int(l) if l.isdigit() else float(l)

            # A row belongs to the first known dataset whose "<name>.arff"
            # appears in the path column.
            dataset_key = next(
                (key for key in datasets if f"{key}.arff" in dataset), None
            )
            if dataset_key is None:
                print(f"Dataset not recognized: {dataset}", file=sys.stderr)
                continue

            # Create the nested t -> l -> metric levels on first use.
            bucket = datasets[dataset_key].setdefault(t, {}).setdefault(l, {})
            bucket.setdefault(metric, []).append(value)
            cont += 1

def calculate_statistics(metrics):
    """Summarise each metric's list of values.

    Parameters
    ----------
    metrics : dict
        Maps a metric name to a sequence of numeric values.

    Returns
    -------
    dict
        Maps each metric name to a dict with the keys ``'Média'`` (mean),
        ``'Mínimo'`` (minimum), ``'Máximo'`` (maximum) and
        ``'Desvio Padrão'`` (standard deviation as computed by ``np.std``).
    """
    def summarise(samples):
        # The keys are part of the report output and must stay as they are.
        return {
            'Média': np.mean(samples),
            'Mínimo': np.min(samples),
            'Máximo': np.max(samples),
            'Desvio Padrão': np.std(samples),
        }

    return {name: summarise(samples) for name, samples in metrics.items()}

# Parse the raw MLP results file into the module-level `datasets` mapping
# (extract_metrics_values also updates the `cont` counter).
filename = '../results/mlp/sem-pre/output_mlp.txt'
extract_metrics_values(filename)

def find_best_t_l_for_metric(dataset_name, d_values, metric_name):
    """Pick the ``(t, l)`` combination with the best mean for one metric.

    'MAE', 'Median Absolute Error' and 'RMSE' are minimised; 'R2 Score' is
    maximised. Any other metric name never selects a winner. Ties keep the
    first combination encountered because the comparisons are strict.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset; not used by the search itself.
    d_values : dict
        Nested mapping ``t -> l -> metric name -> list of values``.
    metric_name : str
        Metric to optimise.

    Returns
    -------
    tuple or None
        ``(t, l, statistics)`` for the winning combination, where
        ``statistics`` is the result of ``calculate_statistics`` for that
        combination, or ``None`` when no combination qualifies.
    """
    minimise = metric_name in ['MAE', 'Median Absolute Error', 'RMSE']
    maximise = metric_name == 'R2 Score'

    # Start from the worst possible score for the chosen direction.
    best_score = float('inf') if minimise else float('-inf')
    winner = None

    for t, per_l in d_values.items():
        for l, metric_lists in per_l.items():
            if metric_name not in metric_lists:
                continue

            stats = calculate_statistics(metric_lists)
            score = stats[metric_name]['Média']

            improved = (minimise and score < best_score) or \
                       (maximise and score > best_score)
            if improved:
                best_score = score
                winner = (t, l, stats)

    return winner

# Metrics reported for every dataset.
metrics_of_interest = ['MAE', 'Median Absolute Error', 'RMSE', 'R2 Score']

# For each dataset and metric, report the winning (t, l) combination together
# with the summary statistics of that metric.
for dataset_name, d_values in datasets.items():
    print(f"Melhores resultados para o dataset: {dataset_name}")
    for metric_name in metrics_of_interest:
        best_t_l = find_best_t_l_for_metric(dataset_name, d_values, metric_name)
        if not best_t_l:
            continue

        t, l, statistics = best_t_l
        print(f"  Melhor para a métrica {metric_name}: t = {t}, l = {l}")
        summary = statistics[metric_name]
        for label in ('Média', 'Mínimo', 'Máximo', 'Desvio Padrão'):
            print(f"    {label}: {summary[label]:.4f}")
        print()  # Blank line between metrics for readability

# Debug aid (disabled): print("Número de iterações das métricas: " + str(cont))

# Close the redirected streams.
# NOTE(review): nothing here reassigns sys.stdout/sys.stderr afterwards, so
# they stay closed once this block has run.
for stream in (sys.stdout, sys.stderr):
    stream.close()
