In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import arff
import matplotlib.pyplot as plt
import numpy as np
import sys

# ARFF datasets used throughout the notebook (desharnais is currently disabled)
file_paths = [
    '../datasets_2/albrecht.arff',
    '../datasets_2/kemerer.arff',
    '../datasets_2/cocomo81.arff',
    #'../datasets_2/desharnais.arff',
    '../datasets_2/china.arff',
]

# Quick sanity check: load every ARFF file and print a short preview of it
for file_path in file_paths:
    print(f"\n\n\nAnalizando o arquivo: {file_path}")
    with open(file_path, 'r') as arff_file:
        data = arff.load(arff_file)

    # Attribute names, number of instances, and the first and last rows
    attribute_names = [attribute[0] for attribute in data['attributes']]
    print("Atributos:", attribute_names)
    print("Número de instâncias:", len(data['data']))
    print("Primeira instância:", data['data'][0])
    print("Última instância:", data['data'][-1])
    print()


In [None]:
import numpy as np
import sys
import arff
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate

def calculate_metrics(true_values, predicted_values):
    """Compute the four regression error measures used in this notebook.

    Args:
        true_values: Observed target values.
        predicted_values: Model predictions for the same samples.

    Returns:
        A tuple ``(mae, medae, rmse, r2)``: mean absolute error, median
        absolute error, root of the mean squared error, and R2 score.
    """
    pair = (true_values, predicted_values)

    absolute_error = mean_absolute_error(*pair)
    median_error = median_absolute_error(*pair)
    # RMSE is obtained by taking the square root of the MSE
    root_squared_error = np.sqrt(mean_squared_error(*pair))
    determination = r2_score(*pair)

    return absolute_error, median_error, root_squared_error, determination

def load_arff(file_path):
    """Load an ARFF file and split it into features and target.

    The last column of every row is taken as the target; all preceding
    columns are returned as features.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds every column except the last
        and ``y`` holds the last column converted to float.
    """
    with open(file_path, 'r') as f:
        data = arff.load(f)

    # Convert the rows once and slice the same array for both outputs
    rows = np.array(data['data'])
    X = rows[:, :-1]
    y = rows[:, -1].astype(float)

    return X, y

# Results file. While training runs it receives whatever is written to
# stdout/stderr; after a successful run it is overwritten with the metrics table.
output_file = "../results/lstm/sem-pre/output_lstm_2.txt"

metrics_data = []

activation = 'tanh'  # Activation function
recurrent_activation = 'sigmoid'  # Recurrent activation function
dropout = 0.5  # Dropout probability
optimizer = 'adam'  # Optimisation algorithm

# Values used for hyper-parameter tuning
epochs_values = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]  # Number of epochs
num_neurons_values = [25, 50, 75, 100]  # Number of neurons in the hidden layer

num_runs = 30

# Remember the notebook's own streams so they can be put back afterwards;
# closing the replacements without restoring would break every later print.
original_stdout, original_stderr = sys.stdout, sys.stderr
# A single handle is shared by both streams so there are never two writers
# with independent positions on the same file.
log_stream = open(output_file, "w")
sys.stdout = sys.stderr = log_stream
try:
    for run in range(num_runs):
        for file_path in file_paths:
            X, y = load_arff(file_path)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
            # The LSTM input is (samples, features, 1); the reshaped arrays depend
            # only on the split, so they are built once per split.
            X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
            X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
            for epochs in epochs_values:
                for num_neurons in num_neurons_values:
                    # A fresh model is built for every (epochs, neurons) combination
                    model = Sequential()
                    model.add(LSTM(num_neurons, activation=activation, recurrent_activation=recurrent_activation, input_shape=(X_train.shape[1], 1)))
                    model.add(Dropout(dropout))
                    model.add(Dense(1))
                    model.compile(optimizer=optimizer, loss='mean_squared_error')
                    model.fit(X_train_reshaped, y_train, epochs=epochs, batch_size=32, verbose=0)
                    y_pred = model.predict(X_test_reshaped)
                    mae, medae, rmse, r2 = calculate_metrics(y_test, y_pred)
                    metrics_data.append([file_path, "MAE", mae, epochs, num_neurons])
                    metrics_data.append([file_path, "Median Absolute Error", medae, epochs, num_neurons])
                    metrics_data.append([file_path, "RMSE", rmse, epochs, num_neurons])
                    metrics_data.append([file_path, "R2 Score", r2, epochs, num_neurons])
finally:
    # Restore the streams and release the log handle even if training fails, so
    # the error is shown in the notebook and no open handle is left on the file.
    sys.stdout, sys.stderr = original_stdout, original_stderr
    log_stream.close()

# The log handle is closed at this point, so nothing buffered can be flushed
# over the table after it has been written.
table = tabulate(metrics_data, headers=["Dataset", "Metric", "metric valor", "e", "n"])
with open(output_file, "w") as f:
    f.write(table + '\n')


In [None]:
import re
import numpy as np
import sys

# Send everything this cell prints (stdout and stderr) to the report file.
# NOTE(review): the replacement streams are closed at the end of the cell but
# sys.stdout / sys.stderr are never set back to the notebook's originals.
output_file = "../results/lstm/sem-pre/output_lstm_analises_e_n_best.txt"
sys.stdout = open(output_file, "w")
sys.stderr = open(output_file, "a")

# Number of metric values stored so far
cont = 0

# Parsed values, nested as dataset -> e -> n -> metric name -> list of values
datasets = {name: {} for name in ("albrecht", "kemerer", "cocomo81", "china")}

def extract_metrics_values(filename):
    """Parse a metrics table file into the module-level ``datasets`` mapping.

    Each data row is expected to hold, separated by whitespace: the dataset
    path, the metric name, the metric value, ``e`` and ``n``. Parsed values are
    appended to ``datasets[dataset_key][e][n][metric]`` and the global counter
    ``cont`` is incremented once per stored value.

    Blank lines, lines starting with ``-`` and lines that do not look like a
    data row are skipped. Rows whose dataset path does not contain
    ``"<key>.arff"`` for any key of ``datasets`` are reported on stderr and
    skipped.

    Args:
        filename: Path of the text file containing the table.
    """
    global cont

    # The value column accepts an optional sign, decimals and an exponent, so
    # negative values (e.g. a negative R2) and values written like 1.5e+06 are
    # parsed instead of being silently dropped.
    row_pattern = re.compile(
        r'.*?([A-Za-z0-9_/.]+)\s+(.*?)\s+'
        r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
        r'\s+(\d+)\s+(\d+)'
    )

    with open(filename, 'r') as file:
        for line in file:
            if not line.strip() or line.startswith('-'):
                continue

            match = row_pattern.match(line)
            if not match:
                # Not a data row (for example the header line)
                continue

            dataset, metric, value, e, n = match.groups()
            value = float(value)
            e = int(e)
            n = int(n)

            # Map the dataset path to one of the known dataset keys
            dataset_key = next(
                (key for key in datasets if f"{key}.arff" in dataset),
                None,
            )
            if dataset_key is None:
                print(f"Dataset not recognized: {dataset}", file=sys.stderr)
                continue

            # Create the nested levels on demand and store the value
            per_metric = datasets[dataset_key].setdefault(e, {}).setdefault(n, {})
            per_metric.setdefault(metric, []).append(value)
            cont += 1

def calculate_statistics(metrics):
    """Summarise every metric's list of values.

    Args:
        metrics: Mapping of metric name to a sequence of numeric values.

    Returns:
        A dict keyed by metric name; each entry maps 'Média', 'Mínimo',
        'Máximo' and 'Desvio Padrão' to the mean, minimum, maximum and
        standard deviation of that metric's values.
    """
    # (label, reducer) pairs, in the order in which they appear in the result
    reducers = (
        ('Média', np.mean),
        ('Mínimo', np.min),
        ('Máximo', np.max),
        ('Desvio Padrão', np.std),
    )
    return {
        metric: {label: reduce_fn(values) for label, reduce_fn in reducers}
        for metric, values in metrics.items()
    }

# Metrics table to analyse.
# NOTE(review): the training cell in this notebook writes "output_lstm_2.txt";
# confirm that "output_lstm.txt" is the intended input here.
filename = '../results/lstm/sem-pre/output_lstm.txt'
# Fills the module-level `datasets` dictionary and the `cont` counter from the file
extract_metrics_values(filename)

# Function to find the best n and e for each metric
def find_best_e_n_for_metric(dataset_name, d_values, metric_name):
    """Pick the (e, n) combination with the best mean value of one metric.

    'MAE', 'Median Absolute Error' and 'RMSE' are minimised; 'R2 Score' is
    maximised. For any other metric name no combination is ever selected.

    Args:
        dataset_name: Name of the dataset (not used by the search itself).
        d_values: Nested mapping ``e -> n -> metric name -> list of values``.
        metric_name: Metric whose mean ('Média') is compared.

    Returns:
        A tuple ``(e, n, statistics)`` for the winning combination, where
        ``statistics`` is the result of ``calculate_statistics`` for that
        combination, or ``None`` when no combination qualifies.
    """
    lower_is_better = metric_name in ['MAE', 'Median Absolute Error', 'RMSE']
    higher_is_better = metric_name == 'R2 Score'

    winner = None
    incumbent = float('inf') if lower_is_better else float('-inf')

    for epochs, by_neurons in d_values.items():
        for neurons, recorded in by_neurons.items():
            if metric_name not in recorded:
                continue

            summary = calculate_statistics(recorded)
            candidate = summary[metric_name]['Média']

            if lower_is_better:
                improved = candidate < incumbent
            elif higher_is_better:
                improved = candidate > incumbent
            else:
                improved = False

            if improved:
                incumbent = candidate
                winner = (epochs, neurons, summary)

    return winner

# Metrics reported for every dataset
metrics_of_interest = ['MAE', 'Median Absolute Error', 'RMSE', 'R2 Score']

# For each dataset, print the statistics of the best (e, n) pair of every metric
for dataset_name, d_values in datasets.items():
    print(f"Melhores resultados para o dataset: {dataset_name}")
    for metric_name in metrics_of_interest:
        best_e_n = find_best_e_n_for_metric(dataset_name, d_values, metric_name)
        if not best_e_n:
            # Nothing recorded for this metric in this dataset
            continue

        e, n, statistics = best_e_n
        print(f"  Melhor para a métrica {metric_name}: e = {e}, n = {n}")
        for label in ('Média', 'Mínimo', 'Máximo', 'Desvio Padrão'):
            print(f"    {label}: {statistics[metric_name][label]:.4f}")
        print()  # Blank line between metrics

# Debug aid (disabled): print("Número de iterações das métricas: " + str(cont))

# Close the report streams opened at the top of the cell
sys.stdout.close()
sys.stderr.close()
