In [15]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both arguments are converted to NumPy arrays. Each error
    ``y_true - y_pred`` is divided by the matching true value, the absolute
    values are averaged, and the mean is multiplied by 100.

    Zero entries in ``y_true`` are not special-cased, so they go through the
    division as-is.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


# Base directory: run directories are looked up directly under the current
# working directory.
base_dir = os.getcwd()
# Initialize a list to hold the data (one dict per evaluated batch)
data = []


def parse_run_dir_name(dir_name):
    """Read the hyperparameters encoded in a run directory name.

    The name is split on '-'; field 0 is ignored and fields 1-4 are read as
    ``output_dim`` (int), ``hidden_dim`` (int), ``num_layers`` (int) and
    ``dropout`` (float), returned as a tuple in that order.

    Raises:
        IndexError: the name has fewer than five '-'-separated fields.
        ValueError: one of the fields is not numeric.
    """
    fields = dir_name.split('-')
    return int(fields[1]), int(fields[2]), int(fields[3]), float(fields[4])


def evaluate_batch_dir(batch_path):
    """Score the predictions stored in one batch directory.

    Expects ``pred_label.npy``, ``true_label.npy`` and ``num_param.txt`` inside
    ``batch_path``. Returns ``None`` when any of the three is missing, otherwise
    a dict with the keys ``num_param``, ``MAE``, ``MAPE`` and ``RMSE``.
    """
    pred_label_path = os.path.join(batch_path, 'pred_label.npy')
    true_label_path = os.path.join(batch_path, 'true_label.npy')
    num_param_path = os.path.join(batch_path, 'num_param.txt')

    required_files = (pred_label_path, true_label_path, num_param_path)
    if not all(os.path.exists(path) for path in required_files):
        return None

    pred_label = np.load(pred_label_path)
    true_label = np.load(true_label_path)
    with open(num_param_path, 'r') as f:
        num_param = int(f.read().strip())

    return {
        'num_param': num_param,
        'MAE': mean_absolute_error(true_label, pred_label),
        'MAPE': mean_absolute_percentage_error(true_label, pred_label),
        'RMSE': np.sqrt(mean_squared_error(true_label, pred_label)),
    }


# Iterate over all subdirectories. os.listdir() returns entries in arbitrary
# order, so both listings are sorted to make the collected records (and the
# order of tied rows after the later sort) reproducible across machines.
for directory in sorted(os.listdir(base_dir)):
    dir_path = os.path.join(base_dir, directory)
    if not os.path.isdir(dir_path):
        continue

    try:
        # Extract parameters from the directory name
        output_dim, hidden_dim, num_layers, dropout = parse_run_dir_name(directory)

        # Navigate into 'XJTU results'
        xjtu_results_path = os.path.join(dir_path, 'XJTU results')
        if not os.path.isdir(xjtu_results_path):
            continue

        # Iterate over batch subdirectories; the subdirectory name is used
        # verbatim as the batch identifier.
        for batch_dir in sorted(os.listdir(xjtu_results_path)):
            batch_path = os.path.join(xjtu_results_path, batch_dir)
            if not os.path.isdir(batch_path):
                continue

            scores = evaluate_batch_dir(batch_path)
            if scores is None:
                # Incomplete batch directory: skip it without a message.
                continue

            # Append data
            data.append({
                'batch': batch_dir,
                'output_dim': output_dim,
                'hidden_dim': hidden_dim,
                'num_layer': num_layers,
                'dropout': dropout,
                'num_param': scores['num_param'],
                'MAE': scores['MAE'],
                'MAPE': scores['MAPE'],
                'RMSE': scores['RMSE']
            })

    except Exception as e:
        # Best effort: a directory whose name does not follow the expected
        # pattern, or whose files cannot be read, is reported and skipped so
        # that one bad directory does not abort the whole scan.
        print(f"Error processing directory {directory}: {e}")

# Score the reviewer's reference results for the six batches "0-0" .. "5-5"
# and add them to `data`. The hyperparameter columns are filled with the
# sentinel -1, which is what marks these rows as the baseline.
for i in range(6):
    path1 = f"../results of reviewer/XJTU results/{i}-{i}/"

    red_label = np.load(path1 + "/pred_label.npy")
    true_label = np.load(path1 + "/true_label.npy")

    baseline_scores = {
        'MAE': mean_absolute_error(true_label, red_label),
        'MAPE': mean_absolute_percentage_error(true_label, red_label),
        'RMSE': np.sqrt(mean_squared_error(true_label, red_label)),
    }

    data.append({
        'batch': f"{i}-{i}",
        'output_dim': -1,
        'hidden_dim': -1,
        'num_layer': -1,
        'dropout': -1,
        # NOTE(review): hard-coded; presumably the parameter count of the
        # reviewer's model -- confirm.
        'num_param': 13662,
        **baseline_scores,
    })

# Turn the collected records into one table, ordered by batch and, within each
# batch, by ascending RMSE.
df = pd.DataFrame(data).sort_values(by=['batch', 'RMSE'], ascending=True)

# Write the table to disk without the index column
df.to_csv('our-results.csv', index=False)

In [16]:
# Define a function to check if a model wins over the legacy model
def check_win(row, base_row):
    """Tell whether ``row`` beats ``base_row`` on at least two of three metrics.

    The metrics compared are 'MAE', 'MAPE' and 'RMSE'. A metric counts as a
    win only when the value in ``row`` is strictly lower, so ties do not count.
    """
    wins = 0
    for metric in ('MAE', 'MAPE', 'RMSE'):
        wins += row[metric] < base_row[metric]
    return wins >= 2

# Columns that identify one hyperparameter configuration
hyperparameter_columns = ['output_dim', 'hidden_dim', 'num_layer', 'dropout', 'num_param']

# Group the data by batches
batch_groups = df.groupby('batch')

# List to collect results: one hyperparameter tuple per (batch, model) pair in
# which the model beats the legacy model
results = []

# Iterate over each batch
for batch, group in batch_groups:
    # The legacy (base) model is the row whose dropout is the sentinel -1
    is_legacy = group['dropout'] == -1
    legacy_rows = group[is_legacy]

    # Exactly one legacy row is required to compare against. With zero or
    # several, the previous `.squeeze()` returned a DataFrame rather than a
    # Series and the comparison failed with an opaque "truth value is
    # ambiguous" error, so such batches are reported and skipped instead.
    if len(legacy_rows) != 1:
        print(f"Skipping batch {batch}: expected exactly one legacy row, found {len(legacy_rows)}")
        continue
    base_model_row = legacy_rows.iloc[0]

    # Iterate over the other rows within the same batch
    for _, row in group[~is_legacy].iterrows():
        # Check if the current model wins over the base model
        if check_win(row, base_model_row):
            # Extract hyperparameters and append to results
            results.append(tuple(row[hyperparameter_columns]))

# Create a DataFrame from the results with a count of wins: every collected
# tuple contributes 1, and summing per configuration gives the number of
# batches that configuration won. Configurations with no win do not appear.
summary_df = pd.DataFrame(results, columns=hyperparameter_columns)
summary_df['win_over_legacy'] = 1
summary_df = summary_df.groupby(hyperparameter_columns).sum().reset_index()

# Sort the new DataFrame by `win_over_legacy` in descending order
sorted_summary_df = summary_df.sort_values(by='win_over_legacy', ascending=False)

# Save the resulting DataFrame
sorted_summary_df.to_csv("comparison.csv", index=False)

In [17]:
# Average each error metric over all rows that share the same hyperparameter
# configuration.
config_columns = ['output_dim', 'hidden_dim', 'num_layer', 'dropout', 'num_param']
mean_metric_specs = {
    f'mean_{metric}': pd.NamedAgg(column=metric, aggfunc='mean')
    for metric in ('MAE', 'MAPE', 'RMSE')
}
grouped_df = df.groupby(config_columns).agg(**mean_metric_specs).reset_index()

# Order configurations from lowest to highest mean RMSE
sorted_df = grouped_df.sort_values(by='mean_RMSE')

# Save the resulting table without the index column
sorted_df.to_csv("comparison-2.csv", index=False)