In [38]:
import os
import glob
import re
import argparse
import pandas as pd
import numpy as np

In [39]:
results_path = "/home/ubuntu/Bio_Clinical_MBERT/scripts/outputs/HOC"

In [40]:
def parse_filename(filename):
    """
    Extract hyperparameters from the filename.
    Expected format: lr=<lr>_wd=<wd>_epochs=<epochs>_seed=<seed>_effective_batch_size=<effective_bs>.txt
    """
    pattern = r"lr=([^_]+)_wd=([^_]+)_epochs=([^_]+)_seed=([^_]+)_effective_batch_size=([^\.]+)"
    match = re.search(pattern, filename)
    if match:
        lr, wd, epochs, seed, effective_bs = match.groups()
        return {
            "lr": float(lr),
            "wd": float(wd),
            "epochs": int(epochs),
            "seed": int(seed),
            "effective_batch_size": int(effective_bs)
        }
    else:
        return None

def parse_log_file(filepath):
    """
    Parse the log file content, which is assumed to contain lines of the form 'Key: Value'.
    
    This function ignores lines with keys in the ignore list so that the DataFrame doesn't pick up
    headers or unwanted entries.
    """
    ignore_keys = {"test evaluation results", "epoch", "seed", "effective batch size", "eval_samples_per_second", "eval_steps_per_second", "eval_loss"}
    data = {}
    with open(filepath, 'r') as f:
        lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        # Skip lines that are empty or do not contain a colon.
        if not line or ":" not in line:
            continue
        key, value = line.split(":", 1)
        key = key.strip()
        # If the lowercase key is in the ignore list, skip it.
        if key.lower() in ignore_keys:
            continue
        value = value.strip()
        # If the value is an empty string, set it to np.nan.
        if value == "":
            data[key] = np.nan
        else:
            # Try to convert to a float if possible.
            try:
                # If value contains a decimal point then convert to float, otherwise try int.
                if "." in value:
                    data[key] = float(value)
                else:
                    data[key] = int(value)
            except ValueError:
                data[key] = value  # keep as string if conversion fails
    return data

In [41]:
records = []

# Recursively find all .txt files in the directory.
filepaths = glob.glob(os.path.join(results_path, "**", "*.txt"), recursive=True)
if not filepaths:
    print(f"No log files found in {results_path}")

for filepath in filepaths:
    filename = os.path.basename(filepath)
    # Assume that the model name is the immediate parent folder.
    model = os.path.basename(os.path.dirname(filepath))
    
    params = parse_filename(filename)
    if params is None:
        print(f"Filename {filename} does not match the expected pattern. Skipping file.")
        continue
    
    log_data = parse_log_file(filepath)
    
    # Combine the data from the filename, file content, and model name.
    record = {
        "Model": model,
        **params,  # Contains keys: lr, wd, epochs, seed, effective_batch_size
    }
    record.update(log_data)
    records.append(record)

# Create a DataFrame from the records.
df = pd.DataFrame(records)
print("Individual results:")
print(df.head(), "\n")

# Group on Model, lr, wd, and effective_batch_size.
group_cols = ["Model", "lr", "wd", "effective_batch_size"]
ignore_cols = set(group_cols + ["seed", "epochs"])
metric_cols = [col for col in df.columns if col not in ignore_cols]

# Convert each metric column to numeric (coercing errors to NaN).
for col in metric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Define aggregation: for each metric, compute mean, median, min, and max.
agg_funcs = {col: ["mean", "median", "min", "max"] for col in metric_cols}
grouped = df.groupby(group_cols).agg(agg_funcs)

# Flatten the multi-level column index.
grouped.columns = ["_".join(col).strip() for col in grouped.columns.values]
grouped = grouped.reset_index()

for col in grouped.columns:
    if "f1" in col.lower():
        grouped[col] = (grouped[col] * 100).round(1)


print("Aggregated results over seeds:")
print(grouped.head())

# Optionally, save the aggregated DataFrame to CSV.
output_csv = os.path.join(results_path, "aggregated_results.csv")
grouped.to_csv(output_csv, index=False)
print(f"\nAggregated results saved to {output_csv}")

Individual results:
             Model       lr       wd  epochs  seed  effective_batch_size  \
0  ModernBERT-base  0.00002  0.00001       5    44                    16   
1  ModernBERT-base  0.00002  0.00001       5    45                    16   
2  ModernBERT-base  0.00005  0.00001       5    42                    16   
3  ModernBERT-base  0.00002  0.00001       5    43                    16   
4  ModernBERT-base  0.00005  0.00001       5    46                    16   

   eval_weighted_f1  eval_micro_f1  eval_runtime  Training duration (seconds)  \
0          0.645375       0.657933       54.8963                      1290.60   
1          0.628594       0.634454       56.0935                      1287.29   
2          0.665773       0.669661       50.9906                      1275.34   
3          0.667644       0.672373       55.4254                      1289.49   
4          0.660877       0.663371       54.1044                      1256.63   

   Evaluation duration (seconds)  Be

In [42]:
grouped

Unnamed: 0,Model,lr,wd,effective_batch_size,eval_weighted_f1_mean,eval_weighted_f1_median,eval_weighted_f1_min,eval_weighted_f1_max,eval_micro_f1_mean,eval_micro_f1_median,...,Training duration (seconds)_min,Training duration (seconds)_max,Evaluation duration (seconds)_mean,Evaluation duration (seconds)_median,Evaluation duration (seconds)_min,Evaluation duration (seconds)_max,Best epoch selected_mean,Best epoch selected_median,Best epoch selected_min,Best epoch selected_max
0,ModernBERT-base,2e-05,1e-05,16,64.2,63.9,62.9,66.8,65.0,64.9,...,1280.38,1291.09,55.524,55.43,54.9,56.1,,,,
1,ModernBERT-base,5e-05,1e-05,16,66.5,66.1,65.7,68.0,67.0,67.0,...,1256.63,1285.79,46.328,51.66,21.35,54.11,,,,
2,biobert-v1.1,2e-05,1e-05,16,71.6,72.2,70.0,73.1,72.0,72.2,...,2774.45,2779.88,44.88,45.03,44.45,45.34,,,,
3,biobert-v1.1,5e-05,1e-05,16,71.4,71.5,69.7,73.0,71.6,71.6,...,2766.15,2779.91,43.924,43.62,43.41,45.04,,,,
4,biobert-v1.1,9e-05,1e-05,16,70.1,70.0,69.6,70.4,70.3,70.1,...,2759.77,2779.8,42.382,42.61,41.77,42.65,,,,
5,biomed_roberta_base,2e-05,1e-05,16,71.5,71.5,70.6,72.4,71.6,71.6,...,2773.51,2797.31,41.27,41.24,40.09,43.06,,,,
6,biomed_roberta_base,5e-05,1e-05,16,71.0,70.9,70.2,71.7,71.0,71.0,...,2778.82,2789.62,40.294,44.32,33.04,45.59,,,,
7,modernbert_phase2_bio_1ep_decay_after_3ep_15mr,2e-05,1e-05,16,67.9,67.8,67.3,68.6,68.3,68.3,...,1263.84,1281.37,53.924,55.29,51.32,55.82,,,,
8,modernbert_phase2_bio_1ep_decay_after_3ep_15mr,5e-05,1e-05,16,67.2,67.7,66.1,68.1,67.8,68.2,...,1264.11,1277.06,54.852,54.65,53.45,56.66,,,,
9,modernbert_phase2_bio_1ep_decay_after_3ep_15mr,7e-05,1e-05,16,68.4,68.8,67.1,69.1,68.8,69.3,...,1261.41,1280.18,54.114,55.24,49.81,55.47,,,,
