# Compute statistics for all benchmark results and save them as Markdown.

In [None]:
import glob
import pandas as pd
from util.evaluate_helper import get_experiments_md, extract_single_alphabet_answer

# Recompute the "pred" column of every CSV under results/ and write each file
# back in place. A problem with one file is reported and the loop moves on.
csv_files = glob.glob("results/*.csv")

for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        if not df.empty:
            # axis=1: the helper is called once per row.
            df["pred"] = df.apply(extract_single_alphabet_answer, axis=1)
            # Overwrites the source file; the index is not written out.
            df.to_csv(csv_file, index=False)
            print(f"✓ Processed {csv_file}")
        else:
            print(f"⚠️  Skipped empty file: {csv_file}")
    except pd.errors.EmptyDataError:
        # Same message as the empty-DataFrame branch above.
        print(f"⚠️  Skipped empty file: {csv_file}")
    except pd.errors.ParserError as err:
        print(f"❌ Corrupted CSV file (skipped): {csv_file}")
        # Only the first 100 characters of the message are shown.
        print(f"   Error: {str(err)[:100]}")
    except Exception as err:
        # Catch-all so that a single bad file never aborts the whole batch.
        print(f"❌ Error processing file (skipped): {csv_file}")
        print(f"   Error: {type(err).__name__}: {str(err)[:100]}")

## CLIcK

In [None]:
# Map each model label to its CLIcK results CSV and hand the mapping to
# get_experiments_md; the returned value is kept in click_md.
dataset = "CLIcK"
csv_path_dict = {
    model: f"results/[{dataset}] {stem}.csv"
    for model, stem in (
        # (label, file-name stem)
        ("GPT-5-chat", "gpt-5-chat-2025-08-08"),
        ("GPT-5-mini", "gpt-5-mini-2025-08-08"),
        ("GPT-5-nano", "gpt-5-nano-2025-08-08"),
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
click_md = get_experiments_md(dataset, csv_path_dict)

## HAERAE 1.0

In [None]:
# Map each model label to its HAERAE results CSV and hand the mapping to
# get_experiments_md; the returned value is kept in haerae_md.
dataset = "HAERAE"
csv_path_dict = {
    model: f"results/[{dataset}] {stem}.csv"
    for model, stem in (
        # (label, file-name stem)
        ("GPT-5-chat", "gpt-5-chat-2025-08-08"),
        ("GPT-5-mini", "gpt-5-mini-2025-08-08"),
        ("GPT-5-nano", "gpt-5-nano-2025-08-08"),
        ("Phi-4", "Phi-4"),
        # NOTE(review): this stem is spelled "Moe" while the other benchmark
        # cells use "MoE" - confirm against the actual file name on disk.
        ("Phi-3.5-MoE-instruct", "Phi-3-5-Moe-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
haerae_md = get_experiments_md(dataset, csv_path_dict)

## KMMLU

### zero-shot

In [None]:
# Map each model label to its KMMLU zero-shot results CSV and hand the mapping
# to get_experiments_md; the returned value is kept in kmmlu_md.
dataset = "KMMLU"
csv_path_dict = {
    model: f"results/[{dataset}] {stem}.csv"
    for model, stem in (
        # (label, file-name stem); only the first four stems carry "-0shot".
        ("GPT-5-chat", "gpt-5-chat-2025-08-08-0shot"),
        ("GPT-5-mini", "gpt-5-mini-2025-08-08-0shot"),
        ("GPT-5-nano", "gpt-5-nano-2025-08-08-0shot"),
        ("Phi-4", "Phi-4-0shot"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
kmmlu_md = get_experiments_md(dataset, csv_path_dict)

### 5-shot

In [None]:
# Map each model label to its KMMLU 5-shot results CSV. Every file name ends
# in "-<postfix>", and the same postfix is passed on to get_experiments_md.
dataset = "KMMLU"
postfix = "5shot"

csv_path_dict = {
    model: f"results/[{dataset}] {stem}-{postfix}.csv"
    for model, stem in (
        # (label, file-name stem without the postfix)
        # ("GPT-5-chat", "gpt-5-chat-2025-08-08"),  # disabled entry
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
kmmlu_5shot_md = get_experiments_md(dataset, csv_path_dict, postfix)

## KMMLU-HARD

### zero-shot

In [None]:
import os

# Pre-check of the Phi-4 zero-shot file: load it when it exists and is
# non-empty, otherwise just report the problem.
# NOTE(review): `result` is not used anywhere in the visible code, and this
# check does not stop get_experiments_md below from being called - confirm
# whether it is still needed.
csv_path = "results/[KMMLU-HARD] Phi-4-0shot.csv"
if os.path.exists(csv_path) and os.path.getsize(csv_path) != 0:
    result = pd.read_csv(csv_path)
else:
    print(f"File {csv_path} does not exist or is empty.")

# Map each model label to its KMMLU-HARD zero-shot results CSV and hand the
# mapping to get_experiments_md; the returned value is kept in kmmlu_hard_md.
dataset = "KMMLU-HARD"
csv_path_dict = {
    model: f"results/[{dataset}] {stem}.csv"
    for model, stem in (
        # (label, file-name stem); only the Phi-4 stem carries "-0shot".
        ("Phi-4", "Phi-4-0shot"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
kmmlu_hard_md = get_experiments_md(dataset, csv_path_dict)

### 5-shot

In [None]:
# Map each model label to its KMMLU-HARD 5-shot results CSV. Every file name
# ends in "-<postfix>", and the same postfix is passed on to
# get_experiments_md.
dataset = "KMMLU-HARD"
postfix = "5shot"

csv_path_dict = {
    model: f"results/[{dataset}] {stem}-{postfix}.csv"
    for model, stem in (
        # (label, file-name stem without the postfix)
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
kmmlu_hard_5shot_md = get_experiments_md(dataset, csv_path_dict, postfix)

## Save to Markdown

In [None]:
# Write the report to DETAILED_RESULTS.md: a heading, then each enabled
# benchmark section followed by a blank line.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's locale default (which can fail on non-ASCII text such as the
# symbols or Korean labels the sections may contain).
with open("DETAILED_RESULTS.md", "w", encoding="utf-8") as f:
    f.write("## Detailed Results\n\n")
    f.write(click_md)
    f.write("\n\n")
    f.write(haerae_md)
    f.write("\n\n")
    f.write(kmmlu_md)
    f.write("\n\n")
    # The remaining sections are computed in earlier cells but are currently
    # left out of the report; uncomment to include them.
    # f.write(kmmlu_5shot_md)
    # f.write("\n\n")
    # f.write(kmmlu_hard_md)
    # f.write("\n\n")
    # f.write(kmmlu_hard_5shot_md)