# Compute statistics for all benchmark results and save them as Markdown.

In [21]:
import glob
import pandas as pd
from util.evaluate_helper import get_experiments_md
import re
import numpy as np

def extract_single_alphabet_answer(row):
    pred = row["pred"]
    response = row["response"]

    if (
        isinstance(pred, float)
        and np.isnan(pred)
        or (isinstance(pred, str) and len(pred.strip()) == 0)
    ):
        # response도 문자열인지 확인
        if isinstance(response, str) and len(response.strip()) > 0:
            match = re.search(r"정답(?: \(Answer\))?: (\w)", response)
            return match.group(1) if match else None
        else:
            return None
    else:
        return pred

csv_files = glob.glob("results/*.csv")

for file_path in csv_files:
    try:
        df = pd.read_csv(file_path)
        if df.empty:
            print(f"Skipped empty file: {file_path}")
            continue

        df["pred"] = df.apply(extract_single_alphabet_answer, axis=1)
        df.to_csv(file_path, index=False)
        print(f"Processed {file_path}")

    except pd.errors.EmptyDataError:
        print(f"Skipped empty file (EmptyDataError): {file_path}")

Processed results/[KMMLU] Phi-3-5-MoE-instruct.csv
Processed results/[KMMLU] Phi-3-5-MoE-instruct-5shot.csv
Processed results/[KMMLU-HARD] gpt-4o-240513-5shot.csv
Processed results/[KMMLU] Phi-3-5-mini-instruct.csv
Processed results/[HAERAE] llama-3-1-8b-instruct.csv
Processed results/[CLIcK] gpt-5.1-chat-2025-11-13.csv
Processed results/[KMMLU] gpt-4.1-mini-2025-04-14.csv
Processed results/[KMMLU] Phi-4-0shot.csv
Processed results/[KMMLU-HARD] Phi-3-5-MoE-instruct.csv
Processed results/[KMMLU] gpt-5-chat-2025-08-08-0shot.csv
Processed results/[KMMLU] gpt-4.1-2025-04-14.csv
Processed results/[KMMLU] gpt-5.1-chat-2025-11-13-0shot.csv
Processed results/[KMMLU] gpt-5-mini-2025-08-08-0shot.csv
Processed results/[CLIcK] gpt-4.1-2025-04-14.csv
Processed results/[KMMLU-HARD] Phi-3-mini-128k-June.csv
Processed results/[KMMLU-HARD] Phi-4-0shot.csv
Processed results/[CLIcK] Phi-3-mini-128k-June.csv
Processed results/[HAERAE] Phi-3-5-mini-instruct.csv
Processed results/[KMMLU-HARD] gpt-5.1-2025-1

## CLIcK

In [15]:
dataset = "CLIcK"
csv_path_dict = {
    "GPT-5.1-medium": f"results/[{dataset}] gpt-51-medium-2025-11-13.csv",
    "GPT-5.1": f"results/[{dataset}] gpt-5.1-2025-11-13.csv",
    "GPT-5.1-chat": f"results/[{dataset}] gpt-5.1-chat-2025-11-13.csv",
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08.csv",
    "Phi-4": f"results/[{dataset}] Phi-4.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
click_md = get_experiments_md(dataset, csv_path_dict)

Excluding FAILED responses from accuracy calculation
Evaluating on 1796 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1826 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1816 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1816 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1856 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1846 valid responses


## HAERAE 1.0

In [16]:
dataset = "HAERAE"
csv_path_dict = {
    "GPT-5.1-medium": f"results/[{dataset}] gpt-51-medium-2025-11-13.csv",
    "GPT-5.1": f"results/[{dataset}] gpt-5.1-2025-11-13.csv",
    "GPT-5.1-chat": f"results/[{dataset}] gpt-5.1-chat-2025-11-13.csv",
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08.csv",
    "Phi-4": f"results/[{dataset}] Phi-4.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-Moe-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
haerae_md = get_experiments_md(dataset, csv_path_dict)

Excluding FAILED responses from accuracy calculation
Evaluating on 1498 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1518 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1508 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1448 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1508 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 1528 valid responses


## KMMLU

### zero-shot

In [23]:
dataset = "KMMLU"
csv_path_dict = {
    "GPT-5.1-medium": f"results/[{dataset}] gpt-51-medium-2025-11-13-0shot.csv",
    "GPT-5.1": f"results/[{dataset}] gpt-5.1-2025-11-13-0shot.csv",
    "GPT-5.1-chat": f"results/[{dataset}] gpt-5.1-chat-2025-11-13-0shot.csv",
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08-0shot.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08-0shot.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08-0shot.csv",
    "Phi-4": f"results/[{dataset}] Phi-4-0shot.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
kmmlu_md = get_experiments_md(dataset, csv_path_dict)

Excluding FAILED responses from accuracy calculation
Evaluating on 33710 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 34330 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 34100 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 34420 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 34040 valid responses


Excluding FAILED responses from accuracy calculation
Evaluating on 34510 valid responses


### 5-shot

In [13]:
dataset = "KMMLU"
postfix = "5shot"

csv_path_dict = {
    "GPT-5.1": f"results/[{dataset}] gpt-5.1-2025-11-13.csv",
    "GPT-5.1-chat": f"results/[{dataset}] gpt-5.1-chat-2025-11-13.csv",
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08-5shot.csv",
    "Phi-4": f"results/[{dataset}] Phi-4-{postfix}.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct-{postfix}.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct-{postfix}.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June-{postfix}.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct-{postfix}.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513-{postfix}.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718-{postfix}.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409-{postfix}.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613-{postfix}.csv",
}
kmmlu_5shot_md = get_experiments_md(dataset, csv_path_dict, postfix)

FileNotFoundError: [Errno 2] No such file or directory: 'results/[KMMLU] gpt-5.1-2025-11-13.csv'

## KMMLU-HARD

### zero-shot

In [19]:
import os
csv_path = "results/[KMMLU-HARD] Phi-4-0shot.csv"
if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
    print(f"File {csv_path} does not exist or is empty.")
else:
    result = pd.read_csv(csv_path)

dataset = "KMMLU-HARD"
csv_path_dict = {
    "GPT-5.1-medium": f"results/[{dataset}] gpt-51-medium-2025-11-13-0shot.csv",
    "GPT-5.1": f"results/[{dataset}] gpt-5.1-2025-11-13-0shot.csv",
    "GPT-5.1-chat": f"results/[{dataset}] gpt-5.1-chat-2025-11-13-0shot.csv",
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08-0shot.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08-0shot.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08-0shot.csv",
    "Phi-4": f"results/[{dataset}] Phi-4-0shot.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
kmmlu_hard_md = get_experiments_md(dataset, csv_path_dict)

Excluding FAILED responses from accuracy calculation
Evaluating on 4024 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 4064 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 4014 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 4024 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 4044 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 4064 valid responses


### 5-shot

In [5]:
dataset = "KMMLU-HARD"
postfix = "5shot"

csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4-{postfix}.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct-{postfix}.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct-{postfix}.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June-{postfix}.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct-{postfix}.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513-{postfix}.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718-{postfix}.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409-{postfix}.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613-{postfix}.csv",
}
kmmlu_hard_5shot_md = get_experiments_md(dataset, csv_path_dict, postfix)

## Save to the Markdown

In [24]:
with open("DETAILED_RESULTS.md", "w") as f:
    f.write("## Detailed Results\n\n")
    f.write(click_md)
    f.write("\n\n")
    f.write(haerae_md)
    f.write("\n\n")
    f.write(kmmlu_md)
    f.write("\n\n")
    # f.write(kmmlu_5shot_md)
    # f.write("\n\n")
    f.write(kmmlu_hard_md)
    f.write("\n\n")
    # f.write(kmmlu_hard_5shot_md)