# Korean LLM Evaluation Results Analysis

이 노트북은 CLIcK, HAE-RAE, KMMLU, KMMLU-HARD 데이터셋에 대한 평가 결과를 분석합니다.

## 데이터 전처리

모든 결과 파일의 `pred` 컬럼을 정규화하여 단일 알파벳 답변을 추출합니다. 정규화된 결과는 원본 CSV 파일에 그대로 덮어씁니다.

In [9]:
import glob
import pandas as pd
from util.evaluate_helper import get_experiments_md, extract_single_alphabet_answer

# Rewrite the `pred` column of every CSV under results/ and save each file
# back to the same path. A failure on one file is reported and the loop moves
# on to the next file.
csv_files = glob.glob("results/*.csv")

for file_path in csv_files:
    try:
        df = pd.read_csv(file_path)
        if not df.empty:
            # axis=1 hands each row to the helper; its return value becomes
            # that row's new `pred` entry.
            df["pred"] = df.apply(extract_single_alphabet_answer, axis=1)
            df.to_csv(file_path, index=False)
            print(f"✓ Processed {file_path}")
        else:
            # Parsed fine but holds no rows: leave the file untouched.
            print(f"⚠️  Skipped empty file: {file_path}")
    except pd.errors.EmptyDataError:
        # read_csv found nothing to parse in the file.
        print(f"⚠️  Skipped empty file: {file_path}")
    except pd.errors.ParserError as parse_err:
        print(f"❌ Corrupted CSV file (skipped): {file_path}")
        # Only the first 100 characters of the message are shown.
        print(f"   Error: {str(parse_err)[:100]}")
    except Exception as err:
        # Catch-all so one bad file never aborts the remaining files.
        print(f"❌ Error processing file (skipped): {file_path}")
        print(f"   Error: {type(err).__name__}: {str(err)[:100]}")

✓ Processed results/[HAERAE] gpt-5.1-chat-2025-11-13.csv
✓ Processed results/[KMMLU] gpt-5.1-2025-11-13-0shot.csv
✓ Processed results/[KMMLU-HARD] llama-3-1-8b-instruct.csv
✓ Processed results/[KMMLU-HARD] gpt-51-medium-2025-11-13-0shot.csv
✓ Processed results/[KMMLU-HARD] gpt-4-turbo-240409-5shot.csv
✓ Processed results/[CLIcK] gpt-35-turbo-230613.csv


✓ Processed results/[CLIcK] Phi-3-5-MoE-instruct.csv
✓ Processed results/[CLIcK] llama-3-1-8b-instruct.csv
✓ Processed results/[KMMLU] gpt-5.1-chat-2025-11-13-0shot.csv
✓ Processed results/[KMMLU-HARD] gpt-35-turbo-230613-5shot.csv
✓ Processed results/[CLIcK] gpt-4.1-nano-2025-04-14.csv
✓ Processed results/[KMMLU] gpt-4-turbo-240409-5shot.csv
✓ Processed results/[CLIcK] gpt-5-chat-2025-08-08.csv
✓ Processed results/[KMMLU-HARD] gpt-4.1-2025-04-14.csv
✓ Processed results/[CLIcK] gpt-5.1-chat-2025-11-13.csv
✓ Processed results/[KMMLU-HARD] gpt-4.1-nano-2025-04-14.csv
✓ Processed results/[KMMLU] Phi-3-5-mini-instruct.csv
✓ Processed results/[HAERAE] gpt-51-medium-2025-11-13.csv
✓ Processed results/[KMMLU-HARD] Phi-3-5-mini-instruct-5shot.csv
✓ Processed results/[CLIcK] Phi-3-mini-128k-June.csv
✓ Processed results/[HAERAE] gpt-4-turbo-240409.csv
✓ Processed results/[KMMLU] gpt-4.1-nano-2025-04-14.csv
✓ Processed results/[KMMLU] gpt-4.1-2025-04-14.csv
✓ Processed results/[KMMLU-HARD] gpt-35

## 재평가 (선택사항)

필요한 경우 아래 코드의 주석을 해제하여 특정 결과 파일을 재평가할 수 있습니다. 단, 마지막의 KorMedMCQA 재평가 호출은 주석 처리되어 있지 않으므로 셀을 실행하면 바로 수행됩니다.

In [None]:

# Run the benchmark evaluators directly from Python.
from benchmarks.haerae_main import evaluate as haerae_evaluate
from benchmarks.click_main import evaluate as click_evaluate
from benchmarks.kmmlu_main import evaluate as kmmlu_evaluate

from benchmarks.kormedmcqa_main import evaluate as kormedmcqa_evaluate

# Example: re-evaluate a CLIcK result file
# click_evaluate("results/[CLIcK] gpt-51-medium-2025-11-13.csv")

# Example: re-evaluate a HAERAE result file
# haerae_evaluate("results/[HAERAE] gpt-51-medium-2025-11-13.csv")

# Example: re-evaluate KMMLU result files
# kmmlu_evaluate("results/[KMMLU] nova2-lite-2025-12-05.csv", dataset="KMMLU", verbose=True)
# kmmlu_evaluate("results/[KMMLU] gpt-51-medium-2025-11-13-0shot.csv", dataset="KMMLU", verbose=True)

# NOTE(review): unlike the examples above, this call is NOT commented out, so
# it runs every time the cell is executed -- confirm that this is intended.
kormedmcqa_evaluate("results/[KorMedMCQA] qwen3-next-2025-12-11.csv", dataset="KOR-MEDMCQA", verbose=True)

: 

## CLIcK

### Open source models

In [None]:
dataset = "CLIcK"
# Display name -> file stem; the "results/[<dataset>] " prefix and ".csv"
# suffix are attached when the path dictionary is built below.
model_stems = {
    "gpt-oss-120b": "gpt-oss-120b-2025-08-05",
    "Phi-4": "Phi-4",
    "Phi-3.5-MoE-instruct": "Phi-3-5-MoE-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for the open-source models.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

### Proprietary models

In [None]:
dataset = "CLIcK"
# Display name -> file stem; the "results/[<dataset>] " prefix and ".csv"
# suffix are attached when the path dictionary is built below.
model_stems = {
    "GPT-5-chat": "gpt-5-chat-2025-08-08",
    "GPT-5-mini": "gpt-5-mini-2025-08-08",
    "GPT-5-nano": "gpt-5-nano-2025-08-08",
    "GPT-4.1": "gpt-4.1-2025-04-14",
    "GPT-4.1-mini": "gpt-4.1-mini-2025-04-14",
    "GPT-4.1-nano": "gpt-4.1-nano-2025-04-14",
    "GPT-4o": "gpt-4o-240513",
    "GPT-4o-mini": "gpt-4o-mini-240718",
    "GPT-4-turbo": "gpt-4-turbo-240409",
    "GPT-3.5-turbo": "gpt-35-turbo-230613",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for the proprietary models.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

## HAERAE 1.0

### Open source models

In [None]:
dataset = "HAERAE"
# Display name -> file stem; the "results/[<dataset>] " prefix and ".csv"
# suffix are attached when the path dictionary is built below.
model_stems = {
    "gpt-oss-120b": "gpt-oss-120b-2025-08-05",
    "Phi-4": "Phi-4",
    # NOTE(review): this stem is spelled "Moe" while the other datasets use
    # "MoE"; on a case-sensitive filesystem the two differ -- confirm the
    # spelling against the actual file in results/.
    "Phi-3.5-MoE-instruct": "Phi-3-5-Moe-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for the open-source models.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

### Proprietary models

In [None]:
dataset = "HAERAE"
# Display name -> file stem; the "results/[<dataset>] " prefix and ".csv"
# suffix are attached when the path dictionary is built below.
model_stems = {
    "GPT-5-chat": "gpt-5-chat-2025-08-08",
    "GPT-5-mini": "gpt-5-mini-2025-08-08",
    "GPT-5-nano": "gpt-5-nano-2025-08-08",
    "GPT-4.1": "gpt-4.1-2025-04-14",
    "GPT-4.1-mini": "gpt-4.1-mini-2025-04-14",
    "GPT-4.1-nano": "gpt-4.1-nano-2025-04-14",
    "GPT-4o": "gpt-4o-240513",
    "GPT-4o-mini": "gpt-4o-mini-240718",
    "GPT-4-turbo": "gpt-4-turbo-240409",
    "GPT-3.5-turbo": "gpt-35-turbo-230613",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for the proprietary models.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

## KMMLU

### Open source models

#### zero-shot

In [None]:
dataset = "KMMLU"
# Display name -> file stem for the zero-shot runs. Only the Phi-4 stem
# carries an explicit "-0shot" marker; the others have no shot suffix.
model_stems = {
    "Phi-4": "Phi-4-0shot",
    "Phi-3.5-MoE-instruct": "Phi-3-5-MoE-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these zero-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

#### 5-shot

In [None]:
dataset = "KMMLU"
postfix = "5shot"
# Display name -> file stem; every path below gets "-<postfix>.csv" appended.
model_stems = {
    "Phi-4": "Phi-4",
    "Phi-3.5-MoE-instruct": "Phi-3-5-MoE-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}-{postfix}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these 5-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

### Proprietary models

#### zero-shot

In [None]:
dataset = "KMMLU"
# Display name -> file stem for the zero-shot runs. The GPT-5 stems end in
# "-0shot"; the older models' stems have no shot suffix.
model_stems = {
    "GPT-5-chat": "gpt-5-chat-2025-08-08-0shot",
    "GPT-5-mini": "gpt-5-mini-2025-08-08-0shot",
    "GPT-5-nano": "gpt-5-nano-2025-08-08-0shot",
    "GPT-4.1": "gpt-4.1-2025-04-14",
    "GPT-4.1-mini": "gpt-4.1-mini-2025-04-14",
    "GPT-4.1-nano": "gpt-4.1-nano-2025-04-14",
    "GPT-4o": "gpt-4o-240513",
    "GPT-4o-mini": "gpt-4o-mini-240718",
    "GPT-4-turbo": "gpt-4-turbo-240409",
    "GPT-3.5-turbo": "gpt-35-turbo-230613",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these zero-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

#### 5-shot

In [None]:
dataset = "KMMLU"
postfix = "5shot"
# Display name -> file stem; every path below gets "-<postfix>.csv" appended.
# GPT-5-chat is intentionally absent: its entry was disabled, and it pointed
# at "gpt-5-chat-2025-08-08.csv" (a path without the 5shot suffix).
model_stems = {
    "GPT-4o": "gpt-4o-240513",
    "GPT-4o-mini": "gpt-4o-mini-240718",
    "GPT-4-turbo": "gpt-4-turbo-240409",
    "GPT-3.5-turbo": "gpt-35-turbo-230613",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}-{postfix}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these 5-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

## KMMLU-HARD

### Open source models

#### zero-shot

In [None]:
dataset = "KMMLU-HARD"
# Display name -> file stem for the zero-shot runs.
# Phi-4 is intentionally absent: its entry ("Phi-4-0shot.csv") was disabled.
model_stems = {
    "Phi-3.5-MoE-instruct": "Phi-3-5-MoE-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these zero-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

#### 5-shot

In [None]:
dataset = "KMMLU-HARD"
postfix = "5shot"
# Display name -> file stem; every path below gets "-<postfix>.csv" appended.
model_stems = {
    "Phi-4": "Phi-4",
    "Phi-3.5-MoE-instruct": "Phi-3-5-MoE-instruct",
    "Phi-3.5-mini-instruct": "Phi-3-5-mini-instruct",
    "Phi-3-mini-128k-instruct-June": "Phi-3-mini-128k-June",
    "Llama-3.1-8B-Instruct": "llama-3-1-8b-instruct",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}-{postfix}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these 5-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)

### Proprietary models

#### zero-shot

In [None]:
dataset = "KMMLU-HARD"
# Display name -> file stem for the zero-shot runs. The GPT-5 stems end in
# "-0shot"; the older models' stems have no shot suffix.
model_stems = {
    "GPT-5-chat": "gpt-5-chat-2025-08-08-0shot",
    "GPT-5-mini": "gpt-5-mini-2025-08-08-0shot",
    "GPT-5-nano": "gpt-5-nano-2025-08-08-0shot",
    "GPT-4.1": "gpt-4.1-2025-04-14",
    "GPT-4.1-mini": "gpt-4.1-mini-2025-04-14",
    "GPT-4.1-nano": "gpt-4.1-nano-2025-04-14",
    "GPT-4o": "gpt-4o-240513",
    "GPT-4o-mini": "gpt-4o-mini-240718",
    "GPT-4-turbo": "gpt-4-turbo-240409",
    "GPT-3.5-turbo": "gpt-35-turbo-230613",
}
csv_path_dict = {
    label: f"results/[{dataset}] {stem}.csv"
    for label, stem in model_stems.items()
}
# Print the summary that get_experiments_md builds for these zero-shot runs.
experiments_md = get_experiments_md(dataset, csv_path_dict)
print(experiments_md)