In [9]:
import glob
import pandas as pd
from util.evaluate_helper import get_experiments_md, extract_single_alphabet_answer

csv_files = glob.glob("results/*.csv")

for file_path in csv_files:
    try:
        df = pd.read_csv(file_path)
        if df.empty:
            print(f"Skipped empty file: {file_path}")
            continue

        df["pred"] = df.apply(extract_single_alphabet_answer, axis=1)
        df.to_csv(file_path, index=False)
        print(f"Processed {file_path}")

    except pd.errors.EmptyDataError:
        print(f"Skipped empty file (EmptyDataError): {file_path}")

Processed results/[KMMLU] Phi-3-mini-128k-June.csv
Processed results/[CLIcK] Phi-3-mini-128k-June.csv
Processed results/[KMMLU-HARD] gpt-4.1-mini-2025-04-14.csv
Processed results/[HAERAE] gpt-35-turbo-230613.csv
Processed results/[KMMLU-HARD] Phi-3-mini-128k-June.csv
Processed results/[HAERAE] Phi-3-5-mini-instruct.csv
Processed results/[HAERAE] gpt-5-nano-2025-08-08.csv
Processed results/[KMMLU-HARD] gpt-5-mini-2025-08-08-0shot.csv
Processed results/[KMMLU] Phi-4-0shot.csv
Processed results/[HAERAE] Phi-3-mini-128k-June.csv
Processed results/[KMMLU] llama-3-1-8b-instruct-5shot.csv
Processed results/[HAERAE] llama-3-1-8b-instruct.csv
Processed results/[HAERAE] gpt-4.1-nano-2025-04-14.csv
Processed results/[KMMLU] Phi-3-5-mini-instruct-5shot.csv
Processed results/[KMMLU-HARD] gpt-4-turbo-240409-5shot.csv
Processed results/[KMMLU] gpt-35-turbo-230613.csv
Processed results/[KMMLU-HARD] gpt-4o-mini-240718-5shot.csv
Processed results/[KMMLU] gpt-4-turbo-240409.csv
Processed results/[KMMLU-H

In [11]:

# Python에서 직접 실행
from haerae_main import evaluate
from click_main import evaluate as click_evaluate
from kmmlu_main import evaluate as kmmlu_evaluate

# 예시: HAERAE 결과들 재평가
evaluate("results/[HAERAE] gpt-5-chat-2025-08-08.csv")
evaluate("results/[HAERAE] gpt-5-mini-2025-08-08.csv")
evaluate("results/[HAERAE] gpt-5-nano-2025-08-08.csv")
# ... 기타 파일들

# 예시: CLIcK 결과들 재평가
click_evaluate("results/[CLIcK] gpt-5-chat-2025-08-08.csv")
click_evaluate("results/[CLIcK] gpt-5-mini-2025-08-08.csv")
click_evaluate("results/[CLIcK] gpt-5-nano-2025-08-08.csv")

# 예시: KMMLU 결과들 재평가
kmmlu_evaluate("results/[KMMLU] gpt-5-chat-2025-08-08-0shot.csv")
kmmlu_evaluate("results/[KMMLU] gpt-5-nano-2025-08-08-0shot.csv")
#kmmlu_evaluate("results/[KMMLU] gpt-5-mini-2025-08-08-0shot.csv")

2025-08-09 01:29:11,795 - logger - INFO - Excluding FAILED responses from accuracy calculation
2025-08-09 01:29:11,797 - logger - INFO - Evaluating on 1338 valid responses
2025-08-09 01:29:11,906 - logger - INFO - Excluding FAILED responses from accuracy calculation
2025-08-09 01:29:11,910 - logger - INFO - Evaluating on 28530 valid responses
2025-08-09 01:29:11,943 - logger - INFO - Excluding FAILED responses from accuracy calculation
2025-08-09 01:29:11,947 - logger - INFO - Evaluating on 28630 valid responses


                category  correct_mean  correct_count
0      General Knowledge      0.767045            176
1                History      1.000000             24
2             Loan Words      0.781955            133
3             Rare Words      0.883951            405
4  Reading Comprehension      0.865772            447
5  Standard Nomenclature      0.843137            153
Overall Average: 0.8569766710049912
                category  correct_mean  correct_count
0      General Knowledge      0.806818            176
1                History      0.958333             24
2             Loan Words      0.909774            133
3             Rare Words      0.827160            405
4  Reading Comprehension      0.883669            447
5  Standard Nomenclature      0.941176            153
Overall Average: 0.8878219699100448
                category  correct_mean  correct_count
0      General Knowledge      0.738636            176
1                History      0.875000             24
2         

## CLIcK

### Open source models

In [12]:
dataset = "CLIcK"
csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### CLIcK

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Culture         |   57.84 |                  58.44 |                   43.77 |                           29.74 |                   51.15 |
| Language        |   61.85 |                  52.31 |                   41.38 |                           27.85 |                   40.92 |
| **Overall**     |   59.15 |                  56.44 |                   42.99 |                           29.12 |                   47.82 |

#### Accuracy by category
| supercategory   | category    |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|:------------|--------:|-----------------------:|---

### Proprietary models

In [3]:
dataset = "CLIcK"
csv_path_dict = {
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08.csv",
    "GPT-4.1": f"results/[{dataset}] gpt-4.1-2025-04-14.csv",
    "GPT-4.1-mini": f"results/[{dataset}] gpt-4.1-mini-2025-04-14.csv",
    "GPT-4.1-nano": f"results/[{dataset}] gpt-4.1-nano-2025-04-14.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### CLIcK

#### Accuracy by supercategory
| supercategory   |   GPT-5-chat |   GPT-5-mini |   GPT-5-nano |   GPT-4.1 |   GPT-4.1-mini |   GPT-4.1-nano |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|-------------:|-------------:|-------------:|----------:|---------------:|---------------:|---------:|--------------:|--------------:|----------------:|
| Culture         |        86.03 |        84.88 |        77.42 |     81.65 |          72.81 |          62.47 |    81.89 |         70.95 |         73.61 |           53.38 |
| Language        |        78.77 |        86.77 |        69.54 |     78.31 |          70.62 |          58.62 |    77.54 |         63.54 |         71.23 |           46    |
| **Overall**     |        83.24 |        85.6  |        74.4  |     80.55 |          72.09 |          61.21 |    80.46 |         68.5  |         72.82 |           50.98 |

#### Accuracy by category
| supercategory   | category    |   GPT-5-chat |   GPT-5-mini |   GPT-5

## HAERAE 1.0

### Open source models

In [14]:
dataset = "HAERAE"
csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

FileNotFoundError: [Errno 2] No such file or directory: 'results/[HAERAE] Phi-3-5-MoE-instruct.csv'

### Proprietary models

In [5]:
dataset = "HAERAE"
csv_path_dict = {
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08.csv",
    "GPT-4.1": f"results/[{dataset}] gpt-4.1-2025-04-14.csv",
    "GPT-4.1-mini": f"results/[{dataset}] gpt-4.1-mini-2025-04-14.csv",
    "GPT-4.1-nano": f"results/[{dataset}] gpt-4.1-nano-2025-04-14.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

Excluding FAILED responses from accuracy calculation
Evaluating on 1338 valid responses
### HAERAE

#### Accuracy by category
| category              |   GPT-5-chat |   GPT-5-mini |   GPT-5-nano |   GPT-4.1 |   GPT-4.1-mini |   GPT-4.1-nano |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------------|-------------:|-------------:|-------------:|----------:|---------------:|---------------:|---------:|--------------:|--------------:|----------------:|
| General Knowledge     |        75.57 |        80.68 |        73.86 |     75.57 |          52.27 |          43.18 |    77.27 |         53.41 |         66.48 |           40.91 |
| History               |       100    |        95.83 |        87.5  |     93.62 |          89.89 |          64.89 |    92.02 |         84.57 |         78.72 |           30.32 |
| Loan Words            |        78.95 |        90.98 |        84.21 |     79.29 |          73.96 |          73.37 |    79.88 |         76.33 |         78.11 |   

## KMMLU

### Open source models

#### zero-shot

In [16]:
dataset = "KMMLU"
csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4-0shot.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   47.97 |                  45.15 |                   35.8  |                           31.68 |                   37.03 |
| HUMSS           |   54.27 |                  49.75 |                   31.56 |                           26.47 |                   37.29 |
| Other           |   49.07 |                  47.24 |                   35.45 |                           31.01 |                   39.15 |
| STEM            |   52    |                  49.08 |                   38.54 |                           31.9  |                   40.42 |
| **Overall**     |   50.3  |                  47.43 |                   35.87 |                           30.82

#### 5-shot

In [17]:
dataset = "KMMLU"
postfix = "5shot"
csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4-{postfix}.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct-{postfix}.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct-{postfix}.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June-{postfix}.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct-{postfix}.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   48.44 |                  45.9  |                   37.42 |                           29.98 |                   19.24 |
| HUMSS           |   54.46 |                  49.18 |                   34.72 |                           27.27 |                   22.5  |
| Other           |   49.2  |                  48.43 |                   37.04 |                           30.76 |                   20.95 |
| STEM            |   53.31 |                  49.21 |                   38.9  |                           30.73 |                   19.55 |
| **Overall**     |   50.88 |                  47.92 |                   37.35 |                           29.98

### Proprietary models

#### zero-shot

In [2]:
dataset = "KMMLU"
csv_path_dict = {
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08-0shot.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08-0shot.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08-0shot.csv",
    "GPT-4.1": f"results/[{dataset}] gpt-4.1-2025-04-14.csv",
    "GPT-4.1-mini": f"results/[{dataset}] gpt-4.1-mini-2025-04-14.csv",
    "GPT-4.1-nano": f"results/[{dataset}] gpt-4.1-nano-2025-04-14.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   GPT-5-chat |   GPT-5-mini |   GPT-5-nano |   GPT-4.1 |   GPT-4.1-mini |   GPT-4.1-nano |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|-------------:|-------------:|-------------:|----------:|---------------:|---------------:|---------:|--------------:|--------------:|----------------:|
| Applied Science |        66.18 |        56.67 |        50    |     61.98 |          56.21 |          45.84 |    61.52 |         49.29 |         55.98 |           38.47 |
| HUMSS           |        73    |        68.18 |        60.91 |     71.7  |          64.11 |          51.56 |    69.45 |         56.59 |         63    |           40.9  |
| Other           |        68.83 |        67    |        61    |     65.29 |          58.26 |          47.93 |    63.79 |         52.35 |         57.53 |           40.19 |
| STEM            |        67.16 |        65    |        56    |     66.56 |          61.17 |     

#### 5-shot

In [None]:
dataset = "KMMLU"
postfix = "5shot"
csv_path_dict = {
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513-{postfix}.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718-{postfix}.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409-{postfix}.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613-{postfix}.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

## KMMLU-HARD

### Open source models

#### zero-shot

In [None]:
dataset = "KMMLU-HARD"
csv_path_dict = {
   # "Phi-4": f"results/[{dataset}] Phi-4-0shot.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

#### 5-shot

In [None]:
dataset = "KMMLU-HARD"
postfix = "5shot"
csv_path_dict = {
    "Phi-4": f"results/[{dataset}] Phi-4-{postfix}.csv",
    "Phi-3.5-MoE-instruct": f"results/[{dataset}] Phi-3-5-MoE-instruct-{postfix}.csv",
    "Phi-3.5-mini-instruct": f"results/[{dataset}] Phi-3-5-mini-instruct-{postfix}.csv",
    "Phi-3-mini-128k-instruct-June": f"results/[{dataset}] Phi-3-mini-128k-June-{postfix}.csv",
    "Llama-3.1-8B-Instruct": f"results/[{dataset}] llama-3-1-8b-instruct-{postfix}.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

### Proprietary models

#### zero-shot

In [12]:
dataset = "KMMLU-HARD"
csv_path_dict = {
    "GPT-5-chat": f"results/[{dataset}] gpt-5-chat-2025-08-08-0shot.csv",
    "GPT-5-mini": f"results/[{dataset}] gpt-5-mini-2025-08-08-0shot.csv",
    "GPT-5-nano": f"results/[{dataset}] gpt-5-nano-2025-08-08-0shot.csv",
    "GPT-4.1": f"results/[{dataset}] gpt-4.1-2025-04-14.csv",
    "GPT-4.1-mini": f"results/[{dataset}] gpt-4.1-mini-2025-04-14.csv",
    "GPT-4.1-nano": f"results/[{dataset}] gpt-4.1-nano-2025-04-14.csv",
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613.csv",
}
print(get_experiments_md(dataset, csv_path_dict))

Excluding FAILED responses from accuracy calculation
Evaluating on 3604 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 3604 valid responses
Excluding FAILED responses from accuracy calculation
Evaluating on 3604 valid responses
### KMMLU-HARD

#### Accuracy by supercategory
| supercategory   |   GPT-5-chat |   GPT-5-mini |   GPT-5-nano |   GPT-4.1 |   GPT-4.1-mini |   GPT-4.1-nano |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|-------------:|-------------:|-------------:|----------:|---------------:|---------------:|---------:|--------------:|--------------:|----------------:|
| Applied Science |        42.5  |        68.3  |        56.3  |     39.08 |          34.92 |          23.75 |    37.12 |         22.25 |         29.17 |           21.07 |
| HUMSS           |        47.56 |        56    |        43.4  |     44.11 |          37.57 |          22.95 |    41.97 |         23.31 |         31.51 |           19.44 |
|

#### 5-shot

In [None]:
dataset = "KMMLU-HARD"
postfix = "5shot"
csv_path_dict = {   
    "GPT-4o": f"results/[{dataset}] gpt-4o-240513-{postfix}.csv",
    "GPT-4o-mini": f"results/[{dataset}] gpt-4o-mini-240718-{postfix}.csv",
    "GPT-4-turbo": f"results/[{dataset}] gpt-4-turbo-240409-{postfix}.csv",
    "GPT-3.5-turbo": f"results/[{dataset}] gpt-35-turbo-230613-{postfix}.csv",
}
print(get_experiments_md(dataset, csv_path_dict))