In [6]:
import glob
import pandas as pd
from util.evaluate_helper import get_experiments_md, extract_single_alphabet_answer

# Re-derive the "pred" column of every results CSV and write each file back in place.
# glob.glob returns matches in arbitrary order; sort so every run processes
# (and logs) the files in the same order.
csv_files = sorted(glob.glob("results/*.csv"))

if not csv_files:
    # Make a wrong working directory / missing results folder visible instead
    # of silently doing nothing.
    print("No CSV files found under results/")

for file_path in csv_files:
    # EmptyDataError is raised by pd.read_csv for files with no data/header;
    # keep the try block narrow so it cannot mask errors from later steps.
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Skipped empty file (EmptyDataError): {file_path}")
        continue

    # A header-only CSV parses fine but has no rows to process.
    if df.empty:
        print(f"Skipped empty file: {file_path}")
        continue

    # Apply the project helper to each row (axis=1) and store its result as
    # "pred", then overwrite the source CSV without the index column.
    df["pred"] = df.apply(extract_single_alphabet_answer, axis=1)
    df.to_csv(file_path, index=False)
    print(f"Processed {file_path}")

Processed results/[KMMLU] Phi-3-5-MoE-instruct.csv
Processed results/[KMMLU] Phi-3-5-MoE-instruct-5shot.csv
Processed results/[HAERAE] gpt-35-turbo-230613.csv
Processed results/[KMMLU] Phi-3-5-mini-instruct.csv
Processed results/[KMMLU-HARD] llama-3-1-8b-instruct-5shot.csv
Processed results/[KMMLU] gpt-4o-240513-5shot.csv
Processed results/[KMMLU-HARD] Phi-4-0shot.csv
Processed results/[KMMLU-HARD] gpt-4o-mini-240718.csv
Processed results/[KMMLU] gpt-4o-mini-240718-5shot.csv
Processed results/[CLIcK] Phi-3-mini-128k-June.csv
Processed results/[KMMLU-HARD] Phi-3-5-MoE-instruct.csv
Processed results/[HAERAE] Phi-3-mini-128k-June.csv
Processed results/[KMMLU] llama-3-1-8b-instruct.csv
Processed results/[KMMLU] Phi-3-mini-128k-June-5shot.csv
Processed results/[KMMLU] gpt-4o-240513.csv
Processed results/[HAERAE] Phi-4.csv
Processed results/[KMMLU] gpt-4-turbo-240409.csv
Processed results/[KMMLU] Phi-3-mini-128k-June.csv
Processed results/[KMMLU] llama-3-1-8b-instruct-5shot.csv
Processed res

## CLIcK

### Open source models

In [7]:
# CLIcK, open-source models: map each display name to its results CSV.
dataset = "CLIcK"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### CLIcK

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Culture         |   57.84 |                  58.44 |                   43.77 |                           29.74 |                   51.15 |
| Language        |   61.85 |                  52.31 |                   41.38 |                           27.85 |                   40.92 |
| **Overall**     |   59.15 |                  56.44 |                   42.99 |                           29.12 |                   47.82 |

#### Accuracy by category
| supercategory   | category    |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|:------------|--------:|-----------------------:|---

In [None]:
### Proprietary models

In [8]:
# CLIcK, proprietary models: map each display name to its results CSV.
dataset = "CLIcK"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### CLIcK

#### Accuracy by supercategory
| supercategory   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|---------:|--------------:|--------------:|----------------:|
| Culture         |    81.89 |         70.95 |         73.61 |           53.38 |
| Language        |    77.54 |         63.54 |         71.23 |           46    |
| **Overall**     |    80.46 |         68.5  |         72.82 |           50.98 |

#### Accuracy by category
| supercategory   | category    |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|:------------|---------:|--------------:|--------------:|----------------:|
| Culture         | Economy     |    94.92 |         83.05 |         89.83 |           64.41 |
| Culture         | Geography   |    80.15 |         77.86 |         82.44 |           53.44 |
| Culture         | History     |    66.92 |         48.4  |         46.4  |           31.79 |
| Culture         | Law         |    70.78 |       

## HAERAE 1.0

### Open source models

In [9]:
# HAERAE, open-source models: map each display name to its results CSV.
dataset = "HAERAE"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### HAERAE

#### Accuracy by category
| category              |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| General Knowledge     |   38.07 |                  39.77 |                   31.25 |                           28.41 |                   34.66 |
| History               |   37.77 |                  60.64 |                   32.45 |                           22.34 |                   44.15 |
| Loan Words            |   61.54 |                  70.41 |                   47.93 |                           35.5  |                   63.31 |
| Rare Words            |   62.72 |                  63.95 |                   55.06 |                           42.96 |                   63.21 |
| Reading Comprehension |   71.14 |                  64.43 |                   4

### Proprietary models

In [10]:
# HAERAE, proprietary models: map each display name to its results CSV.
dataset = "HAERAE"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### HAERAE

#### Accuracy by category
| category              |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------------|---------:|--------------:|--------------:|----------------:|
| General Knowledge     |    77.27 |         53.41 |         66.48 |           40.91 |
| History               |    92.02 |         84.57 |         78.72 |           30.32 |
| Loan Words            |    79.88 |         76.33 |         78.11 |           59.17 |
| Rare Words            |    87.9  |         81.98 |         79.01 |           61.23 |
| Reading Comprehension |    85.46 |         77.18 |         80.09 |           56.15 |
| Standard Nomenclature |    88.89 |         75.82 |         79.08 |           53.59 |
| **Overall**           |    85.7  |         76.4  |         77.76 |           52.67 |


## KMMLU

### Open source models

#### zero-shot

In [12]:
# KMMLU zero-shot, open-source models: map each display name to its results CSV.
dataset = "KMMLU"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        # Only the Phi-4 file carries an explicit "-0shot" suffix.
        ("Phi-4", "Phi-4-0shot"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   47.97 |                  45.15 |                   35.8  |                           31.68 |                   37.03 |
| HUMSS           |   54.27 |                  49.75 |                   31.56 |                           26.47 |                   37.29 |
| Other           |   49.07 |                  47.24 |                   35.45 |                           31.01 |                   39.15 |
| STEM            |   52    |                  49.08 |                   38.54 |                           31.9  |                   40.42 |
| **Overall**     |   50.3  |                  47.43 |                   35.87 |                           30.82

#### 5-shot

In [13]:
# KMMLU 5-shot, open-source models: every file stem ends with "-<postfix>".
dataset = "KMMLU"
postfix = "5shot"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}-{postfix}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem before the postfix)
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   48.44 |                  45.9  |                   37.42 |                           29.98 |                   19.24 |
| HUMSS           |   54.46 |                  49.18 |                   34.72 |                           27.27 |                   22.5  |
| Other           |   49.2  |                  48.43 |                   37.04 |                           30.76 |                   20.95 |
| STEM            |   53.31 |                  49.21 |                   38.9  |                           30.73 |                   19.55 |
| **Overall**     |   50.88 |                  47.92 |                   37.35 |                           29.98

### Proprietary models

#### zero-shot

In [22]:
# KMMLU zero-shot, proprietary models: map each display name to its results CSV.
dataset = "KMMLU"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|---------:|--------------:|--------------:|----------------:|
| Applied Science |    61.52 |         49.29 |         55.98 |           38.47 |
| HUMSS           |    69.45 |         56.59 |         63    |           40.9  |
| Other           |    63.79 |         52.35 |         57.53 |           40.19 |
| STEM            |    65.16 |         54.74 |         60.84 |           42.24 |
| **Overall**     |    64.26 |         52.63 |         58.75 |           40.3  |

#### Accuracy by category
| supercategory   | category                                   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|:-------------------------------------------|---------:|--------------:|--------------:|----------------:|
| Applied Science | Aviation-Engineering-and-Maintenance       |    69.8  |         50.4  |         61.92 |     

#### 5-shot

In [15]:
# KMMLU 5-shot, proprietary models: every file stem ends with "-<postfix>".
dataset = "KMMLU"
postfix = "5shot"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}-{postfix}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem before the postfix)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU

#### Accuracy by supercategory
| supercategory   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|---------:|--------------:|--------------:|----------------:|
| Applied Science |    61.47 |         48.66 |         56.85 |           40.22 |
| HUMSS           |    68.79 |         55.95 |         63.68 |           43.35 |
| Other           |    64.21 |         51.1  |         57.85 |           41.92 |
| STEM            |    65.28 |         53.29 |         61.08 |           44.43 |
| **Overall**     |    64.28 |         51.62 |         59.29 |           42.28 |

#### Accuracy by category
| supercategory   | category                                   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|:-------------------------------------------|---------:|--------------:|--------------:|----------------:|
| Applied Science | Aviation-Engineering-and-Maintenance       |    70.8  |         50.6  |         61.9  |     

## KMMLU-HARD

### Open source models

#### zero-shot

In [21]:
# KMMLU-HARD zero-shot, open-source models: map each display name to its results CSV.
dataset = "KMMLU-HARD"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        # Only the Phi-4 file carries an explicit "-0shot" suffix.
        ("Phi-4", "Phi-4-0shot"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU-HARD

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   24    |                  25.83 |                   27.08 |                           26.17 |                   26.25 |
| HUMSS           |   22.88 |                  21.52 |                   20.21 |                           24.38 |                   20.21 |
| Other           |   22.73 |                  24.82 |                   23.05 |                           24.82 |                   23.88 |
| STEM            |   24.25 |                  28.18 |                   24.36 |                           26.91 |                   24.64 |
| **Overall**     |   24.24 |                  25.34 |                   24    |                           

#### 5-shot

In [17]:
# KMMLU-HARD 5-shot, open-source models: every file stem ends with "-<postfix>".
dataset = "KMMLU-HARD"
postfix = "5shot"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}-{postfix}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem before the postfix)
        ("Phi-4", "Phi-4"),
        ("Phi-3.5-MoE-instruct", "Phi-3-5-MoE-instruct"),
        ("Phi-3.5-mini-instruct", "Phi-3-5-mini-instruct"),
        ("Phi-3-mini-128k-instruct-June", "Phi-3-mini-128k-June"),
        ("Llama-3.1-8B-Instruct", "llama-3-1-8b-instruct"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU-HARD

#### Accuracy by supercategory
| supercategory   |   Phi-4 |   Phi-3.5-MoE-instruct |   Phi-3.5-mini-instruct |   Phi-3-mini-128k-instruct-June |   Llama-3.1-8B-Instruct |
|:----------------|--------:|-----------------------:|------------------------:|--------------------------------:|------------------------:|
| Applied Science |   19    |                  21    |                   25    |                           29    |                   12    |
| HUMSS           |   23.47 |                  22.88 |                   21.89 |                           19.92 |                   14    |
| Other           |   22.73 |                  25.13 |                   23.26 |                           27.27 |                   12.83 |
| STEM            |   22.5  |                  21.75 |                   20.5  |                           25.25 |                   12.75 |
| **Overall**     |   24.32 |                  25.66 |                   24.76 |                           

### Proprietary models

#### zero-shot

In [18]:
# KMMLU-HARD zero-shot, proprietary models: map each display name to its results CSV.
dataset = "KMMLU-HARD"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem under results/)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU-HARD

#### Accuracy by supercategory
| supercategory   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|---------:|--------------:|--------------:|----------------:|
| Applied Science |    37.12 |         22.25 |         29.17 |           21.07 |
| HUMSS           |    41.97 |         23.31 |         31.51 |           19.44 |
| Other           |    40.39 |         26.48 |         29.59 |           22.22 |
| STEM            |    39.82 |         26.36 |         32.18 |           20.91 |
| **Overall**     |    39.62 |         24.56 |         30.56 |           20.97 |

#### Accuracy by category
| supercategory   | category                                   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|:-------------------------------------------|---------:|--------------:|--------------:|----------------:|
| Applied Science | Aviation-Engineering-and-Maintenance       |    45    |         25    |         32    |

#### 5-shot

In [19]:
# KMMLU-HARD 5-shot, proprietary models: every file stem ends with "-<postfix>".
dataset = "KMMLU-HARD"
postfix = "5shot"
csv_path_dict = {
    model_name: f"results/[{dataset}] {file_stem}-{postfix}.csv"
    for model_name, file_stem in (
        # (column label in the report, file stem before the postfix)
        ("GPT-4o", "gpt-4o-240513"),
        ("GPT-4o-mini", "gpt-4o-mini-240718"),
        ("GPT-4-turbo", "gpt-4-turbo-240409"),
        ("GPT-3.5-turbo", "gpt-35-turbo-230613"),
    )
}
# Print the report text returned by get_experiments_md for these files.
print(get_experiments_md(dataset, csv_path_dict))

### KMMLU-HARD

#### Accuracy by supercategory
| supercategory   |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|---------:|--------------:|--------------:|----------------:|
| Applied Science |    31    |         21    |         25    |           20    |
| HUMSS           |    43.98 |         23.47 |         33.53 |           19.53 |
| Other           |    39.84 |         28.34 |         29.68 |           23.22 |
| STEM            |    40.25 |         23.25 |         27.25 |           19.75 |
| **Overall**     |    40.94 |         24.63 |         31.12 |           21.19 |

#### Accuracy by category
| supercategory   | category     |   GPT-4o |   GPT-4o-mini |   GPT-4-turbo |   GPT-3.5-turbo |
|:----------------|:-------------|---------:|--------------:|--------------:|----------------:|
| Applied Science | Geomatics    |    31    |         21    |         25    |           20    |
| HUMSS           | Accounting   |    52.17 |         23.91 |         4