# MMLU-ProX-Lite Processing and Upload

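Process every language config of `li-lab/MMLU-ProX-Lite`: keep only the questions classified as answerable, replace each answer with the text of the correct option, and upload the result to the Hugging Face Hub.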

In [37]:
import os
import pandas as pd
from datasets import load_dataset, load_from_disk, DatasetDict
from dotenv import load_dotenv
from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# load_dotenv() runs first so that HF_TOKEN can be supplied through the
# environment it populates; the token is kept in HF_TOKEN because the upload
# cell passes it on explicitly.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)
print("Logged in to HuggingFace")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to HuggingFace


In [38]:
# Read the locally saved classification results and derive the set of
# question IDs that were flagged as answerable.
mmlu_classified = load_from_disk("mmlu_prox_classified")

# Normalise the flag column to 0/1 integers before building the lookup table.
answerable_flags = [int(bool(flag)) for flag in mmlu_classified["is_answerable"]]
df_answerable = pd.DataFrame(
    {
        "index": mmlu_classified["question_id"],
        "is_answerable": answerable_flags,
    }
)

# Set membership is what the per-language filter checks against.
is_answerable_mask = df_answerable["is_answerable"].eq(1)
answerable_question_ids = set(df_answerable.loc[is_answerable_mask, "index"])

print(f"Total questions: {len(df_answerable)}")
print(f"Answerable questions: {len(answerable_question_ids)}")

Total questions: 588
Answerable questions: 470


In [39]:
# Define all language configs and processing functions
# Language codes to process, one dataset config per code (alphabetical order).
configs = (
    "af ar bn cs de en es fr hi hu "
    "id it ja ko mr ne pt ru sr sw "
    "te th uk ur vi wo yo zh zu"
).split()


def get_answer_text(example):
    """Return the text of the correct option for one question record.

    The record's ``answer_index`` selects the key ``option_<answer_index>``.
    If that key is absent, or its value is ``None``, the record's existing
    ``answer`` value is returned instead.

    Args:
        example: Mapping holding ``answer_index`` and, normally, the matching
            ``option_<i>`` entry. ``answer`` is only read when the fallback
            is needed.

    Returns:
        The selected option text, or ``example["answer"]`` as a fallback.

    Raises:
        KeyError: If ``answer_index`` is missing, or if the fallback is
            needed and ``answer`` is missing as well.
    """
    option_key = f"option_{example['answer_index']}"
    option_text = example.get(option_key)
    if option_text is None:
        # Look up the fallback lazily so a record without an "answer" column
        # still works whenever the option itself is available.
        return example["answer"]
    return option_text


def process_config(config):
    """Build the reduced dataset for one language config.

    Loads the ``test`` split of ``li-lab/MMLU-ProX-Lite`` for ``config``,
    keeps the rows whose ``question_id`` is in the module-level
    ``answerable_question_ids`` set, overwrites ``answer`` with the value
    returned by ``get_answer_text``, and drops every column except the six
    selected at the end.

    Args:
        config: Name of the dataset config to load (a language code).

    Returns:
        The filtered dataset restricted to the columns ``question_id``,
        ``question``, ``answer``, ``cot_content``, ``category`` and ``src``.
    """

    def is_answerable(row):
        # Membership test against the precomputed set of answerable IDs.
        return row["question_id"] in answerable_question_ids

    def with_answer_text(row):
        # Keep every field and replace only "answer".
        return {**row, "answer": get_answer_text(row)}

    test_split = load_dataset("li-lab/MMLU-ProX-Lite", config, split="test")
    answerable_rows = test_split.filter(is_answerable)
    resolved_rows = answerable_rows.map(with_answer_text)
    return resolved_rows.select_columns(
        ["question_id", "question", "answer", "cot_content", "category", "src"]
    )


# Sanity check: report how many entries the `configs` list holds before the
# processing loop starts.
print(f"Will process {len(configs)} language configs")

Will process 29 language configs


In [40]:
# Process all configs
# Successful results are keyed by language code; failures are recorded
# separately so they stay visible after the loop instead of being a single
# line buried between progress bars.
processed_datasets = {}
failed_configs = {}  # language code -> error message

for config in configs:
    print(f"Processing {config}...")
    try:
        processed = process_config(config)
    except Exception as e:
        # Best-effort: one broken config must not stop the remaining ones.
        failed_configs[config] = str(e)
        print(f"  Error processing {config}: {e}")
    else:
        processed_datasets[config] = processed
        print(f"  {config}: {len(processed)} questions")

print(f"Successfully processed {len(processed_datasets)} configs")
if failed_configs:
    # Repeat the failures at the end so a partial run cannot be mistaken for
    # a complete one.
    print(f"Failed configs ({len(failed_configs)}): {', '.join(failed_configs)}")

Processing af...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 7097.99 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 82771.20 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 56618.63 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8213.37 examples/s]


  af: 470 questions
Processing ar...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 18474.78 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 165497.97 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 69040.11 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9119.82 examples/s]


  ar: 470 questions
Processing bn...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 12224.73 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 96142.63 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 45287.21 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8244.63 examples/s]


  bn: 470 questions
Processing cs...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 10256.45 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 82101.63 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 53821.24 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9421.30 examples/s]


  cs: 470 questions
Processing de...


Filter: 100%|██████████| 588/588 [00:00<00:00, 48373.98 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 7710.12 examples/s]


  de: 470 questions
Processing en...


Filter: 100%|██████████| 588/588 [00:00<00:00, 44430.55 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8137.12 examples/s]


  en: 470 questions
Processing es...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 19785.79 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 160385.69 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 55534.93 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9697.72 examples/s]


  es: 470 questions
Processing fr...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 11625.47 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 120440.04 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 49419.90 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8332.73 examples/s]


  fr: 470 questions
Processing hi...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 10800.92 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 128130.23 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 47001.27 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8394.07 examples/s]


  hi: 470 questions
Processing hu...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 13999.68 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 141348.62 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 50364.54 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8514.96 examples/s]


  hu: 470 questions
Processing id...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9082.79 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 98756.69 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 46358.10 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9355.27 examples/s]


  id: 470 questions
Processing it...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9139.62 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 53358.95 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 33645.99 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8779.77 examples/s]


  it: 470 questions
Processing ja...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 10921.85 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 104070.00 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 67369.17 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8065.21 examples/s]


  ja: 470 questions
Processing ko...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 17861.13 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 208738.95 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 41593.60 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8834.27 examples/s]


  ko: 470 questions
Processing mr...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 5808.14 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 133102.20 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 53492.05 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 4599.05 examples/s]


  mr: 470 questions
Processing ne...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 19509.69 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 180255.13 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 53989.73 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9081.88 examples/s]


  ne: 470 questions
Processing pt...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 20940.11 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 192796.34 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 77591.65 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 10006.77 examples/s]


  pt: 470 questions
Processing ru...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 18304.32 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 183801.67 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 39582.24 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8917.91 examples/s]


  ru: 470 questions
Processing sr...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 28880.71 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 189799.20 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 70605.52 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9421.12 examples/s]


  sr: 470 questions
Processing sw...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 16117.77 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 146966.85 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 59556.89 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9042.93 examples/s]


  sw: 470 questions
Processing te...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 13971.03 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 114666.67 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 47657.94 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8398.97 examples/s]


  te: 470 questions
Processing th...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 15096.73 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 182942.72 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 45216.63 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 7990.35 examples/s]


  th: 470 questions
Processing uk...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9979.65 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 112923.57 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 45591.95 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8316.52 examples/s]


  uk: 470 questions
Processing ur...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9519.84 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 125241.25 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 46361.58 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8388.39 examples/s]


  ur: 470 questions
Processing vi...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9124.29 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 116756.65 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 45275.57 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8238.87 examples/s]


  vi: 470 questions
Processing wo...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 19520.06 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 190106.43 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 59843.03 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8714.49 examples/s]


  wo: 470 questions
Processing yo...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 9594.19 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 123919.74 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 44357.03 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8328.89 examples/s]


  yo: 470 questions
Processing zh...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 20528.69 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 117011.47 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 79382.35 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 9956.73 examples/s]


  zh: 470 questions
Processing zu...


Generating validation split: 100%|██████████| 70/70 [00:00<00:00, 10925.51 examples/s]
Generating test split: 100%|██████████| 588/588 [00:00<00:00, 138483.39 examples/s]
Filter: 100%|██████████| 588/588 [00:00<00:00, 50485.16 examples/s]
Map: 100%|██████████| 470/470 [00:00<00:00, 8613.10 examples/s]

  zu: 470 questions
Successfully processed 29 configs





In [41]:
# Upload to HuggingFace
# Single source of truth for the target repository.
REPO_ID = "jphme/MMLU-ProX-Lite-open"

# Publishing an empty dictionary would only happen if every config failed
# upstream; stop here rather than pushing nothing to a public repository.
if not processed_datasets:
    raise RuntimeError("No configs were processed successfully; nothing to upload")

# Keys are the language codes collected by the processing loop.
# NOTE(review): DatasetDict keys are normally split names, so each language
# is presumably published as a split rather than as a separate Hub config;
# confirm this is the intended layout.
dataset_dict = DatasetDict(processed_datasets)

print(f"Dataset dictionary created with {len(dataset_dict)} configs")
print(f"Uploading to {REPO_ID}...")

# private=False makes the uploaded dataset publicly visible.
dataset_dict.push_to_hub(REPO_ID, token=HF_TOKEN, private=False)

print("Upload completed!")

Dataset dictionary created with 29 configs
Uploading to jphme/MMLU-ProX-Lite-open...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 209.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 454.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 445.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 282.24ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 409.16ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 537.66ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
Creating parquet from Arrow format: 100%|█████

Upload completed!


In [42]:
# Summary
# One line per processed language, followed by the overall count and the
# upload target.
print("Processing Summary:")
for lang_code, lang_dataset in processed_datasets.items():
    print(f"{lang_code}: {len(lang_dataset)} questions")

print(f"\nTotal configs: {len(processed_datasets)}")
print("Dataset uploaded to: jphme/MMLU-ProX-Lite-open")

Processing Summary:
af: 470 questions
ar: 470 questions
bn: 470 questions
cs: 470 questions
de: 470 questions
en: 470 questions
es: 470 questions
fr: 470 questions
hi: 470 questions
hu: 470 questions
id: 470 questions
it: 470 questions
ja: 470 questions
ko: 470 questions
mr: 470 questions
ne: 470 questions
pt: 470 questions
ru: 470 questions
sr: 470 questions
sw: 470 questions
te: 470 questions
th: 470 questions
uk: 470 questions
ur: 470 questions
vi: 470 questions
wo: 470 questions
yo: 470 questions
zh: 470 questions
zu: 470 questions

Total configs: 29
Dataset uploaded to: jphme/MMLU-ProX-Lite-open


In [26]:
# Overwrite "answer" column with the actual answer text from option_i
# where i is the answer_index


def get_answer_text(example):
    """Return the text of the correct option for one question record.

    The record's ``answer_index`` selects the key ``option_<answer_index>``.
    If that key is absent, or its value is ``None``, the record's existing
    ``answer`` value is returned instead.

    Args:
        example: Mapping holding ``answer_index`` and, normally, the matching
            ``option_<i>`` entry. ``answer`` is only read when the fallback
            is needed.

    Returns:
        The selected option text, or ``example["answer"]`` as a fallback.

    Raises:
        KeyError: If ``answer_index`` is missing, or if the fallback is
            needed and ``answer`` is missing as well.
    """
    option_key = f"option_{example['answer_index']}"
    option_text = example.get(option_key)
    if option_text is None:
        # Look up the fallback lazily so a record without an "answer" column
        # still works whenever the option itself is available.
        return example["answer"]
    return option_text


# Map the dataset to replace answer with actual answer text
# NOTE(review): `filtered_mmlu_prox_lite` is not defined anywhere in the
# visible notebook, and this cell's execution count (26) is lower than the
# pipeline cells above it. It looks like a leftover from an earlier
# exploratory session and will presumably raise NameError on a fresh
# Restart & Run All; confirm whether it should be removed or given an
# explicit input.
# The dataset is rebound to its own mapped result, so re-running the cell
# applies get_answer_text to rows whose "answer" was already replaced.
filtered_mmlu_prox_lite = filtered_mmlu_prox_lite.map(
    lambda example: {**example, "answer": get_answer_text(example)}
)

print("Updated answer column with actual answer text")
print("Sample answers:")
# Show at most the first three rows as a spot check of the replacement.
for i in range(min(3, len(filtered_mmlu_prox_lite))):
    example = filtered_mmlu_prox_lite[i]
    print(
        f"Question {example['question_id']}: Answer index {example['answer_index']} -> '{example['answer']}'"
    )

Map: 100%|██████████| 470/470 [00:00<00:00, 7014.96 examples/s]

Updated answer column with actual answer text
Sample answers:
Question 72: Answer index 6 -> '62 Mann'
Question 73: Answer index 3 -> 'Kommunikation'
Question 74: Answer index 4 -> 'Watermans Anteil betrug 5.500 $ und Coles Anteil betrug 4.900 $'





In [36]:
# Display the first record as a spot check of the final schema.
# NOTE(review): `final_dataset` is not defined in the visible notebook; this
# cell presumably relies on state from an earlier session. Confirm which
# dataset it should inspect.
final_dataset[0]

{'question_id': 72,
 'question': 'Ermitteln Sie die Anzahl der Männer, die benötigt werden, um ein Boot in 77 Tagen zu bauen, wenn 36 Mann 132 Tage brauchen, um eines zu bauen.',
 'answer': '62 Mann',
 'cot_content': '',
 'category': 'business',
 'src': 'stemez-Business'}