# MMLU-ProX-Lite Processing and Upload

Process every language config of MMLU-ProX-Lite, keep only the questions classified as answerable, and upload the result to the Hugging Face Hub.

In [None]:
import os
import pandas as pd
from datasets import load_dataset, load_from_disk, DatasetDict
from dotenv import load_dotenv
from huggingface_hub import login

# Pull settings from a local .env file (if any) into the process environment,
# then authenticate against the Hugging Face Hub with the HF_TOKEN value.
# HF_TOKEN stays a module-level name because the upload step reuses it.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)
print("Logged in to HuggingFace")

In [None]:
# Read the locally saved classification results and collect the IDs of the
# questions flagged as answerable.
mmlu_classified = load_from_disk("mmlu_prox_classified")

# Normalise the flag column to 0/1 so it can be compared against 1 below.
answerable_flags = [int(bool(flag)) for flag in mmlu_classified["is_answerable"]]
df_answerable = pd.DataFrame(
    {
        "index": mmlu_classified["question_id"],
        "is_answerable": answerable_flags,
    }
)

# A set gives constant-time membership checks when filtering each language.
answerable_mask = df_answerable["is_answerable"] == 1
answerable_question_ids = set(df_answerable.loc[answerable_mask, "index"])

print(f"Total questions: {len(df_answerable)}")
print(f"Answerable questions: {len(answerable_question_ids)}")

In [None]:
# Define all language configs and processing functions
# Language config names passed to load_dataset for the source dataset
# (presumably ISO 639-1 language codes - confirm against the dataset card).
configs = (
    "af ar bn cs de en es fr hi hu "
    "id it ja ko mr ne pt ru sr sw "
    "te th uk ur vi wo yo zh zu"
).split()

def get_answer_text(example):
    """Return the text of the option selected by ``answer_index``.

    The row's ``answer_index`` value picks the ``option_<answer_index>``
    field. When the row has no such field, the row's existing ``answer``
    value is returned instead.

    Args:
        example: Mapping holding ``answer_index``, the ``option_<i>`` fields
            and, for the fallback only, ``answer``.

    Returns:
        The value of the selected option field, or ``example['answer']`` when
        that field is absent.

    Raises:
        KeyError: If ``answer_index`` is missing, or if both the selected
            option field and ``answer`` are missing.
    """
    option_key = f"option_{example['answer_index']}"
    # Resolve the fallback lazily: passing example['answer'] as the .get()
    # default evaluates it up front and raises KeyError for rows that have
    # the option but no 'answer' key.
    if option_key in example:
        return example[option_key]
    return example['answer']

def process_config(config):
    """Build the filtered, answer-text version of one language config.

    Loads the ``test`` split of ``li-lab/MMLU-ProX-Lite`` for ``config``,
    keeps the rows whose ``question_id`` is in ``answerable_question_ids``,
    replaces ``answer`` with the value returned by ``get_answer_text``, and
    returns only the columns needed downstream.

    Args:
        config: Config name forwarded to ``load_dataset``.

    Returns:
        The dataset restricted to ``question_id``, ``question``, ``answer``,
        ``cot_content``, ``category`` and ``src``.
    """

    def is_answerable(row):
        return row['question_id'] in answerable_question_ids

    def with_answer_text(row):
        return {**row, 'answer': get_answer_text(row)}

    test_split = load_dataset("li-lab/MMLU-ProX-Lite", config, split="test")
    answerable_rows = test_split.filter(is_answerable)
    rows_with_text = answerable_rows.map(with_answer_text)
    return rows_with_text.select_columns(
        ["question_id", "question", "answer", "cot_content", "category", "src"]
    )

# Report how many language configs the processing loop will iterate over.
print(f"Will process {len(configs)} language configs")

In [None]:
# Run process_config for every language; results are keyed by config name.
processed_datasets = {}

for config in configs:
    print(f"Processing {config}...")
    try:
        processed = process_config(config)
    except Exception as exc:
        # Best effort: report the failure and carry on with the next language.
        print(f"  Error processing {config}: {exc}")
    else:
        processed_datasets[config] = processed
        print(f"  {config}: {len(processed)} questions")

print(f"Successfully processed {len(processed_datasets)} configs")

In [None]:
# Bundle the per-language datasets and publish them to the Hugging Face Hub.
# NOTE(review): DatasetDict keys are normally treated as split names, so each
# language code is likely published as a split rather than as a separate
# config - confirm this matches how consumers load the dataset.
dataset_dict = DatasetDict(processed_datasets)
num_entries = len(dataset_dict)

print(f"Dataset dictionary created with {num_entries} configs")
print("Uploading to jphme/MMLU-ProX-Lite-open...")

# Reuse the token read from the environment; the target repo is public.
dataset_dict.push_to_hub("jphme/MMLU-ProX-Lite-open", token=HF_TOKEN, private=False)

print("Upload completed!")

In [None]:
# Print one line per processed language, then the totals and the target repo.
print("Processing Summary:")
for config in processed_datasets:
    dataset = processed_datasets[config]
    print(f"{config}: {len(dataset)} questions")

total_configs = len(processed_datasets)
print(f"\nTotal configs: {total_configs}")
print("Dataset uploaded to: jphme/MMLU-ProX-Lite-open")

In [26]:
# Overwrite "answer" column with the actual answer text from option_i
# where i is the answer_index


def get_answer_text(example):
    """Return the text of the option selected by ``answer_index``.

    The row's ``answer_index`` value picks the ``option_<answer_index>``
    field. When the row has no such field, the row's existing ``answer``
    value is returned instead.

    Args:
        example: Mapping holding ``answer_index``, the ``option_<i>`` fields
            and, for the fallback only, ``answer``.

    Returns:
        The value of the selected option field, or ``example["answer"]`` when
        that field is absent.

    Raises:
        KeyError: If ``answer_index`` is missing, or if both the selected
            option field and ``answer`` are missing.
    """
    option_key = f"option_{example['answer_index']}"
    # Resolve the fallback lazily: passing example["answer"] as the .get()
    # default evaluates it up front and raises KeyError for rows that have
    # the option but no "answer" key.
    if option_key in example:
        return example[option_key]
    return example["answer"]  # fallback to original if option not found


# Rewrite every row so that "answer" holds the text returned by
# get_answer_text instead of its previous value.
def _with_answer_text(example):
    return {**example, "answer": get_answer_text(example)}


filtered_mmlu_prox_lite = filtered_mmlu_prox_lite.map(_with_answer_text)

print("Updated answer column with actual answer text")
print("Sample answers:")

# Preview at most the first three rows as a quick sanity check.
preview_count = min(3, len(filtered_mmlu_prox_lite))
for i in range(preview_count):
    example = filtered_mmlu_prox_lite[i]
    print(
        f"Question {example['question_id']}: Answer index {example['answer_index']} -> '{example['answer']}'"
    )

Map: 100%|██████████| 470/470 [00:00<00:00, 7014.96 examples/s]

Updated answer column with actual answer text
Sample answers:
Question 72: Answer index 6 -> '62 Mann'
Question 73: Answer index 3 -> 'Kommunikation'
Question 74: Answer index 4 -> 'Watermans Anteil betrug 5.500 $ und Coles Anteil betrug 4.900 $'





In [36]:
# Display the first record of final_dataset as the cell output.
# NOTE(review): final_dataset is not defined in the visible cells - confirm where it is built.
final_dataset[0]

{'question_id': 72,
 'question': 'Ermitteln Sie die Anzahl der Männer, die benötigt werden, um ein Boot in 77 Tagen zu bauen, wenn 36 Mann 132 Tage brauchen, um eines zu bauen.',
 'answer': '62 Mann',
 'cot_content': '',
 'category': 'business',
 'src': 'stemez-Business'}