# Loading and Formatting the Data
The target (object-language) subsets built by this notebook:
- P-MMEval-EN
- P-MMEval-ZH
- P-MMEval-KO

In [1]:
from typing import List
from zipfile import ZipFile
import pandas as pd
import json
import os

# Per-language accumulators, filled by the loading cells below:
# benchmark name -> list of sample records.
P_MMEVAL_EN, P_MMEVAL_KO, P_MMEVAL_ZH = {}, {}, {}

# File stems that denote a wanted language, mapped to the unified language code.
object_languages_dict = {
    # spelled-out names
    "Chinese": "zh", "English": "en", "Korean": "ko",
    # two-letter codes
    "en": "en", "zh": "zh", "ko": "ko",
    # mmlu-style locale names
    "mmlu_EN-US": "en", "mmlu_KO-KR": "ko", "mmlu_ZH-CN": "zh",
}
# The accepted stems are exactly the keys of the mapping, in the same order.
object_languages = list(object_languages_dict)

# Template for the `{name}.py` file that export_files_and_zip() writes next to
# each exported archive. Every literal "{{name}}" below is replaced with the
# benchmark name before writing. The template is a regular (non-raw) string, so
# each doubled backslash below ends up as a single backslash in the generated file.
header = '''# coding=utf-8
# Copyright 2024 The P-MMEval Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import List
import datasets
import pandas as pd

_CITATION = """\\
@misc{zhang2024pmmevalparallelmultilingualmultitask,
      title={P-MMEval: A Parallel Multilingual Multitask Benchmark for Consistent Evaluation of LLMs}, 
      author={Yidan Zhang and Yu Wan and Boyi Deng and Baosong Yang and Haoran Wei and Fei Huang and Bowen Yu and Junyang Lin and Fei Huang and Jingren Zhou},
      year={2024},
      eprint={2411.09116},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2411.09116}, 
}
"""

_DESCRIPTION = """\\
We introduce a multilingual benchmark, P-MMEval, covering effective fundamental and capability-specialized datasets. We extend the existing benchmarks, ensuring consistent language coverage across all datasets and providing parallel samples among multiple languages, supporting up to 10 languages from 8 language families (i.e., en, zh, ar, es, ja, ko, th, fr, pt, vi). As a result, P-MMEval facilitates a holistic assessment of multilingual capabilities and comparative analysis of cross-lingual transferability.
"""

_HOMEPAGE = "https://huggingface.co/datasets/Qwen/P-MMEval"
_LICENSE = "Apache-2.0"
_URL = "{{name}}.zip"

task_list = ["all"]
'''


def export_files_and_zip(name: str,
                         output_dir,
                         test_split: List[dict],
                         dev_split: List[dict] = None,
                         dev_set: bool = False,
                         parquet_format: bool = True
                         ):
    """
    Export one benchmark as ``mapping.json``, ``{name}.py`` and ``{name}.zip``.

    The splits are first written to a scratch ``data/`` directory inside
    ``output_dir`` (``data/test/all_test.*`` and, optionally,
    ``data/dev/all_dev.*``), then zipped with paths relative to ``data/``.
    The scratch directory is removed afterwards, even when a step fails, so a
    crashed run cannot leak stale files into the next archive.

    Args:
        name: Benchmark name; used for the ``.py`` / ``.zip`` file names and
            substituted for ``{{name}}`` in the module-level ``header`` template.
        output_dir: Destination directory. It is created if it does not exist.
        test_split: Records of the test split (one dict per row).
        dev_split: Records of the dev split, or ``None``.
        dev_set: Write the dev split only when this is true AND ``dev_split``
            is non-empty.
        parquet_format: ``True`` writes parquet files, ``False`` writes CSV.
    """
    import shutil  # kept local, as in the original cell

    data_dir = os.path.join(output_dir, "data")
    write_dev = bool(dev_split) and dev_set
    extension = "parquet" if parquet_format else "csv"

    def _write_split(records, split):
        # Dump one split to data/<split>/all_<split>.<extension> and return the frame.
        frame = pd.DataFrame(records)
        target = os.path.join(data_dir, split, f"all_{split}.{extension}")
        if parquet_format:
            frame.to_parquet(target, index=False)
        else:
            frame.to_csv(target, index=False)
        return frame

    try:
        # Create the tree unconditionally: the original only did so when
        # output_dir already existed (and test/ only for a non-empty test
        # split), so the writes below crashed on a fresh target.
        os.makedirs(os.path.join(data_dir, "test"), exist_ok=True)
        if write_dev:
            os.makedirs(os.path.join(data_dir, "dev"), exist_ok=True)

        # export data to csv or parquet
        test_df = _write_split(test_split, "test")
        if write_dev:
            _write_split(dev_split, "dev")

        mapping = {
            "all": {
                "name": "All",
                "category": "all",
            }
        }

        print(test_df.columns)
        # Explicit UTF-8: ensure_ascii=False must not depend on the locale encoding.
        with open(os.path.join(output_dir, "mapping.json"), 'w', encoding="utf-8") as f:
            json.dump(mapping, f, ensure_ascii=False, indent=2)
        # create a python file
        with open(os.path.join(output_dir, f"{name}.py"), 'w', encoding="utf-8") as f:
            f.write(header.replace("{{name}}", name))
        # zip files in the data directory and renamed to name given
        with ZipFile(os.path.join(output_dir, f"{name}.zip"), 'w') as zipf:
            for root, dirs, files in os.walk(data_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, os.path.relpath(file_path, data_dir))
    finally:
        # delete the data directory (also after a failure)
        shutil.rmtree(data_dir, ignore_errors=True)

In [2]:
# humaneval-xl
# The test directory holds one sub-directory per `code_name`; each contains
# one JSON-lines file per natural language, named after that language.
humaneval_xl_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/humaneval-xl/test"
humaneval_xl_data_dict = {}
# sorted(): os.listdir returns entries in arbitrary order, and the records of
# all sub-directories are concatenated, so the row order must be pinned.
for code_name in sorted(os.listdir(humaneval_xl_test_dir)):
    code_dir = os.path.join(humaneval_xl_test_dir, code_name)
    if not os.path.isdir(code_dir):
        continue  # ignore stray files next to the sub-directories
    for name in sorted(os.listdir(code_dir)):
        lang = name.split(".")[0]
        if lang not in object_languages:
            continue
        # Explicit UTF-8: the prompts are multilingual text.
        with open(os.path.join(code_dir, name), "r", encoding="utf-8") as f:
            data = [
                {
                    "task_id": record["task_id"],
                    "text": record["prompt"],
                    "test_list": [record["test"]],
                }
                for record in map(json.loads, f)
            ]
        humaneval_xl_data_dict.setdefault(lang, []).extend(data)

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in humaneval_xl_data_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["humaneval-xl"] = records

In [3]:
# mgsm
mgsm_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/mgsm/test"
mgsm_data_test_dict = {}
# sorted(): os.listdir order is arbitrary; keep the load order reproducible.
for name in sorted(os.listdir(mgsm_test_dir)):
    lang, ext = os.path.splitext(name)
    # Only `<lang>.jsonl` files of the wanted languages; open the listed entry
    # itself rather than a path rebuilt from the language stem.
    if ext != ".jsonl" or lang not in object_languages:
        continue
    # Explicit UTF-8: the questions are multilingual text.
    with open(os.path.join(mgsm_test_dir, name), "r", encoding="utf-8") as f:
        # Records are kept as-is (no field selection for this benchmark).
        mgsm_data_test_dict[lang] = [json.loads(raw_line) for raw_line in f]

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in mgsm_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["mgsm"] = records

In [4]:
# mhellaswag
mhellaswag_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/mhellaswag/test"
mhellaswag_data_test_dict = {}

# sorted(): os.listdir order is arbitrary; keep the load order reproducible.
for name in sorted(os.listdir(mhellaswag_test_dir)):
    lang, ext = os.path.splitext(name)
    # Only `<lang>.jsonl` files of the wanted languages; open the listed entry
    # itself rather than a path rebuilt from the language stem.
    if ext != ".jsonl" or lang not in object_languages:
        continue
    # Explicit UTF-8: the contexts and endings are multilingual text.
    with open(os.path.join(mhellaswag_test_dir, name), "r", encoding="utf-8") as f:
        mhellaswag_data_test_dict[lang] = [
            {
                "activity_label": record["activity_label"],
                "split_type": record["split"],
                "question": record["ctx"],
                "A": record["endings"][0],
                "B": record["endings"][1],
                "C": record["endings"][2],
                "D": record["endings"][3],
                # numeric label 0..3 -> option letter A..D
                "answer": chr(ord("A") + int(record["label"])),
            }
            for record in map(json.loads, f)
        ]

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in mhellaswag_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["mhellaswag"] = records

In [5]:
# mifeval
mifeval_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/mifeval/test"
mifeval_data_test_dict = {}

# sorted(): os.listdir order is arbitrary; keep the load order reproducible.
for name in sorted(os.listdir(mifeval_test_dir)):
    lang, ext = os.path.splitext(name)
    # Only `<lang>.jsonl` files of the wanted languages; open the listed entry
    # itself rather than a path rebuilt from the language stem.
    if ext != ".jsonl" or lang not in object_languages:
        continue
    # Explicit UTF-8: the prompts are multilingual text.
    with open(os.path.join(mifeval_test_dir, name), "r", encoding="utf-8") as f:
        mifeval_data_test_dict[lang] = [
            {
                "key": record["key"],
                "question": record["prompt"],
                "instruction_id_list": record["instruction_id_list"],
                "kwargs": record["kwargs"],
            }
            for record in map(json.loads, f)
        ]

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in mifeval_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["mifeval"] = records

In [6]:
# mlogiqa

mlogiqa_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/mlogiqa/test"
mlogiqa_data_test_dict = {}

# sorted(): os.listdir order is arbitrary; keep the load order reproducible.
for name in sorted(os.listdir(mlogiqa_test_dir)):
    lang, ext = os.path.splitext(name)
    # Only `<lang>.jsonl` files of the wanted languages; open the listed entry
    # itself rather than a path rebuilt from the language stem.
    if ext != ".jsonl" or lang not in object_languages:
        continue
    # Explicit UTF-8: the questions and options are multilingual text.
    with open(os.path.join(mlogiqa_test_dir, name), "r", encoding="utf-8") as f:
        mlogiqa_data_test_dict[lang] = [
            {
                "question": record["question"],
                "context": record["context"],
                "A": record["options"][0],
                "B": record["options"][1],
                "C": record["options"][2],
                "D": record["options"][3],
                # numeric answer 0..3 -> option letter A..D
                "answer": chr(ord("A") + int(record["answer"])),
            }
            for record in map(json.loads, f)
        ]

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in mlogiqa_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["mlogiqa"] = records

In [7]:
# mmmlu
mmmlu_root = "/work/u5110390/BenchWeaver/P-MMEval/mmmlu"


def load_mmmlu_split(split_dir, difficulty):
    """Read every wanted `<lang>.jsonl` file in `split_dir`.

    Args:
        split_dir: Directory containing one JSON-lines file per language.
        difficulty: Value stored in the "difficulty" field of every record.

    Returns:
        dict mapping the file stem (the raw, non-unified language key) to the
        list of reformatted records of that file.
    """
    records_by_lang = {}
    # sorted(): os.listdir order is arbitrary; keep the load order reproducible.
    for name in sorted(os.listdir(split_dir)):
        lang, ext = os.path.splitext(name)
        # Open the listed entry itself rather than a path rebuilt from the stem.
        if ext != ".jsonl" or lang not in object_languages:
            continue
        # Explicit UTF-8: the questions and options are multilingual text.
        with open(os.path.join(split_dir, name), "r", encoding="utf-8") as f:
            records_by_lang[lang] = [
                {
                    "difficulty": difficulty,
                    "question": record["Question"],
                    "A": record["A"],
                    "B": record["B"],
                    "C": record["C"],
                    "D": record["D"],
                    "answer": record["Answer"],
                    "subject": record["Subject"],
                }
                for record in map(json.loads, f)
            ]
    return records_by_lang


# Test split = easy records followed by hard records, keyed by the raw file stem.
mmmlu_data_test_dict = load_mmmlu_split(os.path.join(mmmlu_root, "easy", "test"), "easy")
for lang, records in load_mmmlu_split(os.path.join(mmmlu_root, "hard", "test"), "hard").items():
    # setdefault: a language present only in the hard split used to raise KeyError.
    mmmlu_data_test_dict.setdefault(lang, []).extend(records)

# Dev split is keyed by the unified language code, because the export cells
# look it up as mmmlu_data_dev_dict["en" / "zh" / "ko"].
mmmlu_data_dev_dict = {
    object_languages_dict[lang]: records
    for lang, records in load_mmmlu_split(os.path.join(mmmlu_root, "val"), "").items()
}

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in mmmlu_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["mmmlu"] = records


In [8]:
# xnli
xnli_test_dir = "/work/u5110390/BenchWeaver/P-MMEval/xnli/test"
xnli_data_test_dict = {}
# sorted(): os.listdir order is arbitrary; keep the load order reproducible.
for name in sorted(os.listdir(xnli_test_dir)):
    lang, ext = os.path.splitext(name)
    # Only `<lang>.jsonl` files of the wanted languages; open the listed entry
    # itself rather than a path rebuilt from the language stem.
    if ext != ".jsonl" or lang not in object_languages:
        continue
    # Explicit UTF-8: premises and statements are multilingual text.
    with open(os.path.join(xnli_test_dir, name), "r", encoding="utf-8") as f:
        xnli_data_test_dict[lang] = [
            {
                "premise": record["premise"],
                "statement": record["statement"],
                "answer": record["answer"],
            }
            for record in map(json.loads, f)
        ]

# Update the P-MMEval dictionaries
pmmeval_by_lang = {"en": P_MMEVAL_EN, "ko": P_MMEVAL_KO, "zh": P_MMEVAL_ZH}
for lang, records in xnli_data_test_dict.items():
    pmmeval_by_lang[object_languages_dict[lang]]["xnli"] = records

# Export Datasets
- We only need the average P-MMEval score, so each benchmark is exported as a single `all` subset.

In [9]:
# Sanity check: show which benchmarks were collected for the English subset.
P_MMEVAL_EN.keys()

dict_keys(['humaneval-xl', 'mgsm', 'mhellaswag', 'mifeval', 'mlogiqa', 'mmmlu', 'xnli'])

In [10]:
# Export every English benchmark; a dev split is passed for mmmlu only.
for benchmark_name, language_split in P_MMEVAL_EN.items():
    is_mmmlu = benchmark_name == "mmmlu"
    export_dir = f"/work/u5110390/BenchWeaver/evaluation_data/P-MMEval/en/{benchmark_name}"
    print(f"Exporting {benchmark_name}...")
    export_files_and_zip(
        benchmark_name,
        export_dir,
        language_split,
        # looked up lazily so the other benchmarks never touch the dev dict
        dev_split=mmmlu_data_dev_dict["en"] if is_mmmlu else None,
        dev_set=is_mmmlu,
        parquet_format=True,
    )
    print(f"Exporting {benchmark_name}... done")

Exporting humaneval-xl...
Index(['task_id', 'text', 'test_list'], dtype='object')
Exporting humaneval-xl... done
Exporting mgsm...
Index(['id', 'question', 'answer'], dtype='object')
Exporting mgsm... done
Exporting mhellaswag...
Index(['activity_label', 'split_type', 'question', 'A', 'B', 'C', 'D',
       'answer'],
      dtype='object')
Exporting mhellaswag... done
Exporting mifeval...
Index(['key', 'question', 'instruction_id_list', 'kwargs'], dtype='object')
Exporting mifeval... done
Exporting mlogiqa...
Index(['question', 'context', 'A', 'B', 'C', 'D', 'answer'], dtype='object')
Exporting mlogiqa... done
Exporting mmmlu...
Index(['difficulty', 'question', 'A', 'B', 'C', 'D', 'answer', 'subject'], dtype='object')
Exporting mmmlu... done
Exporting xnli...
Index(['premise', 'statement', 'answer'], dtype='object')
Exporting xnli... done


In [None]:
# Export every Chinese benchmark; a dev split is passed for mmmlu only.
for benchmark_name, language_split in P_MMEVAL_ZH.items():
    is_mmmlu = benchmark_name == "mmmlu"
    export_dir = f"/work/u5110390/BenchWeaver/evaluation_data/P-MMEval/zh/{benchmark_name}"
    print(f"Exporting {benchmark_name}...")
    export_files_and_zip(
        benchmark_name,
        export_dir,
        language_split,
        # looked up lazily so the other benchmarks never touch the dev dict
        dev_split=mmmlu_data_dev_dict["zh"] if is_mmmlu else None,
        dev_set=is_mmmlu,
        parquet_format=True,
    )
    print(f"Exporting {benchmark_name}... done")

Exporting humaneval-xl...
Index(['task_id', 'text', 'test_list'], dtype='object')
Exporting humaneval-xl... done
Exporting mgsm...
Index(['id', 'question', 'answer'], dtype='object')
Exporting mgsm... done
Exporting mhellaswag...
Index(['activity_label', 'split_type', 'question', 'A', 'B', 'C', 'D',
       'answer'],
      dtype='object')
Exporting mhellaswag... done
Exporting mifeval...
Index(['key', 'question', 'instruction_id_list', 'kwargs'], dtype='object')
Exporting mifeval... done
Exporting mlogiqa...
Index(['question', 'context', 'A', 'B', 'C', 'D', 'answer'], dtype='object')
Exporting mlogiqa... done
Exporting mmmlu...
Index(['difficulty', 'question', 'A', 'B', 'C', 'D', 'answer', 'subject'], dtype='object')
Exporting mmmlu... done
Exporting xnli...
Index(['premise', 'statement', 'answer'], dtype='object')
Exporting xnli... done


In [12]:
# Export every Korean benchmark; a dev split is passed for mmmlu only.
for benchmark_name, language_split in P_MMEVAL_KO.items():
    is_mmmlu = benchmark_name == "mmmlu"
    export_dir = f"/work/u5110390/BenchWeaver/evaluation_data/P-MMEval/ko/{benchmark_name}"
    print(f"Exporting {benchmark_name}...")
    export_files_and_zip(
        benchmark_name,
        export_dir,
        language_split,
        # looked up lazily so the other benchmarks never touch the dev dict
        dev_split=mmmlu_data_dev_dict["ko"] if is_mmmlu else None,
        dev_set=is_mmmlu,
        parquet_format=True,
    )
    print(f"Exporting {benchmark_name}... done")

Exporting humaneval-xl...
Index(['task_id', 'text', 'test_list'], dtype='object')
Exporting humaneval-xl... done
Exporting mgsm...
Index(['answer', 'id', 'question'], dtype='object')
Exporting mgsm... done
Exporting mhellaswag...
Index(['activity_label', 'split_type', 'question', 'A', 'B', 'C', 'D',
       'answer'],
      dtype='object')
Exporting mhellaswag... done
Exporting mifeval...
Index(['key', 'question', 'instruction_id_list', 'kwargs'], dtype='object')
Exporting mifeval... done
Exporting mlogiqa...
Index(['question', 'context', 'A', 'B', 'C', 'D', 'answer'], dtype='object')
Exporting mlogiqa... done
Exporting mmmlu...
Index(['difficulty', 'question', 'A', 'B', 'C', 'D', 'answer', 'subject'], dtype='object')
Exporting mmmlu... done
Exporting xnli...
Index(['premise', 'statement', 'answer'], dtype='object')
Exporting xnli... done
