# CLIcK

In [None]:
from datasets import load_dataset
from typing import Tuple
import pandas as pd

# Load the CLIcK dataset and convert its 'train' split to a pandas DataFrame.
ds = load_dataset("EunsuKim/CLIcK")
df = ds['train'].to_pandas()
# Notebook display: preview the first rows.
df.head()

In [None]:
def extract_category(id_text: str) -> str:
    """Return the part of ``id_text`` that precedes its first underscore.

    When ``id_text`` contains no underscore the whole string is returned.
    """
    prefix, _, _ = id_text.partition('_')
    return prefix


In [None]:
# The category of a row is the prefix of its id (text before the first '_').
df['categories'] = df['id'].apply(extract_category)
# Normalise every choice to a plain Python list of strings.
df['choices'] = df['choices'].apply(
    lambda options: [str(option) for option in options]
)
print(set(df['categories']))

# Distinct category names; used later to fetch each group for export.
keys = list(set(df['categories']))

In [None]:
df['choices'].iloc[0]

In [None]:
# Keep only the columns that are exported.
df = df[['paragraph', 'question', 'choices', 'answer', 'categories']]
# Group rows by category so each group can be retrieved with get_group().
dfs = df.groupby('categories')

In [None]:
dfs.get_group('TK')

In [None]:
# Save each category group to <folder>/<category>_test.csv.
import os

folder = "/work/u5110390/BenchWeaver/evaluation_data/click/data/test"
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(folder, exist_ok=True)
for key in keys:
    group = dfs.get_group(key)
    group.to_csv(os.path.join(folder, f"{key}_test.csv"), index=False)

# HAE_RAE_BENCH 1.1

In [None]:
from datasets import load_dataset, get_dataset_config_names
from typing import Tuple
import pandas as pd


In [None]:
# Subsets exported with empty option columns (see process_opqa_task).
opqa = ['lyrics_denoising', 'proverbs_denoising']
# Subsets exported with their options expanded into columns A-E (see process_mcqa_task).
mcqa = ['correct_definition_matching', 'csat_geo', 'csat_law', 'csat_socio', 'date_understanding', 'general_knowledge', 'history', 'loan_words', 'rare_words', 'standard_nomenclature', 'reading_comprehension']

## MCQA

In [None]:
# Check options for each task: report every row whose option list does not
# contain exactly 5 entries, then show the distinct answer labels.
import ast
from tqdm.auto import tqdm

answers = []
with tqdm(total=len(mcqa)) as pbar:
    for task in mcqa:
        pbar.set_postfix_str(task)
        ds = load_dataset("HAERAE-HUB/HAE_RAE_BENCH_1.1", task)
        df = ds['test'].to_pandas()
        answers.extend(df['answer'].tolist())
        for idx, raw_options in enumerate(df['options']):
            try:
                option_list = ast.literal_eval(raw_options)
            # Only the errors literal_eval is documented to raise on malformed
            # input; a bare except would also swallow KeyboardInterrupt.
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a Python literal: treat the text as '|'-separated options.
                option_list = raw_options.split("|")
            if len(option_list) != 5:
                print(idx, raw_options)
        pbar.update(1)
print(set(answers))
print(len(set(answers)))

In [None]:
# Check question format: print the first query of every multiple-choice subset.
with tqdm(total=len(mcqa)) as pbar:
    for task in mcqa:
        pbar.set_postfix_str(task)
        ds = load_dataset("HAERAE-HUB/HAE_RAE_BENCH_1.1", task)
        df = ds['test'].to_pandas()
        print(
            f"-------------------{task}-------------------",
            df['query'].iloc[0],
            "=================================",
            sep="\n",
        )
        pbar.update(1)

In [None]:
# redefine the task
import re
from typing import List

def parse_options(options_list_text: str)->List[str]:
    """Parse the textual option list of one question into a list.

    The text is first read as a Python literal (e.g. ``"['a', 'b']"``). If it
    is not a valid literal, or the literal is not a list/tuple, the text is
    split on ``"|"`` instead.

    Args:
        options_list_text: Raw value of the ``options`` column.

    Returns:
        The options as a list.
    """
    try:
        option_list = ast.literal_eval(options_list_text)
    # Only the errors literal_eval is documented to raise on malformed input;
    # a bare except would also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return options_list_text.split("|")
    if not isinstance(option_list, (list, tuple)):
        # A bare literal such as "123" parses fine but is not an option list.
        return options_list_text.split("|")
    return list(option_list)

def clean_answer(answers: str)->str:
    """Return ``answers`` with every '(' and ')' character removed."""
    parenthesis = re.compile(r"\(|\)")
    return parenthesis.sub("", answers)


def process_mcqa_task(task) -> pd.DataFrame:
    """Load the 'test' split of one HAE-RAE subset and reshape it for export.

    The ``options`` column is parsed with ``parse_options`` and spread over
    the columns A-E, parentheses are stripped from ``answer``, ``query`` is
    copied to ``question`` and ``categories`` is set to "mcqa".

    Args:
        task: Configuration name passed to ``load_dataset``.

    Returns:
        A frame with the columns question, A, B, C, D, E, answer, categories.
    """
    option_labels = ["A", "B", "C", "D", "E"]

    def options_to_row(raw_options):
        # One Series per question, labelled A-E, so apply() yields a frame.
        return pd.Series(parse_options(raw_options), index=option_labels)

    df = load_dataset("HAERAE-HUB/HAE_RAE_BENCH_1.1", task)['test'].to_pandas()
    df = pd.concat([df, df['options'].apply(options_to_row)], axis=1)

    df['answer'] = df['answer'].apply(clean_answer)
    df['question'] = df['query']
    df['categories'] = "mcqa"
    return df[['question', *option_labels, 'answer', 'categories']]

In [None]:
import os

# Export every multiple-choice subset to <out_dir>/<task>_test.csv.
mcqa_out_dir = "/work/u5110390/BenchWeaver/evaluation_data/hae-rae-bench/data/test"
# to_csv does not create missing parent directories.
os.makedirs(mcqa_out_dir, exist_ok=True)
with tqdm(total=len(mcqa)) as pbar:
    for task in mcqa:
        pbar.set_postfix_str(task)
        df = process_mcqa_task(task)
        df.to_csv(os.path.join(mcqa_out_dir, f"{task}_test.csv"), index=False)
        pbar.update(1)

## OPQA

In [None]:
# Preview the first query / options / answer of every open-ended subset.
with tqdm(total=len(opqa)) as pbar:
    for task in opqa:
        pbar.set_postfix_str(task)
        ds = load_dataset("HAERAE-HUB/HAE_RAE_BENCH_1.1", task)
        df = ds['test'].to_pandas()
        print(
            f"-------------------{task}-------------------",
            df['query'].iloc[0],
            "=================================",
            df['options'].iloc[0],
            "=================================",
            df['answer'].iloc[0],
            sep="\n",
        )
        pbar.update(1)

In [None]:

def process_opqa_task(task) -> pd.DataFrame:
    """Load the 'test' split of one HAE-RAE subset in the export layout.

    The option columns A-E are filled with empty strings so the result has
    the same columns as the multiple-choice export; ``query`` is copied to
    ``question`` and ``categories`` is set to "opqa".

    Args:
        task: Configuration name passed to ``load_dataset``.

    Returns:
        A frame with the columns question, A, B, C, D, E, answer, categories.
    """
    df = load_dataset("HAERAE-HUB/HAE_RAE_BENCH_1.1", task)['test'].to_pandas()
    for option_label in ("A", "B", "C", "D", "E"):
        df[option_label] = ""
    df['question'] = df['query']
    df['categories'] = "opqa"

    export_columns = ['question', 'A', 'B', 'C', 'D', 'E', 'answer', 'categories']
    return df[export_columns]

In [None]:
import os

# Export every open-ended subset to <out_dir>/<task>_test.csv.
opqa_out_dir = "/work/u5110390/BenchWeaver/evaluation_data/hae-rae-bench/data/test"
# to_csv does not create missing parent directories.
os.makedirs(opqa_out_dir, exist_ok=True)
with tqdm(total=len(opqa)) as pbar:
    for task in opqa:
        pbar.set_postfix_str(task)
        df = process_opqa_task(task)
        df.to_csv(os.path.join(opqa_out_dir, f"{task}_test.csv"), index=False)
        pbar.update(1)

# KMMLU and KMMLU-HARD

In [None]:
supercategories = {
        "accounting": "HUMSS",
        "agricultural_sciences": "Other",
        "aviation_engineering_and_maintenance": "Applied Science",
        "biology": "STEM",
        "chemical_engineering": "STEM",
        "chemistry": "STEM",
        "civil_engineering": "STEM",
        "computer_science": "STEM",
        "construction": "Other",
        "criminal_law": "HUMSS",
        "ecology": "STEM",
        "economics": "HUMSS",
        "education": "HUMSS",
        "electrical_engineering": "STEM",
        "electronics_engineering": "Applied Science",
        "energy_management": "Applied Science",
        "environmental_science": "Applied Science",
        "fashion": "Other",
        "food_processing": "Other",
        "gas_technology_and_engineering": "Applied Science",
        "geomatics": "Applied Science",
        "health": "Other",
        "industrial_engineer": "Applied Science",
        "information_technology": "STEM",
        "interior_architecture_and_design": "Other",
        "law": "HUMSS",
        "machine_design_and_manufacturing": "Applied Science",
        "management": "HUMSS",
        "maritime_engineering": "Applied Science",
        "marketing": "Other",
        "materials_engineering": "STEM",
        "mechanical_engineering": "STEM",
        "nondestructive_testing": "Applied Science",
        "patent": "Other",
        "political_science_and_sociology": "HUMSS",
        "psychology": "HUMSS",
        "public_safety": "Other",
        "railway_and_automotive_engineering": "Applied Science",
        "real_estate": "Other",
        "refrigerating_machinery": "Other",
        "social_welfare": "HUMSS",
        "taxation": "HUMSS",
        "telecommunications_and_wireless_technology": "Applied Science",
        "korean_history": "HUMSS",
        "math": "STEM"
    }

In [None]:
set(supercategories.values())

In [None]:
import json

import os

# subject -> {"name": subject with spaces instead of underscores,
#             "category": its supercategory}
mapping = {
    subject: {"name": subject.replace("_", " "), "category": supercategory}
    for subject, supercategory in supercategories.items()
}

# The same mapping is written for both KMMLU and KMMLU-HARD.
for mapping_path in (
    "/work/u5110390/BenchWeaver/evaluation_data/kmmlu/mapping.json",
    "/work/u5110390/BenchWeaver/evaluation_data/kmmlu-hard/mapping.json",
):
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
    with open(mapping_path, "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)

In [None]:
# Subject names in the hyphenated, capitalised form passed to load_dataset("HAERAE-HUB/KMMLU", ...).
task = ['Accounting', 'Agricultural-Sciences', 'Aviation-Engineering-and-Maintenance', 'Biology', 'Chemical-Engineering', 'Chemistry', 'Civil-Engineering', 'Computer-Science', 'Construction', 'Criminal-Law', 'Ecology', 'Economics', 'Education', 'Electrical-Engineering', 'Electronics-Engineering', 'Energy-Management', 'Environmental-Science', 'Fashion', 'Food-Processing', 'Gas-Technology-and-Engineering', 'Geomatics', 'Health', 'Industrial-Engineer', 'Information-Technology', 'Interior-Architecture-and-Design', 'Law', 'Machine-Design-and-Manufacturing', 'Management', 'Maritime-Engineering', 'Marketing', 'Materials-Engineering', 'Mechanical-Engineering', 'Nondestructive-Testing', 'Patent', 'Political-Science-and-Sociology', 'Psychology', 'Public-Safety', 'Railway-and-Automotive-Engineering', 'Real-Estate', 'Refrigerating-Machinery', 'Social-Welfare', 'Taxation', 'Telecommunications-and-Wireless-Technology', 'Korean-History', 'Math']

# Same names in lower snake_case.
task_list = [t.lower().replace("-", "_") for t in task]
# Notebook display.
task_list

In [None]:
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

In [None]:
import os
def casting_idx2option(idx) -> str:
    """Translate a 1-based option index into its option letter.

    ``idx`` is converted with ``str()`` first, so both ``1`` and ``"1"`` map
    to "A". Raises ``KeyError`` when the converted value is not "1".."6".
    """
    key = str(idx)
    letters = {
        "1": "A",
        "2": "B",
        "3": "C",
        "4": "D",
        "5": "E",
        "6": "F",
    }
    return letters[key]

# Export every KMMLU subject to CSV with answers converted from 1-based
# indices to letters.
kmmlu_data_dir = "/work/u5110390/BenchWeaver/evaluation_data/kmmlu/data"
# The output folders do not depend on the subject, so create them once.
for out_split in ("dev", "test", "val"):
    os.makedirs(os.path.join(kmmlu_data_dir, out_split), exist_ok=True)

with tqdm(total=len(task)) as pbar:
    for t in task:
        pbar.set_postfix_str(t)
        ds = load_dataset("HAERAE-HUB/KMMLU", t)
        task_name = t.lower().replace("-", "_")
        # (source split, output split): 'train' is exported as "dev" and
        # 'dev' as "val"; 'test' keeps its name.
        for source_split, out_split in (("train", "dev"), ("dev", "val"), ("test", "test")):
            split_df = ds[source_split].to_pandas()
            split_df['answer'] = split_df['answer'].apply(casting_idx2option)
            split_df.to_csv(
                os.path.join(kmmlu_data_dir, out_split, f"{task_name}_{out_split}.csv"),
                encoding="utf-8",
                index=False,
            )
        pbar.update(1)

In [None]:

        
# Export every KMMLU-HARD subject to CSV with answers converted from 1-based
# indices to letters and the 'cot' column renamed to 'explanation'.
kmmlu_hard_data_dir = "/work/u5110390/BenchWeaver/evaluation_data/kmmlu-hard/data"
# The output folders do not depend on the subject, so create them once.
for split_name in ("dev", "test"):
    os.makedirs(os.path.join(kmmlu_hard_data_dir, split_name), exist_ok=True)

with tqdm(total=len(task)) as pbar:
    for t in task:
        pbar.set_postfix_str(t)
        # KMMLU-HARD is loaded with the lower snake_case subject name.
        task_name = t.lower().replace("-", "_")
        ds = load_dataset("HAERAE-HUB/KMMLU-HARD", task_name)
        for split_name in ("dev", "test"):
            split_df = ds[split_name].to_pandas()
            split_df['answer'] = split_df['answer'].apply(casting_idx2option)
            # rename() is a no-op for frames without a 'cot' column.
            split_df = split_df.rename(columns={"cot": "explanation"})
            split_df.to_csv(
                os.path.join(kmmlu_hard_data_dir, split_name, f"{task_name}_{split_name}.csv"),
                encoding="utf-8",
                index=False,
            )
        pbar.update(1)

In [None]:
import pandas as pd
df = pd.read_csv("/work/u5110390/BenchWeaver/evaluation_data/kmmlu/data/test/agricultural_sciences_test.csv")

# TMMLU+

In [None]:
from datasets import load_dataset
import os
import pandas as pd
import zipfile
import shutil

task_list = [
             'engineering_math', 'dentistry', 'traditional_chinese_medicine_clinical_medicine', 'clinical_psychology', 'technical', 'culinary_skills', 'mechanical', 'logic_reasoning', 'real_estate',
             'general_principles_of_law', 'finance_banking', 'anti_money_laundering', 'ttqav2', 'marketing_management', 'business_management', 'organic_chemistry', 'advance_chemistry',
             'physics', 'secondary_physics', 'human_behavior', 'national_protection', 'jce_humanities', 'politic_science', 'agriculture', 'official_document_management',
             'financial_analysis', 'pharmacy', 'educational_psychology', 'statistics_and_machine_learning', 'management_accounting', 'introduction_to_law', 'computer_science', 'veterinary_pathology',
             'accounting', 'fire_science', 'optometry', 'insurance_studies', 'pharmacology', 'taxation', 'trust_practice', 'geography_of_taiwan', 'physical_education', 'auditing', 'administrative_law',
             'education_(profession_level)', 'economics', 'veterinary_pharmacology', 'nautical_science', 'occupational_therapy_for_psychological_disorders',
             'basic_medical_science', 'macroeconomics', 'trade', 'chinese_language_and_literature', 'tve_design', 'junior_science_exam', 'junior_math_exam', 'junior_chinese_exam',
             'junior_social_studies', 'tve_mathematics', 'tve_chinese_language', 'tve_natural_sciences', 'junior_chemistry', 'music', 'education', 'three_principles_of_people',
             'taiwanese_hokkien'
            ]

base_dir = "/work/u5110390/BenchWeaver/evaluation_data/tmmluplus/data"
# source split -> output split: 'validation' is exported as "val" and
# 'train' as "dev"; 'test' keeps its name.
split_names = {"validation": "val", "train": "dev", "test": "test"}
# The output folders do not depend on the task, so create them once.
for out_split in split_names.values():
    os.makedirs(os.path.join(base_dir, out_split), exist_ok=True)

for task in task_list:
    # Load the configuration once and reuse it for all three splits.
    ds = load_dataset('ikala/tmmluplus', task)
    for source_split, out_split in split_names.items():
        out_file = os.path.join(base_dir, out_split, f"{task}_{out_split}.csv")
        ds[source_split].to_pandas().to_csv(out_file, index=False)

# Define the zip file name
# NOTE: this is a relative path, so the archive is created in the current
# working directory of the notebook.
zip_file = "tmmluplus.zip"

# Function to zip a directory and its contents
def zip_dir(directory, zip_file):
    """Create a deflate-compressed archive of all files below ``directory``.

    The tree is walked recursively with ``os.walk``; every file is stored
    under its path relative to ``directory``. ``zip_file`` is opened in
    write mode.
    """
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(directory):
            for filename in filenames:
                source_path = os.path.join(current_dir, filename)
                archive.write(source_path, os.path.relpath(source_path, directory))

# Zip the directory
zip_dir(base_dir, zip_file)
# Remove the base directory after zipping
# (only reached when zip_dir returned without raising; afterwards the CSV
# files exist only inside the archive).
shutil.rmtree(base_dir)

In [None]:
import pandas as pd
# Load the tab-separated TMMLU+ subject table; the next cells read its
# 'subject', 'name' and 'category' columns.
df = pd.read_csv("/work/u5110390/BenchWeaver/evaluation_data/tmmluplus/subject.tsv", delimiter="\t")
# Notebook display.
df

In [None]:
# Maps each value of the subject table's 'category' column to the coarse
# category that is written to mapping.json.
mmlu_mapping = {
    "health": "Other",
    "psychology": "Social Sciences",
    "other": "Other",
    "law": "Social Sciences",
    "business": "Other",
    "culture": "Humanities",
    "chemistry": "STEM",
    "physics": "STEM",
    "politics": "Social Sciences",
    "philosophy": "Humanities",
    "math": "STEM",
    "biology": "STEM",
    "engineering": "STEM",
    "computer science": "STEM",
    "education": "Social Sciences",
    "economics": "Social Sciences",
    "geography": "Social Sciences"
}


In [None]:
# subject -> {"name": display name, "category": coarse category}, built row
# by row from the subject table. A 'category' value that is missing from
# mmlu_mapping raises KeyError.
catogories = {
    subject: {"name": name, "category": mmlu_mapping[category]}
    for subject, name, category in zip(df['subject'], df['name'], df['category'])
}

In [None]:
import json
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file
# encoding instead of relying on the platform's locale default.
with open("/work/u5110390/BenchWeaver/evaluation_data/tmmluplus/mapping.json", "w", encoding="utf-8") as f:
    json.dump(catogories, f, ensure_ascii=False, indent=2)

# TMLU

In [1]:
mapping ={
    "AST_civics": {"name": "分科測驗公民", "category": "Social Science"},
    "AST_geography": {"name": "分科測驗地理", "category": "Social Science"},
    "CAP_civics": {"name": "會考公民", "category": "Social Science"},
    "CAP_geography": {"name": "會考地理", "category": "Social Science"},
    "GSAT_civics": {"name": "學測公民", "category": "Social Science"},
    "GSAT_geography": {"name": "學測地理", "category": "Social Science"},
    "accountant": {"name": "會計師", "category": "Social Science"},
    "clinical_psychologist": {"name": "臨床心理師", "category": "Social Science"},
    "AST_biology": {"name": "分科測驗生物", "category": "STEM"},
    "AST_chemistry": {"name": "分科測驗化學", "category": "STEM"},
    "AST_mathematics": {"name": "分科測驗數學", "category": "STEM"},
    "AST_physics": {"name": "分科測驗物理", "category": "STEM"},
    "CAP_biology": {"name": "會考生物", "category": "STEM"},
    "CAP_chemistry": {"name": "會考化學", "category": "STEM"},
    "CAP_earth_science": {"name": "會考地球科學", "category": "STEM"},
    "CAP_mathematics": {"name": "會考數學", "category": "STEM"},
    "CAP_physics": {"name": "會考物理", "category": "STEM"},
    "GSAT_biology": {"name": "學測生物", "category": "STEM"},
    "GSAT_chemistry": {"name": "學測化學", "category": "STEM"},
    "GSAT_earth_science": {"name": "學測地球科學", "category": "STEM"},
    "GSAT_mathematics": {"name": "學測數學", "category": "STEM"},
    "GSAT_physics": {"name": "學測物理", "category": "STEM"},
    "AST_chinese": {"name": "分科測驗國文", "category": "Humanities"},
    "AST_history": {"name": "分科測驗歷史", "category": "Humanities"},
    "CAP_chinese": {"name": "會考國文", "category": "Humanities"},
    "CAP_history": {"name": "會考歷史", "category": "Humanities"},
    "GSAT_chinese": {"name": "學測國文", "category": "Humanities"},
    "GSAT_history": {"name": "學測歷史", "category": "Humanities"},
    "tour_guide": {"name": "導遊", "category": "Humanities"},
    "tour_leader": {"name": "領隊", "category": "Humanities"},
    "lawyer_qualification": {"name": "律師資格", "category": "Humanities"},
    "driving_rule": {"name": "台灣駕駛規則", "category": "Taiwan Specific"},
    "teacher_qualification": {"name": "教師資格", "category": "Taiwan Specific"},
    "taiwan_tourist_resources": {"name": "台灣觀光資源", "category": "Taiwan Specific"},
    "basic_traditional_chinese_medicine": {"name": "中醫基礎醫學", "category": "Others"},
    "clinical_traditional_chinese_medicine": {"name": "中醫針灸", "category": "Others"},
    "nutritionist": {"name": "營養師", "category": "Others"}
}


In [2]:
task_list = list(mapping.keys())
task_list

['AST_civics',
 'AST_geography',
 'CAP_civics',
 'CAP_geography',
 'GSAT_civics',
 'GSAT_geography',
 'accountant',
 'clinical_psychologist',
 'AST_biology',
 'AST_chemistry',
 'AST_mathematics',
 'AST_physics',
 'CAP_biology',
 'CAP_chemistry',
 'CAP_earth_science',
 'CAP_mathematics',
 'CAP_physics',
 'GSAT_biology',
 'GSAT_chemistry',
 'GSAT_earth_science',
 'GSAT_mathematics',
 'GSAT_physics',
 'AST_chinese',
 'AST_history',
 'CAP_chinese',
 'CAP_history',
 'GSAT_chinese',
 'GSAT_history',
 'tour_guide',
 'tour_leader',
 'lawyer_qualification',
 'driving_rule',
 'teacher_qualification',
 'taiwan_tourist_resources',
 'basic_traditional_chinese_medicine',
 'clinical_traditional_chinese_medicine',
 'nutritionist']

In [3]:
import json
# The mapping holds Chinese names and ensure_ascii=False writes them
# verbatim, so pin the file encoding instead of relying on the platform's
# locale default.
with open("/work/u5110390/BenchWeaver/evaluation_data/tmlu/mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

In [7]:
from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd
import os
import zipfile
import shutil

def process_df(df:pd.DataFrame) -> pd.DataFrame:
    """Reshape one TMLU split into question / choices / answer / explanation.

    For every row, ``choices`` is the ``str()`` of the list of the values in
    columns A-F, and ``answer`` is replaced by the value of the option column
    that the original ``answer`` letter names. ``explanation`` is kept when
    the input has that column and is a single space otherwise.

    The input frame is not modified, may carry any index, and may be empty.

    Args:
        df: Frame with the columns question, A, B, C, D, E, F and answer
            (``answer`` must hold the name of one of the frame's columns).

    Returns:
        A new frame with a fresh 0..n-1 index and exactly the columns
        question, choices, answer, explanation.

    Raises:
        KeyError: If a required column is missing or an ``answer`` value is
            not a column name.
    """
    option_columns = ['A', 'B', 'C', 'D', 'E', 'F']
    # reset_index returns a new frame, which keeps the caller's data intact
    # and guarantees that positions 0..n-1 are valid labels for .at below.
    rows = df.reset_index(drop=True)

    choices = []
    answers = []
    for pos in range(len(rows)):
        choices.append(str([rows.at[pos, col] for col in option_columns]))
        # The answer letter selects the column holding the answer text.
        answers.append(rows.at[pos, rows.at[pos, 'answer']])

    if 'explanation' in rows.columns:
        explanations = rows['explanation']
    else:
        explanations = [" "] * len(rows)

    return pd.DataFrame(
        {
            'question': rows['question'],
            'choices': choices,
            'answer': answers,
            'explanation': explanations,
        },
        index=rows.index,
        columns=['question', 'choices', 'answer', 'explanation'],
    )

base_dir = "/work/u5110390/BenchWeaver/evaluation_data/tmlu/data"
# The dev/test output folders are the same for every task, so create them
# once (this also creates base_dir itself).
dev_dir = os.path.join(base_dir, 'dev')
test_dir = os.path.join(base_dir, 'test')
os.makedirs(dev_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Export the reshaped 'dev' and 'test' splits of every TMLU subject.
for task in tqdm(mapping.keys()):
    ds = load_dataset("miulab/tmlu", task)
    dev_df = process_df(ds['dev'].to_pandas())
    test_df = process_df(ds['test'].to_pandas())
    dev_df.to_csv(os.path.join(dev_dir, f"{task}_dev.csv"), index=False)
    test_df.to_csv(os.path.join(test_dir, f"{task}_test.csv"), index=False)

# Define the zip file name
# The archive sits next to base_dir (.../tmlu/data), not inside it, so it is
# neither packed into itself nor deleted together with base_dir.
zip_file = "/work/u5110390/BenchWeaver/evaluation_data/tmlu/tmlu.zip"

# Function to zip a directory and its contents
def zip_dir(directory, zip_file):
    """Pack every file found under ``directory`` into the archive ``zip_file``.

    ``zip_file`` is opened in write mode with deflate compression. Files are
    collected recursively via ``os.walk`` and stored under their path
    relative to ``directory``.
    """
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder_path, _folders, names in os.walk(directory):
            for name in names:
                full_path = os.path.join(folder_path, name)
                member_name = os.path.relpath(full_path, directory)
                archive.write(full_path, member_name)

# Zip the directory
zip_dir(base_dir, zip_file)
# Remove the base directory after zipping
# (only reached when zip_dir returned without raising; afterwards the CSV
# files exist only inside the archive).
shutil.rmtree(base_dir)

  0%|          | 0/37 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/39.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/71.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/81.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/246k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/198k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.06k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/46.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.06k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/89.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/42.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

# DRCD

In [None]:
import json
from typing import Any
def load_json(file_path:str) -> Any:
    """Read the JSON document stored at ``file_path`` and return it.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's locale default encoding.

    Args:
        file_path: Path of the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Load the three DRCD JSON files (training / dev / test).
train_data = load_json("/work/u5110390/BenchWeaver/evaluation_data/drcd/DRCD_training.json")
dev_data = load_json("/work/u5110390/BenchWeaver/evaluation_data/drcd/DRCD_dev.json")
test_data = load_json("/work/u5110390/BenchWeaver/evaluation_data/drcd/DRCD_test.json")

In [None]:
# One column-oriented accumulator per split. Each is an independent dict
# with its own empty lists, sharing the same six columns.
train_record, dev_record, test_record = (
    {
        column: []
        for column in ("title", "id", "context", "question", "answer", "answer_start")
    }
    for _ in range(3)
)

In [None]:
from tqdm.auto import tqdm
# Flatten every split into its record dict: one entry per question, walking
# data -> paragraphs -> qas. Only the first listed answer of a question
# is kept.
for split_data, record in (
    (train_data, train_record),
    (dev_data, dev_record),
    (test_data, test_record),
):
    for article in tqdm(split_data['data']):
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                first_answer = qa['answers'][0]
                record['title'].append(article['title'])
                record['id'].append(qa['id'])
                record['context'].append(paragraph['context'])
                record['question'].append(qa['question'])
                record['answer'].append(first_answer['text'])
                record['answer_start'].append(first_answer['answer_start'])

In [None]:
import pandas as pd
# Turn the column-oriented record dicts into DataFrames.
train_df = pd.DataFrame(train_record)
dev_df = pd.DataFrame(dev_record)
test_df = pd.DataFrame(test_record)
# Notebook display.
test_df

In [None]:
import os

drcd_data_dir = "/work/u5110390/BenchWeaver/evaluation_data/drcd/data"
# (output split, frame): the training data is exported as "dev" and the dev
# data as "val"; test keeps its name.
for out_split, split_df in (("dev", train_df), ("val", dev_df), ("test", test_df)):
    out_dir = os.path.join(drcd_data_dir, out_split)
    # to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    split_df.to_csv(os.path.join(out_dir, f"all_{out_split}.csv"), index=False)

# awesome-taiwan-knowledge

In [None]:
import json
from typing import Any
import pandas as pd
def load_json(file_path:str) -> Any:
    """Read the JSON document stored at ``file_path`` and return it.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's locale default encoding.

    Args:
        file_path: Path of the JSON file.

    Returns:
        The decoded JSON value.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

data = load_json("/work/u5110390/BenchWeaver/evaluation_data/awesome-taiwan-knowledge/TTQA_1.0.0_tw_llama_v1.0.json")
# Build the table column by column from the loaded items.
# NOTE(review): 'title' is filled from each item's 'answer' key and
# 'question' from its 'examples' key -- confirm this matches the TTQA schema.
data_record = {
    "title": [item['answer'] for item in data],
    "question": [item['examples'] for item in data],
    # All accepted answers are joined into one reference string.
    "answer": [
        "以下皆為正確答案：" + "或".join(f"「{ans}」" for ans in item['correct_answers'])
        for item in data
    ],
}

df = pd.DataFrame(data_record)
df

In [None]:
import os

atk_test_file = "/work/u5110390/BenchWeaver/evaluation_data/awesome-taiwan-knowledge/data/test/all_test.csv"
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(atk_test_file), exist_ok=True)
df.to_csv(atk_test_file, index=False)

# CMMLU

In [None]:
name_en2zh = {
    "agronomy": "农学",
    "anatomy": "解剖学",
    "ancient_chinese": "古汉语",
    "arts": "艺术学",
    "astronomy": "天文学",
    "business_ethics": "商业伦理",
    "chinese_civil_service_exam": "中国公务员考试",
    "chinese_driving_rule": "中国驾驶规则",
    "chinese_food_culture": "中国饮食文化",
    "chinese_foreign_policy": "中国外交政策",
    "chinese_history":"中国历史",
    "chinese_literature": "中国文学",
    "chinese_teacher_qualification": "中国教师资格",
    "clinical_knowledge": "临床知识",
    "college_actuarial_science":"大学精算学",
    "college_education":"大学教育学",
    "college_engineering_hydrology": "大学工程水文学",
    "college_law": "大学法律",
    "college_mathematics": "大学数学",
    "college_medical_statistics":"大学医学统计",
    "college_medicine": "大学医学",
    "computer_science": "计算机科学",
    "computer_security": "计算机安全",
    "conceptual_physics": "概念物理学",
    "construction_project_management": "建设工程管理",
    "economics": "经济学",
    "education": "教育学",
    "electrical_engineering": "电气工程",
    "elementary_chinese":"小学语文",
    "elementary_commonsense":"小学常识",
    "elementary_information_and_technology": "小学信息技术",
    "elementary_mathematics": "初等数学",
    "ethnology": "民族学",
    "food_science": "食品科学",
    "genetics": "遗传学",
    "global_facts": "全球事实",
    "high_school_biology": "高中生物",
    "high_school_chemistry": "高中化学",
    "high_school_geography": "高中地理",
    "high_school_mathematics": "高中数学",
    "high_school_physics": "高中物理学",
    "high_school_politics": "高中政治",
    "human_sexuality": "人类性行为",
    "international_law": "国际法学",
    "journalism": "新闻学",
    "jurisprudence": "法理学",
    "legal_and_moral_basis": "法律与道德基础",
    "logical": "逻辑学",
    "machine_learning": "机器学习",
    "management": "管理学",
    "marketing": "市场营销",
    "marxist_theory": "马克思主义理论",
    "modern_chinese": "现代汉语",
    "nutrition": "营养学",
    "philosophy": "哲学",
    "professional_accounting": "专业会计",
    "professional_law": "专业法学",
    "professional_medicine": "专业医学",
    "professional_psychology": "专业心理学",
    "public_relations": "公共关系",
    "security_study":"安全研究",
    "sociology": "社会学",
    "sports_science": "体育学",
    "traditional_chinese_medicine": "中医中药",
    "virology": "病毒学",
    "world_history":"世界历史",
    "world_religions": "世界宗教",
}

subcategories = {
    "agronomy": ['other'],
    "anatomy": ['biology'],
    "ancient_chinese": ['linguistics','china specific'],
    "arts": ['arts'],
    "astronomy": ['physics'],
    "business_ethics": ['business'],
    "chinese_civil_service_exam": ['politics','china specific'],
    "chinese_driving_rule": ['other','china specific'],
    "chinese_food_culture": ['culture','china specific'],
    "chinese_foreign_policy": ['politics','china specific'],
    "chinese_history":['history','china specific'],
    "chinese_literature": ['literature','china specific'],
    "chinese_teacher_qualification": ['education','china specific'],
    "college_actuarial_science":['math'],
    "college_education":['education'],
    "college_engineering_hydrology": ['engineering'],
    "college_law": ['law'],
    "college_mathematics": ['math'],
    "college_medical_statistics":['statistics'],
    "clinical_knowledge": ['other'],
    "college_medicine": ['other'],
    "computer_science": ['computer science'],
    "computer_security": ['other'],
    "conceptual_physics": ['physics'],
    "construction_project_management": ['other','china specific'],
    "economics": ['economics'],
    "education": ['education'],
    "elementary_chinese":['linguistics','china specific'],
    "elementary_commonsense":['other','china specific'],
    "elementary_information_and_technology": ['other'],
    "electrical_engineering": ['engineering'],
    "elementary_mathematics": ['math'],
    "ethnology": ['culture','china specific'],
    "food_science": ['other'],
    "genetics": ['biology'],
    "global_facts": ['global'],
    "high_school_biology": ['biology'],
    "high_school_chemistry": ['chemistry'],
    "high_school_geography": ['geography'],
    "high_school_mathematics": ['math'],
    "high_school_physics": ['physics'],
    "high_school_politics": ['politics','china specific'],
    "human_sexuality": ['other'],
    "international_law": ['law'],
    "journalism": ['sociology'],
    "jurisprudence": ['law'],
    "legal_and_moral_basis": ['other'],
    "logical": ['philosophy'],
    "machine_learning": ['computer science'],
    "management": ['business'],
    "marketing": ['business'],
    "marxist_theory": ['philosophy'],
    "modern_chinese": ['linguistics','china specific'],
    "nutrition": ['other'],
    "philosophy": ['philosophy'],
    "professional_accounting": ['business'],
    "professional_law": ['law'],
    "professional_medicine": ['other'],
    "professional_psychology": ['psychology'],
    "public_relations": ['politics'],
    "security_study": ['politics'],
    "sociology": ['culture'],
    "sports_science": ['other'],
    "traditional_chinese_medicine": ['other','china specific'],
    "virology": ['biology'],
    "world_history":['history'],
    "world_religions": ['global'],
}

# Top-level category -> subcategory tags that belong to it. The
# 'china specific' tag is intentionally left out of the active entries (see
# the commented-out line), so no subject is assigned to it.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
    "Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
    "Other":["other"],
    # "China specific": ["china specific"],
}

In [None]:
# subject -> {"name": Chinese display name, "category": top-level category}
mapping = {}

for name, zh_name in name_en2zh.items():
    # Only the first listed subcategory decides the top-level category; any
    # further tags of the subject are ignored.
    sub_cat = subcategories[name][0]
    for cat, sub_cat_list in categories.items():
        if sub_cat in sub_cat_list:
            category = cat
            break
    else:
        # No category lists this subcategory: fall back to "Other"
        # explicitly instead of relying on the leaked loop variable.
        category = "Other"
    mapping[name] = {
        "name": zh_name,
        "category": category
    }

In [None]:
import json
# ensure_ascii=False writes non-ASCII names verbatim, so pin the file encoding
# instead of relying on the locale default.
with open("/work/u5110390/BenchWeaver/evaluation_data/cmmlu/mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

In [None]:
from datasets import load_dataset
import os
import pandas as pd
import zipfile
import shutil

# CMMLU task names to export; each one is passed to load_dataset as the
# dataset configuration and reused as the CSV file-name prefix.
task_list = ['agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', 
'chinese_teacher_qualification', 'clinical_knowledge', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology', 'college_law', 'college_mathematics', 'college_medical_statistics', 'college_medicine', 'computer_science',
'computer_security', 'conceptual_physics', 'construction_project_management', 'economics', 'education', 'electrical_engineering', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology', 'elementary_mathematics', 
'ethnology', 'food_science', 'genetics', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics', 'high_school_physics', 'high_school_politics', 'human_sexuality',
'international_law', 'journalism', 'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing', 'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law', 
'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology', 'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions']


base_dir = "/work/u5110390/BenchWeaver/evaluation_data/cmmlu/data"
# One output directory per exported split; created once, up front.
split_dirs = {split: os.path.join(base_dir, split) for split in ('dev', 'test')}
for split_dir in split_dirs.values():
    os.makedirs(split_dir, exist_ok=True)

for task in task_list:
    # Load each task a single time and reuse it for both splits.
    task_dataset = load_dataset('haonan-li/cmmlu', task)
    for split, split_dir in split_dirs.items():
        split_df = task_dataset[split].to_pandas()
        # Copy the two capitalised columns to lower-case names, then drop the
        # originals; the new columns therefore end up last in the CSV.
        split_df['question'] = split_df['Question']
        split_df['answer'] = split_df['Answer']
        split_df = split_df.drop(columns=['Question', 'Answer'])
        split_df.to_csv(os.path.join(split_dir, f"{task}_{split}.csv"), index=False)

# Define the zip file name
zip_file = "cmmlu.zip"

def zip_dir(directory, zip_file):
    """Write every file found under *directory* into a new deflate-compressed zip.

    Archive member names are the file paths relative to *directory*.
    An existing *zip_file* is overwritten (the archive is opened in 'w' mode).
    """
    with zipfile.ZipFile(zip_file, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, file_names in os.walk(directory):
            for file_name in file_names:
                source_path = os.path.join(current_dir, file_name)
                member_name = os.path.relpath(source_path, directory)
                archive.write(source_path, member_name)

# Zip the directory
# (zip_file is a relative path, so the archive is written to the current
# working directory)
zip_dir(base_dir, zip_file)
# Remove the base directory after zipping
# (destructive: afterwards the exported CSV files exist only inside the archive)
shutil.rmtree(base_dir)

# TAIDE Bench

In [3]:
import json
import pandas as pd
import os
export_dir = "/work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test"
os.makedirs(export_dir, exist_ok=True)
file_path = "/work/u5110390/BenchWeaver/evaluation_data/taide-bench/data.jsonl"
# Explicit UTF-8 so reading does not depend on the locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
# Task name -> list of row dicts (turned into DataFrames only at export time).
dfs = {
    "en2zh": [],
    "zh2en": [],
    "summary": [],
    "letter": [],
    "essay": [],
}
for data_dict in data:
    rows = dfs.get(data_dict['task'])
    if rows is None:
        # Records of any other task are not exported.
        continue
    rows.append({
        "task": data_dict["task"],
        "qid": data_dict["qid"],
        "question": data_dict["prompt"],
        "answer": data_dict["answer"],
    })

for task, df in dfs.items():
    df = pd.DataFrame(df)
    df.to_csv(f"{export_dir}/{task}_test.csv", index=False)
    print(f"Exported {task} data to {export_dir}/{task}_test.csv")

Exported en2zh data to /work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test/en2zh_test.csv
Exported zh2en data to /work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test/zh2en_test.csv
Exported summary data to /work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test/summary_test.csv
Exported letter data to /work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test/letter_test.csv
Exported essay data to /work/u5110390/BenchWeaver/evaluation_data/taide-bench/data/test/essay_test.csv


# CCPM

In [1]:
import json
train_file_path = "/work/u5110390/BenchWeaver/evaluation_data/ccpm/train.jsonl"
valid_file_path = "/work/u5110390/BenchWeaver/evaluation_data/ccpm/valid.jsonl"
# The records hold Chinese text; read them as UTF-8 regardless of the locale.
with open(train_file_path, "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f]

with open(valid_file_path, "r", encoding="utf-8") as f:
    valid_data = [json.loads(line) for line in f]

In [2]:
train_data[0]

{'translation': '诗人啊，你竟像在遥远的地方站立船头。',
 'choices': ['行人初上木兰舟', '骚人遥驻木兰舟', '有人独上木兰舟', '行人迢递木兰舟'],
 'answer': 1}

In [3]:
# Column-oriented buffers, one per split; every column starts as its own
# empty list.
train_dict = {column: [] for column in ("question", "A", "B", "C", "D", "answer")}
valid_dict = {column: [] for column in ("question", "A", "B", "C", "D", "answer")}

# Position of a choice -> option letter.
idx2alphabet = dict(enumerate("ABCD"))

In [None]:
import pandas as pd

# Both splits use the same prompt and column layout, so fill them with one
# loop body: first all training records, then all validation records.
for source_rows, target_columns in ((train_data, train_dict), (valid_data, valid_dict)):
    for _dict in source_rows:
        target_columns['question'].append("以下是将某句古诗文翻译而成的现代表述：{translation}\n该翻译所对应的古诗文是：".format(translation=_dict['translation']))
        # The four candidate lines go to columns A-D by position.
        for choice_idx, letter in enumerate("ABCD"):
            target_columns[letter].append(_dict['choices'][choice_idx])
        # The integer answer index is stored as its option letter.
        target_columns['answer'].append(idx2alphabet[_dict['answer']])

train_df = pd.DataFrame(train_dict)
valid_df = pd.DataFrame(valid_dict)

In [5]:
import os

# to_csv does not create missing parent directories.
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/ccpm/data/dev", exist_ok=True)
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/ccpm/data/test", exist_ok=True)
train_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/ccpm/data/dev/all_train.csv", index=False)
valid_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/ccpm/data/test/all_test.csv", index=False)

# cmath

In [None]:
import json
main_file_path = "/work/u5110390/BenchWeaver/evaluation_data/cmath/cmath_test.jsonl"
distractor_file_path = "/work/u5110390/BenchWeaver/evaluation_data/cmath/distractor.jsonl"
# Explicit UTF-8 so reading does not depend on the locale default.
with open(main_file_path, "r", encoding="utf-8") as f:
    main_data = [json.loads(line) for line in f]

with open(distractor_file_path, "r", encoding="utf-8") as f:
    distractor_data = [json.loads(line) for line in f]

In [None]:
# Column-oriented buffers for the two cmath exports. The spelling of
# `ditstractor_dict` is kept because the following cell refers to it.
ditstractor_dict = {column: [] for column in ("question", "answer", "distractor", "original")}
main_dict = {column: [] for column in ("question", "answer", "distractor", "original")}

In [None]:
import pandas as pd

# Main set: the question doubles as its own "original" and the distractor
# column is filled with 0.
for row in main_data:
    question_text = row['question']
    main_dict['question'].append(question_text)
    main_dict['answer'].append(row['golden'])
    main_dict['distractor'].append(0)
    main_dict['original'].append(question_text)

# Distractor set: output column -> key to read from each record.
distractor_fields = {
    'question': 'input',
    'answer': 'golden',
    'distractor': 'distractor',
    'original': 'original',
}
for row in distractor_data:
    for column, source_key in distractor_fields.items():
        ditstractor_dict[column].append(row[source_key])

main_df = pd.DataFrame(main_dict)
distractor_df = pd.DataFrame(ditstractor_dict)

In [None]:
import os

# to_csv does not create missing parent directories.
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/cmath/data/test", exist_ok=True)
# NOTE(review): this file lands in the test directory but is named
# "main_train.csv" while its sibling is "distractor_test.csv" - confirm the
# name is what the consumer expects before renaming.
main_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/cmath/data/test/main_train.csv", index=False)
distractor_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/cmath/data/test/distractor_test.csv", index=False)

# CIF-Bench

In [None]:
import os
import json
import re

folder = "/work/u5110390/BenchWeaver/evaluation_data/cif-bench/public"

def retrieve_task_name(file_name:str):
    """Derive a lower-case task name from a file name.

    Takes the text before the first "." and removes every "<digits>_" run and
    every "_public" occurrence from it, wherever they appear.
    """
    stem, _, _ = file_name.partition(".")
    return re.sub(r"\d+_|_public", "", stem).lower()

# Task name -> list of records; files that map to the same task are merged.
data_record = {}
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# merged tasks reproducible between runs.
for file_name in sorted(os.listdir(folder)):
    # Explicit UTF-8 so reading does not depend on the locale default.
    with open(os.path.join(folder, file_name), "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f]
    task_name = retrieve_task_name(file_name)
    if task_name not in data_record:
        data_record[task_name] = data
    else:
        # Report task names that are spread over more than one file.
        print(task_name)
        data_record[task_name].extend(data)

In [None]:
# Task name -> {"question": [...], "answer": [...]}, where each question is
# the instruction and input wrapped in the fixed section markers.
new_data_record = {}

for task_name, data in data_record.items():
    new_data_record[task_name] = {
        "question": [
            "[任务指南]\n" + data_dict['Instruction'] + "\n" + "[任务输入]\n" + data_dict['Input'] + "\n" + "[任务输出]："
            for data_dict in data
        ],
        "answer": [data_dict['Output'] for data_dict in data],
    }

In [None]:
import pandas as pd
import os

export_dir = "/work/u5110390/BenchWeaver/evaluation_data/cif-bench/data/test"
# to_csv does not create missing parent directories.
os.makedirs(export_dir, exist_ok=True)

for task_name, data in new_data_record.items():
    df = pd.DataFrame(data)
    df.to_csv(f"{export_dir}/{task_name}_test.csv", index=False)
    

In [None]:
import json
with open("/work/u5110390/BenchWeaver/evaluation_data/cif-bench/mapping.json", "r") as f:
    mapping = json.load(f)

# Distinct "category" values found in the mapping (order is not defined).
scoreList = list({entry['category'] for entry in mapping.values()})
scoreList

# C3

In [None]:
import json
import os
source_dir = "/work/u5110390/BenchWeaver/evaluation_data/c3/dataset"
file_name_list = os.listdir(source_dir)

# datasets[i] holds the parsed content of file_name_list[i]; later cells rely
# on the two lists staying aligned.
datasets = []
for file_name in file_name_list:
    # Explicit UTF-8 so reading does not depend on the locale default.
    with open(os.path.join(source_dir, file_name), "r", encoding="utf-8") as f:
        data = json.load(f)
    datasets.append(data)

In [None]:
import pandas as pd

# One column-oriented record per loaded file, in the same order as `datasets`.
dataset_record = []
for dataset in datasets:
    record = {column: [] for column in ("paragraph", "question", "choices", "answer", "idx")}
    for data_list in dataset:
        # Entry layout as used here: [passage lines, question list, id].
        record["paragraph"].append("\n".join(data_list[0]))
        # NOTE(review): only the first question of each entry is exported -
        # confirm that entries never carry more than one.
        first_question = data_list[1][0]
        record["question"].append(first_question['question'])
        choice_texts = [str(text) for text in first_question['choice']]
        # Choices are stored as the string form of a list of strings.
        record["choices"].append(str(choice_texts))
        record["answer"].append(first_question['answer'])
        record["idx"].append(data_list[2])
        # Print any entry whose answer is not one of its choices.
        if first_question['answer'] not in choice_texts:
            print(data_list)
    dataset_record.append(record)

dataframe_list = [pd.DataFrame(record) for record in dataset_record]


In [None]:
import os

for origin_name, df in zip(file_name_list, dataframe_list):
    # NOTE(review): any file name containing the letter "m" is treated as the
    # "mixed" task - verify this against the actual source file names.
    task = "mixed" if "m" in origin_name else "dialogue"
    if "test" in origin_name:
        split = "test"
    elif "train" in origin_name:
        # The training file is exported as the "dev" split.
        split = "dev"
    else:
        split = "val"
    # to_csv does not create missing parent directories.
    os.makedirs(f"/work/u5110390/BenchWeaver/evaluation_data/c3/data/{split}", exist_ok=True)
    df.to_csv(f"/work/u5110390/BenchWeaver/evaluation_data/c3/data/{split}/{task}_{split}.csv", index=False)

# Chinese SafetyQA

In [None]:
import json
import pandas as pd

file_path = "/work/u5110390/BenchWeaver/evaluation_data/chinese-safetyqa/chinese_safetyqa.jsonl"
# Explicit UTF-8 so reading does not depend on the locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]
df = pd.DataFrame(data)

In [None]:
import ast
from typing import Tuple
# Each 'options' cell is the text of a dict literal; keep only its values,
# in the dict's own order.
choices_list = [
    list(ast.literal_eval(raw_options).values())
    for raw_options in df['options']
]
df['choices'] = choices_list
df['answer'] = df['standard_answer']
def split_cate(cate: str) -> Tuple:
    """Return the first three "-"-separated parts of *cate* as a tuple.

    Parts after the third are ignored; fewer than three parts raises
    IndexError.
    """
    parts = cate.split("-")
    return tuple(parts[position] for position in range(3))

# Split every 'cate' value once and assign whole columns, instead of growing
# the frame one cell at a time through df.loc (slow, and tied to a 0..n-1
# index).
cate_parts = [split_cate(cate) for cate in df['cate']]
for position, column in enumerate(('main_cate', 'sub_cate', 'sub_sub_cate')):
    df[column] = [parts[position] for parts in cate_parts]
df = df[['question', 'answer', 'correct_answer', 'choices', 'main_cate', 'sub_cate', 'sub_sub_cate']]


In [None]:
# Inspection only: show the rows with this answer without rebinding `df`,
# so the later grouping/export cells still see the full table.
df[df['answer'] == "2013年"]

In [None]:
task_list = df['main_cate'].unique()
task_list

In [None]:
# English file-name stems for the exported main categories. The export cell
# pairs these with the grouped frames purely by position, so the order of
# this list matters.
en_task_list = [
    "theoretical_and_technical_knowledge",
    "ethical_and_moral_risks",
    "bias_and_discrimination_risks",
    "abuse_and_hate_speech_risks",
    "physical_and_mental_health_risks",
    "legal_and_regulatory_risks",
    "rumor_and_misinformation_risks"
]


In [None]:
dfs = df.groupby(['main_cate'])

In [None]:
import os
export_dir = "/work/u5110390/BenchWeaver/evaluation_data/chinese-safetyqa/data/test"
os.makedirs(export_dir, exist_ok=True)
# zip() stops at the shorter input, which would silently drop groups or names.
if len(dfs) != len(en_task_list):
    raise ValueError(
        f"Expected {len(en_task_list)} main categories but found {len(dfs)} groups"
    )
# NOTE(review): groups and names are matched by position only. pandas groupby
# yields groups in sorted key order by default - confirm en_task_list follows
# that same order, otherwise files get the wrong names.
for dataframe, task_name in zip(dfs,  en_task_list):
    # Iterating a groupby yields (key, frame) pairs; only the frame is written.
    dataframe[1].to_csv(f"{export_dir}/{task_name}_test.csv", index=False)

In [None]:
import pandas as pd
import ast
# Sanity check: re-read every exported CSV and print each row whose answer
# is not among its parsed choices.
export_dir = "/work/u5110390/BenchWeaver/evaluation_data/chinese-safetyqa/data/test"
for df_name in os.listdir(export_dir):
    df = pd.read_csv(os.path.join(export_dir, df_name))
    for i, raw_choices in enumerate(df['choices']):
        # The choices column was saved as the text of a Python list.
        choices = ast.literal_eval(raw_choices)
        answer = df['answer'][i]
        if answer in choices:
            continue
        print(df_name, i)
        print(df['question'][i])
        print(choices)
        print(answer)

# MT-Bench-TW

In [None]:
import json
file_path = "/work/u5110390/BenchWeaver/evaluation_data/mt-bench-tw/processed-data.jsonl"
# Explicit UTF-8 so reading does not depend on the locale default.
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Show which fields a record carries.
data[0].keys()

In [None]:

from tqdm.auto import tqdm
# Category -> column-oriented buffers for that category's CSV.
datasets = {}

for data_dict in tqdm(data):
    category_columns = datasets.setdefault(
        data_dict['category'],
        {"question_id": [], "question_turns": [], "answer_turns": []},
    )
    # Drop empty turns (the record itself is updated in place).
    data_dict['turns'] = [turn for turn in data_dict['turns'] if turn != ""]
    # Every remaining turn must have exactly one reference answer.
    assert len(data_dict['turns']) == len(data_dict['reference']), f"{data_dict['question_id']} {len(data_dict['turns'])} {len(data_dict['reference'])}"
    category_columns['question_id'].append(data_dict['question_id'])
    category_columns['question_turns'].append(data_dict['turns'])
    category_columns['answer_turns'].append(data_dict['reference'])


In [None]:
import os

import pandas as pd

os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/mt-bench-tw/data/test", exist_ok=True)

# One CSV per category, named after the category.
for task_name in datasets:
    data = datasets[task_name]
    df = pd.DataFrame(data)
    csv_path = f"/work/u5110390/BenchWeaver/evaluation_data/mt-bench-tw/data/test/{task_name}_test.csv"
    df.to_csv(csv_path, index=False)

# Display the last frame written.
df

In [None]:
list(datasets.keys())

# hellaswag

In [None]:
import json

train_path = "/work/u5110390/BenchWeaver/evaluation_data/hellaswag/hellaswag_train.jsonl"
# The "val" file is what gets used as the test set.
test_path = "/work/u5110390/BenchWeaver/evaluation_data/hellaswag/hellaswag_val.jsonl"

# Parse one JSON record per line.
with open(train_path, "r") as f:
    train_data = list(map(json.loads, f))

with open(test_path, "r") as f:
    test_data = list(map(json.loads, f))


In [None]:
# Column-oriented buffers with the same layout for both splits.
train_dict = {
    column: []
    for column in ("activity_label", "split_type", "question", "A", "B", "C", "D", "answer")
}
test_dict = {
    column: []
    for column in ("activity_label", "split_type", "question", "A", "B", "C", "D", "answer")
}

# Fill the training buffers first, then the test buffers, using one loop body.
for source_rows, target_columns in ((train_data, train_dict), (test_data, test_dict)):
    for data_dict in source_rows:
        target_columns['activity_label'].append(data_dict['activity_label'])
        target_columns['split_type'].append(data_dict['split_type'])
        # The context becomes the question text.
        target_columns['question'].append(data_dict['ctx'])
        # The first four endings go to columns A-D by position.
        for position, letter in enumerate("ABCD"):
            target_columns[letter].append(data_dict['endings'][position])
        # The integer label is turned into a letter: 0 -> "A", 1 -> "B", ...
        target_columns['answer'].append(chr(ord('A') + data_dict['label']))

In [None]:
import pandas as pd
import os
train_df = pd.DataFrame(train_dict)
test_df = pd.DataFrame(test_dict)

# to_csv does not create missing parent directories.
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/hellaswag/data/dev", exist_ok=True)
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/hellaswag/data/test", exist_ok=True)
train_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/hellaswag/data/dev/all_train.csv", index=False)
test_df.to_csv("/work/u5110390/BenchWeaver/evaluation_data/hellaswag/data/test/all_test.csv", index=False)

In [None]:
test_df

# IFEval

In [None]:
import json

data_path = "/work/u5110390/BenchWeaver/evaluation_data/ifeval/input_data.jsonl"

# Parse one JSON record per line.
with open(data_path, "r") as f:
    data = list(map(json.loads, f))

In [None]:
import pandas as pd
import os
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/ifeval/data/test", exist_ok=True)
# Only the "prompt" column is renamed; every other column is written as-is.
df = pd.DataFrame(data).rename(columns={"prompt": "question"})
df.to_parquet("/work/u5110390/BenchWeaver/evaluation_data/ifeval/data/test/all_test.parquet", index=False)

# flores_plus

In [None]:
# kor	Hang	kore1280	Korean
# cmn	Hant	taib1240	Mandarin Chinese (Taiwanese)
# cmn	Hans	beij1234	Mandarin Chinese (Standard Beijing)
# eng	Latn	stan1293	English 

In [None]:
from datasets import load_dataset
# 
# Load the four language configurations, in the order of the names on the left.
cmn_Hans, cmn_Hant, kor_Hang, eng_Latn = (
    load_dataset("openlanguagedata/flores_plus", language_config)
    for language_config in ("cmn_Hans", "cmn_Hant", "kor_Hang", "eng_Latn")
)

Using the latest cached version of the dataset since openlanguagedata/flores_plus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'cmn_Hans' at /home/u5110390/.cache/huggingface/datasets/openlanguagedata___flores_plus/cmn_Hans/0.0.0/938b11b24deff75792c7b904cb9802b8cd168564 (last modified on Mon Apr 21 16:41:41 2025).
Using the latest cached version of the dataset since openlanguagedata/flores_plus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'cmn_Hant' at /home/u5110390/.cache/huggingface/datasets/openlanguagedata___flores_plus/cmn_Hant/0.0.0/938b11b24deff75792c7b904cb9802b8cd168564 (last modified on Mon Apr 21 16:41:47 2025).
Using the latest cached version of the dataset since openlanguagedata/flores_plus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'kor_Hang' at /home/u5110390/.cache/huggingface/datasets/openlanguagedata___flores_plus/kor_Hang/0.0.0/938b11b24

In [None]:
import os
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/flores-plus/data/dev", exist_ok=True)
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/flores-plus/data/test", exist_ok=True)

# File-name prefix -> loaded dataset.
flores_by_name = {
    "cmn_Hans": cmn_Hans,
    "cmn_Hant": cmn_Hant,
    "kor_Hang": kor_Hang,
    "eng_Latn": eng_Latn,
}
for name, dataset in flores_by_name.items():
    dev_df = dataset['dev'].to_pandas()
    # The "devtest" split is exported as the test set.
    test_df = dataset['devtest'].to_pandas()
    dev_path = f"/work/u5110390/BenchWeaver/evaluation_data/flores-plus/data/dev/{name}_dev.csv"
    test_path = f"/work/u5110390/BenchWeaver/evaluation_data/flores-plus/data/test/{name}_test.csv"
    dev_df.to_csv(dev_path, index=False)
    test_df.to_csv(test_path, index=False)

# MBPP

In [None]:
# You are an expert Python programmer, and here is your task: {prompt} Your code should pass these tests:\n\n{tests}\n[BEGIN]\n{code}\n[DONE]

In [2]:
from datasets import load_dataset
dataset_full = load_dataset("mbpp")
dataset_full

DatasetDict({
    train: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 374
    })
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 90
    })
    prompt: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 10
    })
})

In [3]:
import os

# The 10-row "prompt" split is what gets exported as the dev ("train") file.
train_df = dataset_full['prompt'].to_pandas()
test_df = dataset_full['test'].to_pandas()
val_df = dataset_full['validation'].to_pandas()

# to_parquet does not create missing parent directories.
for split_dir in ("dev", "test", "val"):
    os.makedirs(os.path.join("/work/u5110390/BenchWeaver/evaluation_data/mbpp/data", split_dir), exist_ok=True)

train_df.to_parquet("/work/u5110390/BenchWeaver/evaluation_data/mbpp/data/dev/full_train.parquet", index=False)
test_df.to_parquet("/work/u5110390/BenchWeaver/evaluation_data/mbpp/data/test/full_test.parquet", index=False)
val_df.to_parquet("/work/u5110390/BenchWeaver/evaluation_data/mbpp/data/val/full_val.parquet", index=False)


# LogicKor

In [1]:
import json
def load_jsonl(file_path: str):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 regardless of the locale default, and lines
    that are empty or whitespace-only (for example a trailing blank line) are
    skipped instead of raising a JSON decode error.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

data = load_jsonl("/work/u5110390/BenchWeaver/evaluation_data/logickor/questions.jsonl")
data[0]

{'id': 1,
 'category': '추론(Reasoning)',
 'questions': ['각국의 법률에서는 정의라는 개념이 자주 등장하며, 법령의 형성과 해석에 있어 매우 중요한 부분을 차지한다. 하지만 정의란 명확히 규정할 수 없는 개념이기에 해석의 논란이 있을 수 있다. 그렇다면 사회구성원의 대다수가 납득할 수 있는 보편적 정의를 입증하는 방법은 무엇일지 생각해보아라.',
  '위 답변을 영어 문장 3개로 요약한 후. 해당 문장에 있는 단어 3개를 선택하여 단어의 의미를 설명해라.'],
 'references': [None, None]}

In [7]:
from tqdm.auto import tqdm
# Category -> column-oriented buffers for that category's CSV.
datasets = {}

for data_dict in tqdm(data):
    category_columns = datasets.setdefault(
        data_dict['category'],
        {"question_id": [], "question_turns": [], "answer_turns": []},
    )
    category_columns['question_id'].append(data_dict['id'])
    category_columns['question_turns'].append(data_dict['questions'])
    # A missing reference (None) is stored as an empty string.
    category_columns['answer_turns'].append(
        ["" if answer is None else answer for answer in data_dict['references']]
    )
    


  0%|          | 0/42 [00:00<?, ?it/s]

In [10]:
# Every category is its own display name and its own score group.
mapping = {cat: {"name": cat, "category": cat} for cat in datasets}

In [12]:
# ensure_ascii=False writes the Korean category names verbatim, so pin the
# file encoding instead of relying on the locale default.
with open("/work/u5110390/BenchWeaver/evaluation_data/logickor/mapping.json", "w", encoding="utf-8") as f:
    json.dump(mapping, f, ensure_ascii=False, indent=2)

In [None]:
import pandas as pd

import os

# to_csv does not create missing parent directories.
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/logickor/data/test", exist_ok=True)

# One CSV per category, named after the category.
for task_name, data in datasets.items():
    df = pd.DataFrame(data)
    df.to_csv(f"/work/u5110390/BenchWeaver/evaluation_data/logickor/data/test/{task_name}_test.csv", index=False)

# MedMCQA

In [1]:
from datasets import load_dataset

ds = load_dataset("openlifescienceai/medmcqa")

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [4]:
ds['train'][0]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'cop': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract'}

In [None]:
from typing import Literal
import pandas as pd
def format_df(ds, split: Literal["train", "validation", "test"] = "train"):
    """Convert one split of *ds* into a question / A-D / answer DataFrame.

    Args:
        ds: Mapping from split name to an iterable of records. Each record
            must provide 'question', 'opa', 'opb', 'opc', 'opd' and 'cop'.
        split: Name of the split to convert. Defaults to "train"; the previous
            default was the ``Literal[...]`` type object itself, which could
            never be a valid key.

    Returns:
        A DataFrame with columns question, A, B, C, D, answer, where answer is
        the option letter obtained from the integer 'cop' (0 -> "A", 1 -> "B",
        ...).
    """
    # NOTE(review): a 'cop' outside 0..3 (e.g. a placeholder for unlabeled
    # rows) would map to a character outside A-D - confirm for the split used.
    rows = [
        {
            "question": item['question'],
            "A": item['opa'],
            "B": item['opb'],
            "C": item['opc'],
            "D": item['opd'],
            "answer": chr(ord('A') + item['cop']),
        }
        for item in ds[split]
    ]
    # Explicit columns keep the layout stable even when the split is empty.
    return pd.DataFrame(rows, columns=["question", "A", "B", "C", "D", "answer"])

import os

# to_csv does not create missing parent directories.
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/medmcqa/data/test", exist_ok=True)
os.makedirs("/work/u5110390/BenchWeaver/evaluation_data/medmcqa/data/dev", exist_ok=True)

# The validation split is exported as the test file.
format_df(ds, "validation").to_csv("/work/u5110390/BenchWeaver/evaluation_data/medmcqa/data/test/all_test.csv", index=False)
# Fixed seed so re-running the cell reproduces the same 100 dev examples.
format_df(ds, "train").sample(100, random_state=42).to_csv("/work/u5110390/BenchWeaver/evaluation_data/medmcqa/data/dev/all_dev.csv", index=False)

# MedQA

In [18]:
from datasets import load_dataset

ds = load_dataset("openlifescienceai/medqa")

Downloading readme:   0%|          | 0.00/858 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.68M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/739k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/720k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1272 [00:00<?, ? examples/s]

In [35]:
ds['train'][0]['data']

{'Correct Answer': 'Nitrofurantoin',
 'Correct Option': 'D',
 'Options': {'A': 'Ampicillin',
  'B': 'Ceftriaxone',
  'C': 'Doxycycline',
  'D': 'Nitrofurantoin'},
 'Question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?'}

In [36]:
def format_df(ds, split: Literal["train", "dev", "test"] = "train"):
    """Convert one split of *ds* into a question / A-D / answer DataFrame.

    Args:
        ds: Mapping from split name to an iterable of records. Each record
            must have a 'data' dict providing 'Question', 'Options' (with the
            keys 'A'-'D') and 'Correct Option'.
        split: Name of the split to convert. Defaults to "train"; the previous
            default was the ``Literal[...]`` type object itself, which could
            never be a valid key.

    Returns:
        A DataFrame with columns question, A, B, C, D, answer. Only options
        A-D are exported, and answer is the 'Correct Option' value unchanged.
    """
    rows = []
    for item in ds[split]:
        payload = item['data']
        options = payload['Options']
        rows.append({
            "question": payload['Question'],
            "A": options['A'],
            "B": options['B'],
            "C": options['C'],
            "D": options['D'],
            "answer": payload['Correct Option'],
        })
    # Explicit columns keep the layout stable even when the split is empty.
    return pd.DataFrame(rows, columns=["question", "A", "B", "C", "D", "answer"])

import os

# to_csv does not create missing parent directories.
for split_dir in ("test", "val", "dev"):
    os.makedirs(os.path.join("/work/u5110390/BenchWeaver/evaluation_data/medqa/data", split_dir), exist_ok=True)

format_df(ds, "test").to_csv("/work/u5110390/BenchWeaver/evaluation_data/medqa/data/test/all_test.csv", index=False)
format_df(ds, "dev").to_csv("/work/u5110390/BenchWeaver/evaluation_data/medqa/data/val/all_val.csv", index=False)
# Fixed seed so re-running the cell reproduces the same 100 dev examples.
format_df(ds, "train").sample(100, random_state=42).to_csv("/work/u5110390/BenchWeaver/evaluation_data/medqa/data/dev/all_dev.csv", index=False)