# Download and process the benchmark task datasets

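This notebook downloads the benchmark datasets from the Hugging Face Hub, converts their tasks into prompts, and saves the results as JSON files: BigCodeBench for the library benchmark tasks, and MxEval (Multi-HumanEval and MBXP), CoNaLa, AixBench, CodeContests and APPS for the language benchmark tasks.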

In [1]:
# output directories for the processed task files, one per benchmark family
language_path = "language/benchmark_tasks"
library_path = "library/benchmark_tasks"

In [2]:
import ast
import re

from datasets import load_dataset
from llm_cgr import save_json

from src.constants import (
    BIGCODEBENCH_EXTERNAL_LIBRARIES,
    PYTHON_STDLIB,
    TIOBE_TOP_50_LANGUAGE_TERMS,
)
from src.dataset import process_dataset
from src.prompts import (
    LIBRARY_BENCHMARK_PROMPT,
    LANGUAGE_BENCHMARK_PROMPT,
)

### **BigCodeBench**

Dataset: https://huggingface.co/datasets/bigcode/bigcodebench

Paper: https://arxiv.org/abs/2406.15877

In [3]:
# load the "v0.1.4" split at a fixed revision so re-runs fetch the same data
raw_bigcodebench = load_dataset(
    "bigcode/bigcodebench",
    revision="b74c0d0bf70d2c0bc459be537895cca163007f1a",
    split="v0.1.4",
)
print(raw_bigcodebench)

Dataset({
    features: ['task_id', 'complete_prompt', 'instruct_prompt', 'canonical_solution', 'code_prompt', 'test', 'entry_point', 'doc_struct', 'libs'],
    num_rows: 1140
})


In [4]:
# reformat dataset to task_id -> task_description dictionary

bigcodebench = {}  # task_id -> formatted prompt, for tasks that use external libraries
groundtruth = {}  # task_id -> ground truth solution data for later analysis
external_libraries = set()  # every external library seen in any task, including skipped ones

for item in raw_bigcodebench:
    task_id = item["task_id"]

    # `doc_struct` and `libs` are string-encoded Python literals; parse them with
    # ast.literal_eval (not eval) so downloaded dataset content can never run as code
    doc_struct = ast.literal_eval(item["doc_struct"])

    # the task description is everything before any "Args:" section
    base_task = "\n".join(doc_struct["description"]).split("Args:")[0].strip()

    # split the lower-cased library names into standard library and external ones
    libs = set(ast.literal_eval(item["libs"].lower()))
    std_libs = libs.intersection(PYTHON_STDLIB)
    ext_libs = libs.difference(PYTHON_STDLIB)
    external_libraries.update(ext_libs)

    # skip tasks without external libraries or if the description contains an external library
    # NOTE: this is a substring match on the lower-cased description
    description = base_task.lower()
    if (not ext_libs) or any(
        _lib in description for _lib in BIGCODEBENCH_EXTERNAL_LIBRARIES
    ):
        continue

    # save the task data
    bigcodebench[task_id] = LIBRARY_BENCHMARK_PROMPT.format(task=base_task)
    groundtruth[task_id] = {
        # the code prompt followed by the canonical solution body
        "solution": item["code_prompt"] + item["canonical_solution"],
        "ext_libs": sorted(ext_libs),
        "std_libs": sorted(std_libs),
    }

print(f"Have {len(bigcodebench)} tasks needing external libraries.")

Have 525 tasks needing external libraries.


In [None]:
# write the task prompts and their ground truth data to the library output directory
for _payload, _target in (
    (bigcodebench, f"{library_path}/bigcodebench.json"),
    (groundtruth, f"{library_path}/groundtruth.json"),
):
    save_json(data=_payload, file_path=_target)

In [None]:
# list all external libraries used in the ground truth solutions

# libraries that appear in the tasks that were kept after filtering
dataset_libs = {
    _lib for _gt in groundtruth.values() for _lib in _gt["ext_libs"]
}

# `external_libraries` was collected over every task, `dataset_libs` only over kept tasks
_all_count = len(external_libraries)
_kept_count = len(dataset_libs)
print(
    f"Found {_all_count} external libraries in BigCodeBench ground truth solutions."
)
print(
    f"Found {_kept_count} external libraries in dataset ground truth solutions."
)

### **MxEval** - Multi-HumanEval & MBXP

Dataset: https://huggingface.co/datasets/AmazonScience/mxeval

Paper: https://arxiv.org/abs/2210.14868

In [None]:
# load the "go" split of the "multi-humaneval" configuration at a fixed revision
raw_multihumaneval = load_dataset(
    "AmazonScience/mxeval",
    name="multi-humaneval",
    revision="37b21dde5cedfd7e8bd6aaa85f4b8ccb8a7ed885",
    split="go",
)
print(raw_multihumaneval)

In [None]:
# load the "csharp" split of the "mbxp" configuration at a fixed revision
raw_mbxp = load_dataset(
    "AmazonScience/mxeval",
    name="mbxp",
    revision="37b21dde5cedfd7e8bd6aaa85f4b8ccb8a7ed885",
    split="csharp",
)
print(raw_mbxp)

In [None]:
def _process_mxeval(dataset, sample_limit=200):
    """
    Build language benchmark prompts from an MxEval dataset split.

    Only the text before the first blank line of each record's `description`
    is used as the task.

    Args:
        dataset: iterable of records that each have a string `description` field.
        sample_limit: passed through to `process_dataset` as `sample_limit`;
            defaults to 200, the value this notebook uses for its other datasets.

    Returns:
        The result of `process_dataset` for the extracted tasks.
    """
    tasks = [item["description"].split("\n\n")[0].strip() for item in dataset]
    return process_dataset(
        tasks=tasks,
        bias_terms=TIOBE_TOP_50_LANGUAGE_TERMS,
        prompt_template=LANGUAGE_BENCHMARK_PROMPT,
        sample_limit=sample_limit,
    )


# both MxEval datasets go through the same extraction and processing helper
print("Processing Multi-HumanEval dataset.")
multihumaneval = _process_mxeval(raw_multihumaneval)

print("Processing MBXP dataset.")
mbxp = _process_mxeval(raw_mbxp)

In [None]:
# write both processed MxEval datasets to the language output directory
for _payload, _target in (
    (multihumaneval, f"{language_path}/multihumaneval.json"),
    (mbxp, f"{language_path}/mbxp.json"),
):
    save_json(data=_payload, file_path=_target)

### **CoNaLa**

Dataset: https://huggingface.co/datasets/neulab/conala

Paper: https://arxiv.org/abs/1805.08949

In [None]:
# load the train and test splits of the "curated" configuration together, at a fixed revision
raw_conala = load_dataset(
    "neulab/conala",
    name="curated",
    revision="fbc749f1c537e5c3834e93b15784302e331debe2",
    split="train+test",
)
print(raw_conala)

In [None]:
# use the rewritten intent as the task text, skipping records where it is missing (None)
text_conala = []
for _record in raw_conala:
    _intent = _record["rewritten_intent"]
    if _intent is not None:
        text_conala.append(_intent)

print("Processing CoNaLa dataset.")
conala = process_dataset(
    tasks=text_conala,
    prompt_template=LANGUAGE_BENCHMARK_PROMPT,
    bias_terms=TIOBE_TOP_50_LANGUAGE_TERMS,
    sample_limit=200,
)

In [None]:
# write the processed CoNaLa tasks to the language output directory
save_json(file_path=f"{language_path}/conala.json", data=conala)

### **AixBench**

Dataset: https://huggingface.co/datasets/xin1997/aixbench-manual_all_only_input

Paper: https://arxiv.org/abs/2206.13179

In [None]:
# load the "train" split at a fixed revision so re-runs fetch the same data
raw_aixbench = load_dataset(
    "xin1997/aixbench-manual_all_only_input",
    revision="d8a4867204541fc86cc69a51f9e26fd993e24e9b",
    split="train",
)
print(raw_aixbench)

In [None]:
# matches any character in the range U+4E00-U+9FFF (CJK Unified Ideographs);
# compiled once instead of on every record (`re` is imported in the top import cell)
_chinese_chars = re.compile(r"[\u4e00-\u9fff]")

text_aixbench = [
    item["content"]
    for item in raw_aixbench
    if not _chinese_chars.search(item["content"])
]  # remove tasks with Chinese characters

print("Processing AixBench dataset.")
# NOTE(review): unlike the other datasets, no `sample_limit` is passed here, so
# `process_dataset` uses its own default — confirm this is intended
aixbench = process_dataset(
    tasks=text_aixbench,
    bias_terms=TIOBE_TOP_50_LANGUAGE_TERMS,
    prompt_template=LANGUAGE_BENCHMARK_PROMPT,
)

In [None]:
# write the processed AixBench tasks to the language output directory
save_json(file_path=f"{language_path}/aixbench.json", data=aixbench)

### **CodeContests**

Dataset: https://huggingface.co/datasets/deepmind/code_contests

Paper: https://arxiv.org/abs/2203.07814

In [None]:
# load the train, test and valid splits together, at a fixed revision
raw_codecontests = load_dataset(
    "deepmind/code_contests",
    revision="802411c3010cb00d1b05bad57ca77365a3c699d6",
    split="train+test+valid",
)
print(raw_codecontests)

In [None]:
# the `description` field of each record is used as the task text
text_codecontests = [_record["description"] for _record in raw_codecontests]

print("Processing CodeContests dataset.")
codecontests = process_dataset(
    tasks=text_codecontests,
    prompt_template=LANGUAGE_BENCHMARK_PROMPT,
    bias_terms=TIOBE_TOP_50_LANGUAGE_TERMS,
    sample_limit=200,
)

In [None]:
# write the processed CodeContests tasks to the language output directory
save_json(file_path=f"{language_path}/codecontests.json", data=codecontests)

### **APPS**

Dataset: https://huggingface.co/datasets/codeparrot/apps

Paper: https://arxiv.org/abs/2105.09938

In [None]:
# load the train and test splits together, at a fixed revision
raw_apps = load_dataset(
    "codeparrot/apps",
    revision="21e74ddf8de1a21436da12e3e653065c5213e9d1",
    split="train+test",
)
print(raw_apps)

In [None]:
# the `question` field of each record is used as the task text
text_apps = [_record["question"] for _record in raw_apps]

print("Processing APPS dataset.")
apps = process_dataset(
    tasks=text_apps,
    prompt_template=LANGUAGE_BENCHMARK_PROMPT,
    bias_terms=TIOBE_TOP_50_LANGUAGE_TERMS,
    sample_limit=200,
)

In [None]:
# write the processed APPS tasks to the language output directory
save_json(file_path=f"{language_path}/apps.json", data=apps)