Skip to content

Commit

Permalink
Update HumanEval and MBPP notebooks (#1895)
Browse files Browse the repository at this point in the history
This updates HumanEval and MBPP instruction dataset generation notebooks
to correct the output format and push their results to HuggingFace Hub,
while also moving them to the new `data` directory.

It also changes the name of the `grade-school-math-instructions`
directory to `grade_school_math_instructions` for consistency with other
directories in `data/datasets`.
  • Loading branch information
olliestanley committed Mar 2, 2023
1 parent ebfe3f9 commit 0b6865b
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 95 deletions.
4 changes: 3 additions & 1 deletion data/datasets/__init__.py
Expand Up @@ -4,7 +4,9 @@
}

INSTRUCTION_DATASETS = {
"grade-school-math-instructions": "qwedsacf/grade-school-math-instructions",
"humaneval_mbpp_codegen_qa": "OllieStanley/humaneval-mbpp-codegen-qa",
"humaneval_mbpp_testgen_qa": "OllieStanley/humaneval-mbpp-testgen-qa",
"grade_school_math_instructions": "qwedsacf/grade-school-math-instructions",
"recipes": "dctanner/oa_recipes",
"ubuntu_dialogue_qa": "sedthh/ubuntu_dialogue_qa",
"cmu_wiki_qa": "sedthh/cmu_wiki_qa",
Expand Down
@@ -1,19 +1,21 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## CodeT Code Generation Datasets\n",
"## HumanEval and MBPP Code Generation Datasets\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/codet-data/Augment_CodeT_codegen.ipynb)"
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/data/datasets/codet_humaneval_mbpp/HumanEval_and_MBPP_code_gen.ipynb)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook contains code to parse CodeT code generation prompt and solution data and modify to `(prompt, solution)` pairs outputted in a `.jsonl` file.\n",
"This notebook contains code to parse HumanEval and MBPP Python code generation prompt and solution data and convert it into instruction/response records (`INSTRUCTION`, `RESPONSE`, `SOURCE`) written to `.jsonl` files.\n",
"\n",
"Requirements: `requests`, `datasets`"
]
Expand All @@ -26,8 +28,7 @@
"source": [
"import json\n",
"from pathlib import Path\n",
"import requests\n",
"from typing import List, Tuple"
"import requests"
]
},
{
Expand All @@ -36,21 +37,16 @@
"metadata": {},
"outputs": [],
"source": [
"DATA_FILES: List[str] = [\n",
" \"HumanEval_for_code_generation.jsonl\",\n",
" \"mbpp_sanitized_for_code_generation.jsonl\",\n",
"]\n",
"\n",
"OUT_FILES: List[str] = [\n",
" \"HumanEval_codegen.jsonl\",\n",
" \"mbpp_codegen.jsonl\",\n",
"]\n",
"DATA_SOURCES: list[str] = [\"HumanEval\", \"mbpp_sanitized\"]\n",
"DATA_FILES: list[str] = [f\"{source}_for_code_generation.jsonl\" for source in DATA_SOURCES]\n",
"OUT_FILES: list[str] = [f\"{source}_codegen_qa.jsonl\" for source in DATA_SOURCES]\n",
"\n",
"Path(\"data/augmented\").mkdir(parents=True, exist_ok=True)\n",
"IN_DIR = Path(\"data\")\n",
"OUT_DIR = Path(\"data/qa\")\n",
"OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"FILE_PATHS: List[Path] = [Path(f\"data/{data_file}\") for data_file in DATA_FILES]\n",
"\n",
"OUT_PATHS: List[Path] = [Path(f\"data/augmented/{out_file}\") for out_file in OUT_FILES]"
"FILE_PATHS: list[Path] = [IN_DIR / data_file for data_file in DATA_FILES]\n",
"OUT_PATHS: list[Path] = [OUT_DIR / out_file for out_file in OUT_FILES]"
]
},
{
Expand All @@ -71,10 +67,11 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We can find the docstring, use its contents as the instruction (prefixed with \"Write a function corresponding to the docstring:\") and then use the content prior to the docstring and the canonical solution as the response."
"We can find the docstring, use its contents as the instruction (prefixed with a prompt) and then use the content prior to the docstring and the canonical solution as the response."
]
},
{
Expand All @@ -83,7 +80,7 @@
"metadata": {},
"outputs": [],
"source": [
"def get_docstring_indices(prompt_lines: List[str]) -> Tuple[int, int]:\n",
"def get_docstring_indices(prompt_lines: list[str]) -> tuple[int, int]:\n",
" docstring_start, docstring_end = None, None\n",
"\n",
" for i, line in enumerate(prompt_lines):\n",
Expand All @@ -99,12 +96,12 @@
" raise ValueError(f\"No complete docstring found!\\n{prompt_lines}\")\n",
"\n",
"\n",
"def get_before(prompt_lines: List[str], before: int) -> List[str]:\n",
"def get_before(prompt_lines: list[str], before: int) -> list[str]:\n",
" before_lines = prompt_lines[:before]\n",
" return before_lines\n",
"\n",
"\n",
"def get_between(prompt_lines: List[str], start: int, end: int) -> List[str]:\n",
"def get_between(prompt_lines: list[str], start: int, end: int) -> list[str]:\n",
" between_lines = prompt_lines[start:end]\n",
" return between_lines"
]
Expand All @@ -115,7 +112,7 @@
"metadata": {},
"outputs": [],
"source": [
"def get_request_and_solution(sample: dict) -> Tuple[List[str], List[str]]:\n",
"def get_request_and_solution(sample: dict) -> tuple[list[str], list[str]]:\n",
" prompt = sample[\"prompt\"]\n",
" prompt_lines = prompt.splitlines()\n",
"\n",
Expand All @@ -124,8 +121,8 @@
" # Extract prompt\n",
" in_docstring = get_between(prompt_lines, docstring_start, docstring_end)\n",
" if '\"\"\"' in in_docstring[0] or \"'''\" in in_docstring[0]:\n",
" in_docstring[0] = in_docstring[0].replace('\"\"\"', \"\").replace(\"...\", \"\").strip()\n",
" request = \"Write a Python function corresponding to the docstring: \" + \" \".join([p.strip() for p in in_docstring])\n",
" in_docstring[0] = in_docstring[0].replace('\"\"\"', \"\").replace(\"'''\", \"\").strip()\n",
" request = \"Write a Python function which follows this instruction: \" + \" \".join([p.strip() for p in in_docstring])\n",
"\n",
" # Extract solution\n",
" before_docstring = get_before(prompt_lines, docstring_start)\n",
Expand All @@ -144,14 +141,14 @@
"metadata": {},
"outputs": [],
"source": [
"def process_file(file_path: Path, out_path: Path):\n",
"def process_file(file_path: Path, out_path: Path, source: str):\n",
" lines = file_path.read_text().splitlines()\n",
" samples = list(map(json.loads, lines))\n",
"\n",
" output = []\n",
" for sample in samples:\n",
" prompt, solution = get_request_and_solution(sample)\n",
" output.append({\"prompt\": prompt, \"solution\": solution})\n",
" output.append({\"INSTRUCTION\": prompt, \"RESPONSE\": solution, \"SOURCE\": source})\n",
"\n",
" with open(out_path, \"w\") as f:\n",
" for sample in output:\n",
Expand All @@ -165,8 +162,8 @@
"metadata": {},
"outputs": [],
"source": [
"for file_path, out_path in zip(FILE_PATHS, OUT_PATHS):\n",
" process_file(file_path, out_path)"
"for file_path, out_path, source in zip(FILE_PATHS, OUT_PATHS, DATA_SOURCES):\n",
" process_file(file_path, out_path, source)"
]
},
{
Expand All @@ -187,7 +184,7 @@
"output_type": "stream",
"text": [
"Prompt\n",
"Write a Python function corresponding to the docstring: Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n",
"Write a Python function which follows this instruction: Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n",
"\n",
"Solution\n",
"from typing import List\n",
Expand All @@ -205,13 +202,13 @@
}
],
"source": [
"sample = json.loads(Path(\"data/augmented/HumanEval_codegen.jsonl\").read_text().splitlines()[0])\n",
"sample = json.loads(OUT_PATHS[0].read_text().splitlines()[0])\n",
"\n",
"print(\"Prompt\")\n",
"print(sample[\"prompt\"])\n",
"print(sample[\"INSTRUCTION\"])\n",
"print()\n",
"print(\"Solution\")\n",
"print(sample[\"solution\"])"
"print(sample[\"RESPONSE\"])"
]
},
{
Expand All @@ -232,7 +229,7 @@
"output_type": "stream",
"text": [
"Prompt\n",
"Write a Python function corresponding to the docstring: ''' Write a function to find the shared elements from the given two lists.\n",
"Write a Python function which follows this instruction: Write a function to find the shared elements from the given two lists.\n",
"\n",
"Solution\n",
"def similar_elements(test_tup1, test_tup2):\n",
Expand All @@ -242,19 +239,39 @@
}
],
"source": [
"sample = json.loads(Path(\"data/augmented/mbpp_codegen.jsonl\").read_text().splitlines()[0])\n",
"sample = json.loads(OUT_PATHS[1].read_text().splitlines()[0])\n",
"\n",
"print(\"Prompt\")\n",
"print(sample[\"prompt\"])\n",
"print(sample[\"INSTRUCTION\"])\n",
"print()\n",
"print(\"Solution\")\n",
"print(sample[\"solution\"])"
"print(sample[\"RESPONSE\"])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Upload the generated `.jsonl` files to the HuggingFace Hub as a single dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset\n",
"\n",
"humaneval_mbpp_codegen_qa_ds = Dataset.from_json([str(p) for p in OUT_PATHS])\n",
"humaneval_mbpp_codegen_qa_ds.push_to_hub(\"OllieStanley/humaneval-mbpp-codegen-qa\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "oa-env",
"language": "python",
"name": "python3"
},
Expand All @@ -268,12 +285,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
"hash": "329db72e1bc5b877a83eb22fb38cb9ecd67b294bcd84477fe23b24f012cf9e60"
}
}
},
Expand Down

0 comments on commit 0b6865b

Please sign in to comment.