Update HumanEval and MBPP notebooks (#1895)

This updates HumanEval and MBPP instruction dataset generation notebooks to correct the output format and push their results to HuggingFace Hub, while also moving them to the new `data` directory. It also changes the name of the `grade-school-math-instructions` directory to `grade_school_math_instructions` for consistency with other directories in `data/datasets`.
LAION-AI · Mar 2, 2023 · 0b6865b · 0b6865b
1 parent ebfe3f9
commit 0b6865b
Show file tree

Hide file tree

Showing 6 changed files with 133 additions and 95 deletions.
diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py
@@ -4,7 +4,9 @@
 }
 
 INSTRUCTION_DATASETS = {
-    "grade-school-math-instructions": "qwedsacf/grade-school-math-instructions",
+    "humaneval_mbpp_codegen_qa": "OllieStanley/humaneval-mbpp-codegen-qa",
+    "humaneval_mbpp_testgen_qa": "OllieStanley/humaneval-mbpp-testgen-qa",
+    "grade_school_math_instructions": "qwedsacf/grade-school-math-instructions",
     "recipes": "dctanner/oa_recipes",
     "ubuntu_dialogue_qa": "sedthh/ubuntu_dialogue_qa",
     "cmu_wiki_qa": "sedthh/cmu_wiki_qa",

diff --git a/...on/codet-data/Augment_CodeT_codegen.ipynb → ...al_mbpp/HumanEval_and_MBPP_code_gen.ipynb b/...on/codet-data/Augment_CodeT_codegen.ipynb → ...al_mbpp/HumanEval_and_MBPP_code_gen.ipynb
@@ -1,19 +1,21 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## CodeT Code Generation Datasets\n",
+    "## HumanEval and MBPP Code Generation Datasets\n",
     "\n",
-    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/codet-data/Augment_CodeT_codegen.ipynb)"
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/data/datasets/codet_humaneval_mbpp/HumanEval_and_MBPP_code_gen.ipynb)"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook contains code to parse CodeT code generation prompt and solution data and modify to `(prompt, solution)` pairs outputted in a `.jsonl` file.\n",
+    "This notebook contains code to parse HumanEval and MBPP Python code generation prompt and solution data and convert it into `INSTRUCTION`/`RESPONSE`/`SOURCE` records written to `.jsonl` files.\n",
     "\n",
     "Requirements: `requests`"
    ]
@@ -26,8 +28,7 @@
    "source": [
     "import json\n",
     "from pathlib import Path\n",
-    "import requests\n",
-    "from typing import List, Tuple"
+    "import requests"
    ]
   },
   {
@@ -36,21 +37,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "DATA_FILES: List[str] = [\n",
-    "    \"HumanEval_for_code_generation.jsonl\",\n",
-    "    \"mbpp_sanitized_for_code_generation.jsonl\",\n",
-    "]\n",
-    "\n",
-    "OUT_FILES: List[str] = [\n",
-    "    \"HumanEval_codegen.jsonl\",\n",
-    "    \"mbpp_codegen.jsonl\",\n",
-    "]\n",
+    "DATA_SOURCES: list[str] = [\"HumanEval\", \"mbpp_sanitized\"]\n",
+    "DATA_FILES: list[str] = [f\"{source}_for_code_generation.jsonl\" for source in DATA_SOURCES]\n",
+    "OUT_FILES: list[str] = [f\"{source}_codegen_qa.jsonl\" for source in DATA_SOURCES]\n",
     "\n",
-    "Path(\"data/augmented\").mkdir(parents=True, exist_ok=True)\n",
+    "IN_DIR = Path(\"data\")\n",
+    "OUT_DIR = Path(\"data/qa\")\n",
+    "OUT_DIR.mkdir(parents=True, exist_ok=True)\n",
     "\n",
-    "FILE_PATHS: List[Path] = [Path(f\"data/{data_file}\") for data_file in DATA_FILES]\n",
-    "\n",
-    "OUT_PATHS: List[Path] = [Path(f\"data/augmented/{out_file}\") for out_file in OUT_FILES]"
+    "FILE_PATHS: list[Path] = [IN_DIR / data_file for data_file in DATA_FILES]\n",
+    "OUT_PATHS: list[Path] = [OUT_DIR / out_file for out_file in OUT_FILES]"
    ]
   },
   {
@@ -71,10 +67,11 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can find the docstring, use its contents as the instruction (prefixed with \"Write a function corresponding to the docstring:\") and then use the content prior to the docstring and the canonical solution as the response."
+    "We can find the docstring, use its contents as the instruction (prefixed with \"Write a Python function which follows this instruction:\"), and then use the content prior to the docstring together with the canonical solution as the response."
    ]
   },
   {
@@ -83,7 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_docstring_indices(prompt_lines: List[str]) -> Tuple[int, int]:\n",
+    "def get_docstring_indices(prompt_lines: list[str]) -> tuple[int, int]:\n",
     "    docstring_start, docstring_end = None, None\n",
     "\n",
     "    for i, line in enumerate(prompt_lines):\n",
@@ -99,12 +96,12 @@
     "    raise ValueError(f\"No complete docstring found!\\n{prompt_lines}\")\n",
     "\n",
     "\n",
-    "def get_before(prompt_lines: List[str], before: int) -> List[str]:\n",
+    "def get_before(prompt_lines: list[str], before: int) -> list[str]:\n",
     "    before_lines = prompt_lines[:before]\n",
     "    return before_lines\n",
     "\n",
     "\n",
-    "def get_between(prompt_lines: List[str], start: int, end: int) -> List[str]:\n",
+    "def get_between(prompt_lines: list[str], start: int, end: int) -> list[str]:\n",
     "    between_lines = prompt_lines[start:end]\n",
     "    return between_lines"
    ]
@@ -115,7 +112,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_request_and_solution(sample: dict) -> Tuple[List[str], List[str]]:\n",
+    "def get_request_and_solution(sample: dict) -> tuple[list[str], list[str]]:\n",
     "    prompt = sample[\"prompt\"]\n",
     "    prompt_lines = prompt.splitlines()\n",
     "\n",
@@ -124,8 +121,8 @@
     "    # Extract prompt\n",
     "    in_docstring = get_between(prompt_lines, docstring_start, docstring_end)\n",
     "    if '\"\"\"' in in_docstring[0] or \"'''\" in in_docstring[0]:\n",
-    "        in_docstring[0] = in_docstring[0].replace('\"\"\"', \"\").replace(\"...\", \"\").strip()\n",
-    "    request = \"Write a Python function corresponding to the docstring: \" + \" \".join([p.strip() for p in in_docstring])\n",
+    "        in_docstring[0] = in_docstring[0].replace('\"\"\"', \"\").replace(\"'''\", \"\").strip()\n",
+    "    request = \"Write a Python function which follows this instruction: \" + \" \".join([p.strip() for p in in_docstring])\n",
     "\n",
     "    # Extract solution\n",
     "    before_docstring = get_before(prompt_lines, docstring_start)\n",
@@ -144,14 +141,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def process_file(file_path: Path, out_path: Path):\n",
+    "def process_file(file_path: Path, out_path: Path, source: str):\n",
     "    lines = file_path.read_text().splitlines()\n",
     "    samples = list(map(json.loads, lines))\n",
     "\n",
     "    output = []\n",
     "    for sample in samples:\n",
     "        prompt, solution = get_request_and_solution(sample)\n",
-    "        output.append({\"prompt\": prompt, \"solution\": solution})\n",
+    "        output.append({\"INSTRUCTION\": prompt, \"RESPONSE\": solution, \"SOURCE\": source})\n",
     "\n",
     "    with open(out_path, \"w\") as f:\n",
     "        for sample in output:\n",
@@ -165,8 +162,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for file_path, out_path in zip(FILE_PATHS, OUT_PATHS):\n",
-    "    process_file(file_path, out_path)"
+    "for file_path, out_path, source in zip(FILE_PATHS, OUT_PATHS, DATA_SOURCES):\n",
+    "    process_file(file_path, out_path, source)"
    ]
   },
   {
@@ -187,7 +184,7 @@
      "output_type": "stream",
      "text": [
       "Prompt\n",
-      "Write a Python function corresponding to the docstring: Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n",
+      "Write a Python function which follows this instruction: Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n",
       "\n",
       "Solution\n",
       "from typing import List\n",
@@ -205,13 +202,13 @@
     }
    ],
    "source": [
-    "sample = json.loads(Path(\"data/augmented/HumanEval_codegen.jsonl\").read_text().splitlines()[0])\n",
+    "sample = json.loads(OUT_PATHS[0].read_text().splitlines()[0])\n",
     "\n",
     "print(\"Prompt\")\n",
-    "print(sample[\"prompt\"])\n",
+    "print(sample[\"INSTRUCTION\"])\n",
     "print()\n",
     "print(\"Solution\")\n",
-    "print(sample[\"solution\"])"
+    "print(sample[\"RESPONSE\"])"
    ]
   },
   {
@@ -232,7 +229,7 @@
      "output_type": "stream",
      "text": [
       "Prompt\n",
-      "Write a Python function corresponding to the docstring: ''' Write a function to find the shared elements from the given two lists.\n",
+      "Write a Python function which follows this instruction:  Write a function to find the shared elements from the given two lists.\n",
       "\n",
       "Solution\n",
       "def similar_elements(test_tup1, test_tup2):\n",
@@ -242,19 +239,39 @@
     }
    ],
    "source": [
-    "sample = json.loads(Path(\"data/augmented/mbpp_codegen.jsonl\").read_text().splitlines()[0])\n",
+    "sample = json.loads(OUT_PATHS[1].read_text().splitlines()[0])\n",
     "\n",
     "print(\"Prompt\")\n",
-    "print(sample[\"prompt\"])\n",
+    "print(sample[\"INSTRUCTION\"])\n",
     "print()\n",
     "print(\"Solution\")\n",
-    "print(sample[\"solution\"])"
+    "print(sample[\"RESPONSE\"])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Upload the generated dataset to the HuggingFace Hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "humaneval_mbpp_codegen_qa_ds = Dataset.from_json([str(p) for p in OUT_PATHS])\n",
+    "humaneval_mbpp_codegen_qa_ds.push_to_hub(\"OllieStanley/humaneval-mbpp-codegen-qa\")"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "oa-env",
    "language": "python",
    "name": "python3"
   },
@@ -268,12 +285,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
+   "version": "3.10.8"
   },
   "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+    "hash": "329db72e1bc5b877a83eb22fb38cb9ecd67b294bcd84477fe23b24f012cf9e60"
    }
   }
  },