Updated the links to the datasets and the notebooks (#3191)

The improvements of the new version include:
- Cleaned the summary dataset, removing repetitions and meaningless
  summaries.
- Removed the docstrings from the functions in all samples where the task
  is to generate a description or a docstring (previously 10% of the
  samples kept the docstring).
- Added some NLP magic to generate more cohesive sentences when fusing the
  code-generation templates with the summaries (see the sketch below).
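The "NLP magic" is verb inflection: the first word of each code-search-net summary is re-inflected so that it reads grammatically after the template it is glued to (a VBG template such as "Write a Python 3 script for" wants a gerund). A minimal sketch of the idea with lemminflect, mirroring the notebook's process_summary shown in the diff below; the example summary is invented:

```python
# Sketch of the inflection step (requires: pip install lemminflect).
from lemminflect import getAllInflections, getLemma


def inflect_first_word(summary: str, tag: str) -> str:
    """Re-inflect the leading verb of a summary to a Penn treebank tag
    (VBZ -> "checks", VBP -> "check", VBG -> "checking")."""
    words = summary.split()
    # Lemmatise the first word, then look up all of its verb inflections.
    lemma = getLemma(words[0].lower(), upos="VERB")[0]
    inflections = getAllInflections(lemma)
    if tag in inflections:
        words[0] = inflections[tag][0]
    else:
        words[0] = words[0].lower()  # fall back to plain lower-casing
    return " ".join(words)


# Invented example summary, not taken from the dataset:
print(inflect_first_word("Checks that a string is valid UTF-8", "VBG"))
# -> "checking that a string is valid UTF-8", which reads naturally after
#    the VBG template "Write a Python 3 script for".
```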
Nan-Do committed May 19, 2023
1 parent 1a14d9d commit 00263f2
Showing 4 changed files with 151 additions and 110 deletions.
2 changes: 1 addition & 1 deletion data/datasets/__init__.py
@@ -27,7 +27,7 @@
"oa_stackexchange": "donfu/oa-stackexchange",
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
"instructional_codesearchnet_python": "Nan-Do/instructional_codesearchnet_python",
"instructional_codesearchnet_python": "Nan-Do/instructional_code-search-net-python",
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
"reasoning_bg_oa": "0x22almostEvil/reasoning_bg_oa",
"reasoning_gsm_qna_oa": "0x22almostEvil/reasoning-gsm-qna-oa",
data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb
@@ -1,29 +1,16 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "code",
-      "source": [
-        "!pip install datasets tqdm"
-      ],
-      "execution_count": null,
       "metadata": {
-        "id": "zLxBMw9Lsr6I"
+        "id": "zLxBMw9Lsr6I",
+        "scrolled": true
       },
+      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "!pip install datasets tqdm lemminflect"
+      ]
     },
     {
       "cell_type": "code",
@@ -41,27 +28,36 @@
"from datasets import load_dataset\n",
"from tqdm.auto import tqdm\n",
"from random import random, randint\n",
"from lemminflect import getAllInflections, getLemma\n",
"\n",
"ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
" # VBZ\n",
" \"Can you write a program in {lang} where it\\n\",\n",
" \"How would you implement a function in {lang} that\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Can you create a {lang} program that\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBP\n",
" \"Implement a function in {lang} to\\n\",\n",
" \"Write a {lang} script for\\n\",\n",
" \"How would you code a program in {lang} to\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Write a {lang} script to\\n\",\n",
" \"Create a {lang} function to\\n\",\n",
" \"Write a {lang} program that can\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBG\n",
" \"Write a {lang} script for\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Implement a {lang} function for\\n\",\n",
"]\n",
"\n",
"ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
" # General answer\n",
" \"Explain what the following {lang} code does\\n\",\n",
" \"Can you tell what is the following {lang} function doing\\n\",\n",
" \"Here you have a function in {lang}, explain what it does\\n\",\n",
" \"Make a summary of the following {lang} code\\n\",\n",
" \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
" \"How would you explain what the following {lang} function does\\n\",\n",
" # Documentation\n",
" \"Can you generate the documentation for the following {lang} function\\n\",\n",
" \"Create a docstring for the following {lang} code\\n\",\n",
" \"Given the following {lang} function, write the documentation\\n\",\n",
@@ -88,9 +84,22 @@
" return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
"\n",
"\n",
"def process_summary(summary, tag):\n",
" words = summary.split()\n",
" lemma = getLemma(words[0].lower(), upos=\"VERB\")[0]\n",
" inflections = getAllInflections(lemma)\n",
"\n",
" if tag not in inflections:\n",
" words[0] = words[0].lower()\n",
" else:\n",
" words[0] = inflections[tag][0]\n",
"\n",
" return \" \".join(words)\n",
"\n",
"\n",
"lang = \"Python 3\"\n",
"data = defaultdict(list)\n",
"dataset = load_dataset(\"Nan-Do/codesearchnet-python\")\n",
"dataset = load_dataset(\"Nan-Do/code-search-net-python\")\n",
"\n",
"for data_point in tqdm(dataset[\"train\"]):\n",
" code = data_point[\"original_string\"]\n",
@@ -99,15 +108,23 @@
" # Generate code\n",
" if random() > 0.5:\n",
" idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
" if 0 <= idx <= 3:\n",
" tag = \"VBZ\"\n",
" elif 4 <= idx <= 8:\n",
" tag = \"VBP\"\n",
" else:\n",
" tag = \"VBG\"\n",
" summary = process_summary(summary, tag)\n",
" template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
" data[\"INSTRUCTION\"].append(template)\n",
" data[\"RESPONSE\"].append(code)\n",
" # Generate summary\n",
" else:\n",
" # We are generating the docstring or a summary so we better remove it from\n",
" # the function\n",
" if random() < 0.9:\n",
" code = remove_docstring(code)\n",
" # if random() < 0.9:\n",
" # code = remove_docstring(code)\n",
" code = remove_docstring(code)\n",
" idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
" data[\"INSTRUCTION\"].append(template)\n",
@@ -117,35 +134,66 @@
" data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
"\n",
"df = pd.DataFrame(data=data)\n",
"df.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
"df.to_parquet(\"instructional_dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_6jaUZRsy1-R"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"id": "_6jaUZRsy1-R"
},
"execution_count": null,
"outputs": []
]
},
     {
       "cell_type": "code",
-      "source": [
-        "from datasets import Dataset\n",
-        "\n",
-        "ds = Dataset.from_parquet(\"dataset.parquet\")\n",
-        "ds.push_to_hub(\"Nan-Do/open-assistant-codesearchnet-python\")"
-      ],
       "execution_count": null,
       "metadata": {
         "id": "DSHrvbF6tIyd"
       },
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "from datasets import Dataset\n",
+        "\n",
+        "ds = Dataset.from_parquet(\"instructional_dataset.parquet\")\n",
+        "ds.push_to_hub(\"Nan-Do/instructional_code-search-net-python\")"
+      ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.9"
+    }
-  ]
+  },
+  "nbformat": 4,
+  "nbformat_minor": 1
 }
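Only the final return of the notebook's remove_docstring helper is visible in the hunks above, and the summary branch now calls it unconditionally so the ground-truth docstring never leaks into the prompt. As a rough illustration of what such a helper does, here is an ast-based sketch, assuming each sample holds a single function definition; this is not the notebook's exact string-based implementation:

```python
# Hypothetical sketch: strip a leading docstring from a function's source.
import ast


def strip_docstring(code: str) -> str:
    """Return `code` with the function's docstring removed, if present."""
    tree = ast.parse(code)
    func = tree.body[0]  # assumes `code` holds a single function definition
    if (
        isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef))
        and func.body
        and isinstance(func.body[0], ast.Expr)
        and isinstance(func.body[0].value, ast.Constant)
        and isinstance(func.body[0].value.value, str)
    ):
        func.body = func.body[1:] or [ast.Pass()]  # keep the body non-empty
    return ast.unparse(tree)  # ast.unparse requires Python >= 3.9


code = 'def add(a, b):\n    """Add two numbers."""\n    return a + b\n'
print(strip_docstring(code))
# def add(a, b):
#     return a + b
```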
5 changes: 3 additions & 2 deletions data/datasets/instructional_codesearchnet_python/README.md
@@ -1,7 +1,7 @@
 This dataset is a template-generated instructional Python dataset created
 from an annotated version of the code-search-net dataset. The annotated
 version of the code-search-net dataset can be found
-[here](https://huggingface.co/datasets/Nan-Do/codesearchnet-python).
+[here](https://huggingface.co/datasets/Nan-Do/code-search-net-python).
 
 The dataset contains around 450,000 annotated Python functions. The dataset
 is split into two blocks, one in which the task is starting from the annotated
@@ -14,7 +14,8 @@ been used.

 **Note**: some summarisation tasks are very easy because the prompt already
 contains a docstring in the function which is then used as the ground-truth
-response. It may be useful to filter these in future.
+response. It may be useful to filter these in future. (All the docstrings
+have been removed now.)
 
 ### Summarize_codesearchnet_for_python.ipynb
 
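For a quick sanity check of the published dataset, a usage sketch: the split name "train" and the truncation are assumptions, while the dataset name and the INSTRUCTION/RESPONSE columns come from the notebook diff above.

```python
from datasets import load_dataset

# Instruction/response pairs produced by the notebook above.
ds = load_dataset("Nan-Do/instructional_code-search-net-python", split="train")
print(ds.column_names)  # expected: ['INSTRUCTION', 'RESPONSE']

sample = ds[0]
print(sample["INSTRUCTION"])       # e.g. "Write a Python 3 script for ..."
print(sample["RESPONSE"][:200])    # the paired code or docstring
```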
