Updated the links to the datasets and the notebooks (#3191)

The improvements of the new version include:
- Cleaned the summary dataset, removing repetitions and meaningless
  summaries.
- Removed the docstrings from the functions in all samples where the task
  is to generate a description or a docstring (previously 10% of the
  samples kept the docstring).
- Added some NLP magic to generate more cohesive sentences when fusing the
  code-generation templates with the summaries (see the sketch below).
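The "NLP magic" is verb inflection: the first word of each code-search-net summary is re-inflected so that it reads grammatically after the template it is glued to (a VBG template such as "Write a Python 3 script for" wants a gerund). A minimal sketch of the idea with lemminflect, mirroring the notebook's process_summary shown in the diff below; the example summary is invented:

```python
# Sketch of the inflection step (requires: pip install lemminflect).
from lemminflect import getAllInflections, getLemma


def inflect_first_word(summary: str, tag: str) -> str:
    """Re-inflect the leading verb of a summary to a Penn treebank tag
    (VBZ -> "checks", VBP -> "check", VBG -> "checking")."""
    words = summary.split()
    # Lemmatise the first word, then look up all of its verb inflections.
    lemma = getLemma(words[0].lower(), upos="VERB")[0]
    inflections = getAllInflections(lemma)
    if tag in inflections:
        words[0] = inflections[tag][0]
    else:
        words[0] = words[0].lower()  # fall back to plain lower-casing
    return " ".join(words)


# Invented example summary, not taken from the dataset:
print(inflect_first_word("Checks that a string is valid UTF-8", "VBG"))
# -> "checking that a string is valid UTF-8", which reads naturally after
#    the VBG template "Write a Python 3 script for".
```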
Nan-Do committed May 19, 2023
1 parent 1a14d9d commit 00263f2
Showing 4 changed files with 151 additions and 110 deletions.
2 changes: 1 addition & 1 deletion data/datasets/__init__.py
@@ -27,7 +27,7 @@
"oa_stackexchange": "donfu/oa-stackexchange",
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
"instructional_codesearchnet_python": "Nan-Do/instructional_codesearchnet_python",
"instructional_codesearchnet_python": "Nan-Do/instructional_code-search-net-python",
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
"reasoning_bg_oa": "0x22almostEvil/reasoning_bg_oa",
"reasoning_gsm_qna_oa": "0x22almostEvil/reasoning-gsm-qna-oa",
data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb
@@ -1,29 +1,16 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "code",
-      "source": [
-        "!pip install datasets tqdm"
-      ],
-      "execution_count": null,
       "metadata": {
-        "id": "zLxBMw9Lsr6I"
+        "id": "zLxBMw9Lsr6I",
+        "scrolled": true
       },
+      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "!pip install datasets tqdm lemminflect"
+      ]
     },
     {
       "cell_type": "code",
@@ -41,27 +28,36 @@
"from datasets import load_dataset\n",
"from tqdm.auto import tqdm\n",
"from random import random, randint\n",
"from lemminflect import getAllInflections, getLemma\n",
"\n",
"ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
" # VBZ\n",
" \"Can you write a program in {lang} where it\\n\",\n",
" \"How would you implement a function in {lang} that\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Can you create a {lang} program that\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBP\n",
" \"Implement a function in {lang} to\\n\",\n",
" \"Write a {lang} script for\\n\",\n",
" \"How would you code a program in {lang} to\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Write a {lang} script to\\n\",\n",
" \"Create a {lang} function to\\n\",\n",
" \"Write a {lang} program that can\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
" # VBG\n",
" \"Write a {lang} script for\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Implement a {lang} function for\\n\",\n",
"]\n",
"\n",
"ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
" # General answer\n",
" \"Explain what the following {lang} code does\\n\",\n",
" \"Can you tell what is the following {lang} function doing\\n\",\n",
" \"Here you have a function in {lang}, explain what it does\\n\",\n",
" \"Make a summary of the following {lang} code\\n\",\n",
" \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
" \"How would you explain what the following {lang} function does\\n\",\n",
" # Documentation\n",
" \"Can you generate the documentation for the following {lang} function\\n\",\n",
" \"Create a docstring for the following {lang} code\\n\",\n",
" \"Given the following {lang} function, write the documentation\\n\",\n",
@@ -88,9 +84,22 @@
" return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
"\n",
"\n",
"def process_summary(summary, tag):\n",
" words = summary.split()\n",
" lemma = getLemma(words[0].lower(), upos=\"VERB\")[0]\n",
" inflections = getAllInflections(lemma)\n",
"\n",
" if tag not in inflections:\n",
" words[0] = words[0].lower()\n",
" else:\n",
" words[0] = inflections[tag][0]\n",
"\n",
" return \" \".join(words)\n",
"\n",
"\n",
"lang = \"Python 3\"\n",
"data = defaultdict(list)\n",
"dataset = load_dataset(\"Nan-Do/codesearchnet-python\")\n",
"dataset = load_dataset(\"Nan-Do/code-search-net-python\")\n",
"\n",
"for data_point in tqdm(dataset[\"train\"]):\n",
" code = data_point[\"original_string\"]\n",
@@ -99,15 +108,23 @@
" # Generate code\n",
" if random() > 0.5:\n",
" idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
" if 0 <= idx <= 3:\n",
" tag = \"VBZ\"\n",
" elif 4 <= idx <= 8:\n",
" tag = \"VBP\"\n",
" else:\n",
" tag = \"VBG\"\n",
" summary = process_summary(summary, tag)\n",
" template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
" data[\"INSTRUCTION\"].append(template)\n",
" data[\"RESPONSE\"].append(code)\n",
" # Generate summary\n",
" else:\n",
" # We are generating the docstring or a summary so we better remove it from\n",
" # the function\n",
" if random() < 0.9:\n",
" code = remove_docstring(code)\n",
" # if random() < 0.9:\n",
" # code = remove_docstring(code)\n",
" code = remove_docstring(code)\n",
" idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
" data[\"INSTRUCTION\"].append(template)\n",
@@ -117,35 +134,66 @@
" data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
"\n",
"df = pd.DataFrame(data=data)\n",
"df.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
"df.to_parquet(\"instructional_dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_6jaUZRsy1-R"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"id": "_6jaUZRsy1-R"
},
"execution_count": null,
"outputs": []
]
},
     {
       "cell_type": "code",
-      "source": [
-        "from datasets import Dataset\n",
-        "\n",
-        "ds = Dataset.from_parquet(\"dataset.parquet\")\n",
-        "ds.push_to_hub(\"Nan-Do/open-assistant-codesearchnet-python\")"
-      ],
       "execution_count": null,
       "metadata": {
         "id": "DSHrvbF6tIyd"
       },
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "from datasets import Dataset\n",
+        "\n",
+        "ds = Dataset.from_parquet(\"instructional_dataset.parquet\")\n",
+        "ds.push_to_hub(\"Nan-Do/instructional_code-search-net-python\")"
+      ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
-      "outputs": []
+      "metadata": {},
+      "outputs": [],
+      "source": []
     }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.9"
+    }
-  ]
+  },
+  "nbformat": 4,
+  "nbformat_minor": 1
 }
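Only the final return of the notebook's remove_docstring helper is visible in the hunks above, and the summary branch now calls it unconditionally so the ground-truth docstring never leaks into the prompt. As a rough illustration of what such a helper does, here is an ast-based sketch, assuming each sample holds a single function definition; this is not the notebook's exact string-based implementation:

```python
# Hypothetical sketch: strip a leading docstring from a function's source.
import ast


def strip_docstring(code: str) -> str:
    """Return `code` with the function's docstring removed, if present."""
    tree = ast.parse(code)
    func = tree.body[0]  # assumes `code` holds a single function definition
    if (
        isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef))
        and func.body
        and isinstance(func.body[0], ast.Expr)
        and isinstance(func.body[0].value, ast.Constant)
        and isinstance(func.body[0].value.value, str)
    ):
        func.body = func.body[1:] or [ast.Pass()]  # keep the body non-empty
    return ast.unparse(tree)  # ast.unparse requires Python >= 3.9


code = 'def add(a, b):\n    """Add two numbers."""\n    return a + b\n'
print(strip_docstring(code))
# def add(a, b):
#     return a + b
```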
5 changes: 3 additions & 2 deletions data/datasets/instructional_codesearchnet_python/README.md
@@ -1,7 +1,7 @@
 This dataset is a template-generated instructional Python dataset created
 from an annotated version of the code-search-net dataset. The annotated
 version of the code-search-net dataset can be found
-[here](https://huggingface.co/datasets/Nan-Do/codesearchnet-python).
+[here](https://huggingface.co/datasets/Nan-Do/code-search-net-python).
 
 The dataset contains around 450,000 annotated Python functions. The dataset
 is split into two blocks, one in which the task is starting from the annotated
@@ -14,7 +14,8 @@ been used.

 **Note**: some summarisation tasks are very easy because the prompt already
 contains a docstring in the function which is then used as the ground-truth
-response. It may be useful to filter these in future.
+response. It may be useful to filter these in future. (All the docstrings
+have been removed now.)
 
 ### Summarize_codesearchnet_for_python.ipynb
 
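For a quick sanity check of the published dataset, a usage sketch: the split name "train" and the truncation are assumptions, while the dataset name and the INSTRUCTION/RESPONSE columns come from the notebook diff above.

```python
from datasets import load_dataset

# Instruction/response pairs produced by the notebook above.
ds = load_dataset("Nan-Do/instructional_code-search-net-python", split="train")
print(ds.column_names)  # expected: ['INSTRUCTION', 'RESPONSE']

sample = ds[0]
print(sample["INSTRUCTION"])       # e.g. "Write a Python 3 script for ..."
print(sample["RESPONSE"][:200])    # the paired code or docstring
```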
