Commit
Co-authored-by: mishka <gartsocial@gmail.com>
Showing 2 changed files with 177 additions and 0 deletions.
@@ -0,0 +1,176 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tell-a-Joke instruction dataset, based on https://huggingface.co/datasets/SocialGrep/one-million-reddit-jokes and augmented with KeyBERT keyword extraction."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/data/datasets/tell_a_joke/tell_a_joke.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install keybert\n",
"!pip install datasets\n",
"!pip install pandas\n",
"!pip install huggingface_hub\n",
"!pip install swifter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"import pandas as pd\n",
"import swifter  # noqa: F401 -- imported for its side effect: registers the .swifter accessor on pandas objects\n",
"from datasets import Dataset, load_dataset\n",
"from huggingface_hub import notebook_login\n",
"from keybert import KeyBERT"
]
},
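{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check of the KeyBERT extraction used below. A minimal sketch: the sample joke and the throwaway `KeyBERT()` instance are illustrative only; `extract_keywords` returns a list of `(keyphrase, similarity)` tuples, highest-scoring first."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sample joke (not from the dataset); exact keyphrases depend on the embedding model.\n",
"sample_joke = \"Why don't scientists trust atoms? They make up everything.\"\n",
"KeyBERT().extract_keywords(sample_joke, keyphrase_ngram_range=(1, 2), stop_words=\"english\")"
]
},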
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SIZE_LIMIT = 20000  # maximum number of jokes to keep\n",
"MIN_JOKE_SCORE = 100  # minimum Reddit score for a joke to qualify\n",
"HF_DATASET = \"SocialGrep/one-million-reddit-jokes\"\n",
"pd.set_option(\"display.max_colwidth\", None)\n",
"\n",
"jokes_ds = load_dataset(HF_DATASET)\n",
"kw_model = KeyBERT()\n",
"pd_jokes = jokes_ds[\"train\"].to_pandas().reset_index(drop=True)\n",
"\n",
"# Keep only SFW, intact jokes above the score threshold. Work on a copy so the\n",
"# later assignments don't trigger pandas' SettingWithCopyWarning.\n",
"filtered_jokes = pd_jokes[pd_jokes[\"subreddit.nsfw\"] == False].copy()\n",
"filtered_jokes = filtered_jokes.dropna(subset=[\"selftext\"])\n",
"filtered_jokes = filtered_jokes[filtered_jokes[\"score\"] > MIN_JOKE_SCORE]\n",
"filtered_jokes = filtered_jokes[filtered_jokes[\"selftext\"] != \"[deleted]\"]\n",
"filtered_jokes = filtered_jokes[filtered_jokes[\"selftext\"] != \"[removed]\"]\n",
"filtered_jokes = filtered_jokes[~filtered_jokes[\"selftext\"].str.contains(\"edit:\", case=False)]\n",
"\n",
"# Keep the highest-scoring jokes, capped at SIZE_LIMIT.\n",
"filtered_jokes = filtered_jokes.sort_values(\"score\", ascending=False)\n",
"filtered_jokes = filtered_jokes[0:SIZE_LIMIT]\n",
"print(len(filtered_jokes))"
]
},
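{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at the highest-scoring jokes that survive filtering."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filtered_jokes[[\"score\", \"title\", \"selftext\", \"subreddit.nsfw\"]].head()"
]
},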
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Instruction templates; {} is filled with the joke's main keyphrase.\n",
"joke_requests = [\n",
"    \"Can you share a joke that involves {}?\",\n",
"    \"Do you know any jokes related to {}?\",\n",
"    \"Could you tell me a funny joke that has to do with {}?\",\n",
"    \"I'm in the mood for a joke about {}. Do you have any good ones?\",\n",
"    \"Would you happen to have a joke about {} that you could tell me?\",\n",
"    \"Can you think of a joke that centers around {}?\",\n",
"    \"I'd love to hear a witty joke related to {}. Do you have one?\",\n",
"    \"Tell me a humorous joke that involves {}.\",\n",
"    \"Could you please entertain me with a joke related to {}?\",\n",
"    \"What's a good joke that relates to {}?\",\n",
"    \"I could use a good laugh. How about a joke about {}?\",\n",
"    \"What's a funny joke that relates to {}?\",\n",
"    \"Can you make me chuckle with a joke that involves {}?\",\n",
"    \"I'm curious if you have a joke up your sleeve that pertains to {}?\",\n",
"    \"Do you have a favorite joke that involves {}?\",\n",
"    \"Mind sharing a joke with me that has to do with {}?\",\n",
"    \"How about a joke related to {}? Do you have one?\",\n",
"    \"I'm in need of a good joke. Something that centers around {} should do the trick.\",\n",
"    \"Would you be willing to share a joke about {} with me?\",\n",
"    \"Can you think of a joke that relates to {} that you could tell me?\",\n",
"]\n",
"\n",
"\n",
"def make_item(joke):\n",
"    \"\"\"Turn one Reddit joke row into an (INSTRUCTION, RESPONSE) pair.\"\"\"\n",
"    title = joke[\"title\"]\n",
"    body = joke[\"selftext\"]\n",
"    permalink = joke[\"permalink\"]\n",
"    joke_text = f\"{title}\\n{body}\"\n",
"    prefix = random.choice(joke_requests)\n",
"\n",
"    try:\n",
"        # Use the top-ranked KeyBERT keyphrase as the topic of the request.\n",
"        keywords = kw_model.extract_keywords(joke_text, keyphrase_ngram_range=(1, 2), stop_words=\"english\")\n",
"        main_keyword = keywords[0][0]\n",
"        instruction = prefix.format(main_keyword)\n",
"    except Exception as e:\n",
"        # Fall back to a generic request if keyword extraction fails.\n",
"        print(\"Error:\", e, joke_text, joke)\n",
"        instruction = \"Could you tell me a random joke?\"\n",
"\n",
"    return pd.Series(\n",
"        [instruction, joke_text, HF_DATASET, {\"nsfw\": False, \"link\": permalink}],\n",
"        index=[\"INSTRUCTION\", \"RESPONSE\", \"SOURCE\", \"METADATA\"],\n",
"    )"
]
},
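{
"cell_type": "markdown",
"metadata": {},
"source": [
"Spot-check `make_item` on a single row before applying it to the whole frame (the choice of row here is arbitrary)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"make_item(filtered_jokes.iloc[0])"
]
},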
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%time oa_format = filtered_jokes.swifter.apply(make_item, axis=1)\n",
"print(len(oa_format))"
]
},
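{
"cell_type": "markdown",
"metadata": {},
"source": [
"swifter parallelizes the row-wise apply when it estimates that will beat plain pandas. A peek at the first rows confirms the four expected columns (INSTRUCTION, RESPONSE, SOURCE, METADATA)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oa_format.head(2)"
]
},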
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"oa_format.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
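{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional round-trip check: re-read the Parquet file to confirm it was written as expected."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.read_parquet(\"dataset.parquet\").head(2)"
]
},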
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"notebook_login()  # authenticate with the Hugging Face Hub; required for push_to_hub\n",
"ds = Dataset.from_parquet(\"dataset.parquet\")\n",
"ds.push_to_hub(f\"mikegarts/oa_tell_a_joke_{SIZE_LIMIT}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}