Update course notebooks

huggingface · Nov 15, 2021 · 2dedfdf · 2dedfdf
1 parent 22b7382
commit 2dedfdf
Show file tree

Hide file tree

Showing 18 changed files with 703 additions and 27 deletions.
diff --git a/course/chapter5/section3.ipynb b/course/chapter5/section3.ipynb
@@ -691,6 +691,16 @@
     "drug_dataset_reloaded"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for split, dataset in drug_dataset_clean.items():\n",
+    "    dataset.to_json(f\"drug-reviews-{split}.jsonl\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/course/chapter5/section5.ipynb b/course/chapter5/section5.ipynb
@@ -20,7 +20,43 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install datasets transformers[sentencepiece]"
+    "!pip install datasets transformers[sentencepiece]\n",
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to set up git. Adapt your email and name in the following cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git config --global user.email \"you@example.com\"\n",
+    "!git config --global user.name \"Your Name\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
    ]
   },
   {

diff --git a/course/chapter5/section6.ipynb b/course/chapter5/section6.ipynb
@@ -180,6 +180,8 @@
     }
    ],
    "source": [
+    "from datasets import Dataset\n",
+    "\n",
     "comments_dataset = Dataset.from_pandas(comments_df)\n",
     "comments_dataset"
    ]
@@ -285,7 +287,7 @@
     "    )\n",
     "    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}\n",
     "    model_output = model(**encoded_input)\n",
-    "    return cls_pooling(model_output, encoded_input[\"attention_mask\"])"
+    "    return cls_pooling(model_output)"
    ]
   },
   {
@@ -305,7 +307,7 @@
     }
    ],
    "source": [
-    "embedding = get_embeddings(explode_dataset[\"text\"][0])\n",
+    "embedding = get_embeddings(comments_dataset[\"text\"][0])\n",
     "embedding.shape"
    ]
   },
@@ -315,11 +317,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "embeddings_dataset = explode_dataset.map(\n",
-    "    lambda x: {\"embeddings\": get_embeddings(x[\"text\"]).cpu().numpy()[0]}\n",
+    "embeddings_dataset = comments_dataset.map(\n",
+    "    lambda x: {\"embeddings\": get_embeddings(x[\"text\"]).detach().cpu().numpy()[0]}\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install faiss-gpu"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -368,6 +379,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "\n",
     "samples_df = pd.DataFrame.from_dict(samples)\n",
     "samples_df[\"scores\"] = scores\n",
     "samples_df.sort_values(\"scores\", ascending=False, inplace=True)"

diff --git a/course/chapter6/section2.ipynb b/course/chapter6/section2.ipynb
@@ -20,7 +20,43 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install datasets transformers[sentencepiece]"
+    "!pip install datasets transformers[sentencepiece]\n",
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to set up git. Adapt your email and name in the following cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git config --global user.email \"you@example.com\"\n",
+    "!git config --global user.name \"Your Name\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
    ]
   },
   {
@@ -275,6 +311,17 @@
     "tokenizer.save_pretrained(\"code-search-net-tokenizer\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/course/chapter6/section5.ipynb b/course/chapter6/section5.ipynb
@@ -266,7 +266,7 @@
     "vocab_size = 50\n",
     "\n",
     "while len(vocab) < vocab_size:\n",
-    "    pair_freqs, splits = compute_pair_freqs(splits)\n",
+    "    pair_freqs = compute_pair_freqs(splits)\n",
     "    best_pair = \"\"\n",
     "    max_freq = None\n",
     "    for pair, freq in pair_freqs.items():\n",

diff --git a/course/chapter7/section2_pt.ipynb b/course/chapter7/section2_pt.ipynb
@@ -20,7 +20,44 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install datasets transformers[sentencepiece]"
+    "!pip install datasets transformers[sentencepiece]\n",
+    "! pip install accelerate cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n",
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to set up git. Adapt your email and name in the following cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git config --global user.email \"you@example.com\"\n",
+    "!git config --global user.name \"Your Name\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
    ]
   },
   {
@@ -395,6 +432,15 @@
     "    print(tokenized_datasets[\"train\"][i][\"labels\"])"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install seqeval"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/course/chapter7/section2_tf.ipynb b/course/chapter7/section2_tf.ipynb
@@ -20,7 +20,43 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install datasets transformers[sentencepiece]"
+    "!pip install datasets transformers[sentencepiece]\n",
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to set up git. Adapt your email and name in the following cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git config --global user.email \"you@example.com\"\n",
+    "!git config --global user.name \"Your Name\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
    ]
   },
   {
@@ -496,6 +532,15 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install seqeval"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/course/chapter7/section3_pt.ipynb b/course/chapter7/section3_pt.ipynb
@@ -20,7 +20,44 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install datasets transformers[sentencepiece]"
+    "!pip install datasets transformers[sentencepiece]\n",
+    "! pip install accelerate cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n",
+    "!apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to set up git. Adapt your email and name in the following cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!git config --global user.email \"you@example.com\"\n",
+    "!git config --global user.name \"Your Name\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "notebook_login()"
    ]
   },
   {
@@ -673,6 +710,27 @@
     "    return {\"masked_\" + k: v.numpy() for k, v in masked_inputs.items()}"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "downsampled_dataset = downsampled_dataset.remove_columns([\"word_ids\"])\n",
+    "eval_dataset = downsampled_dataset[\"test\"].map(\n",
+    "    insert_random_mask,\n",
+    "    batched=True,\n",
+    "    remove_columns=downsampled_dataset[\"test\"].column_names,\n",
+    ")\n",
+    "eval_dataset = eval_dataset.rename_columns(\n",
+    "    {\n",
+    "        \"masked_input_ids\": \"input_ids\",\n",
+    "        \"masked_attention_mask\": \"attention_mask\",\n",
+    "        \"masked_labels\": \"labels\",\n",
+    "    }\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,