Skip to content

Commit

Permalink
Update course notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
sgugger committed Nov 15, 2021
1 parent 22b7382 commit 2dedfdf
Show file tree
Hide file tree
Showing 18 changed files with 703 additions and 27 deletions.
10 changes: 10 additions & 0 deletions course/chapter5/section3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,16 @@
"drug_dataset_reloaded"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for split, dataset in drug_dataset_clean.items():\n",
" dataset.to_json(f\"drug-reviews-{split}.jsonl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
38 changes: 37 additions & 1 deletion course/chapter5/section5.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,43 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]"
"!pip install datasets transformers[sentencepiece]\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to setup git, adapt your email and name in the following cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
Expand Down
21 changes: 17 additions & 4 deletions course/chapter5/section6.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@
}
],
"source": [
"from datasets import Dataset\n",
"\n",
"comments_dataset = Dataset.from_pandas(comments_df)\n",
"comments_dataset"
]
Expand Down Expand Up @@ -285,7 +287,7 @@
" )\n",
" encoded_input = {k: v.to(device) for k, v in encoded_input.items()}\n",
" model_output = model(**encoded_input)\n",
" return cls_pooling(model_output, encoded_input[\"attention_mask\"])"
" return cls_pooling(model_output)"
]
},
{
Expand All @@ -305,7 +307,7 @@
}
],
"source": [
"embedding = get_embeddings(explode_dataset[\"text\"][0])\n",
"embedding = get_embeddings(comments_dataset[\"text\"][0])\n",
"embedding.shape"
]
},
Expand All @@ -315,11 +317,20 @@
"metadata": {},
"outputs": [],
"source": [
"embeddings_dataset = explode_dataset.map(\n",
" lambda x: {\"embeddings\": get_embeddings(x[\"text\"]).cpu().numpy()[0]}\n",
"embeddings_dataset = comments_dataset.map(\n",
" lambda x: {\"embeddings\": get_embeddings(x[\"text\"]).detach().cpu().numpy()[0]}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install faiss-gpu"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -368,6 +379,8 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"samples_df = pd.DataFrame.from_dict(samples)\n",
"samples_df[\"scores\"] = scores\n",
"samples_df.sort_values(\"scores\", ascending=False, inplace=True)"
Expand Down
49 changes: 48 additions & 1 deletion course/chapter6/section2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,43 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]"
"!pip install datasets transformers[sentencepiece]\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to setup git, adapt your email and name in the following cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
Expand Down Expand Up @@ -275,6 +311,17 @@
"tokenizer.save_pretrained(\"code-search-net-tokenizer\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
2 changes: 1 addition & 1 deletion course/chapter6/section5.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@
"vocab_size = 50\n",
"\n",
"while len(vocab) < vocab_size:\n",
" pair_freqs, splits = compute_pair_freqs(splits)\n",
" pair_freqs = compute_pair_freqs(splits)\n",
" best_pair = \"\"\n",
" max_freq = None\n",
" for pair, freq in pair_freqs.items():\n",
Expand Down
48 changes: 47 additions & 1 deletion course/chapter7/section2_pt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,44 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]"
"!pip install datasets transformers[sentencepiece]\n",
"! pip install accelerate cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to setup git, adapt your email and name in the following cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
Expand Down Expand Up @@ -395,6 +432,15 @@
" print(tokenized_datasets[\"train\"][i][\"labels\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install seqeval"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
47 changes: 46 additions & 1 deletion course/chapter7/section2_tf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,43 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]"
"!pip install datasets transformers[sentencepiece]\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to setup git, adapt your email and name in the following cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
Expand Down Expand Up @@ -496,6 +532,15 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install seqeval"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
60 changes: 59 additions & 1 deletion course/chapter7/section3_pt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,44 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install datasets transformers[sentencepiece]"
"!pip install datasets transformers[sentencepiece]\n",
"! pip install accelerate cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n",
"!apt install git-lfs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will need to setup git, adapt your email and name in the following cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git config --global user.email \"you@example.com\"\n",
"!git config --global user.name \"Your Name\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
Expand Down Expand Up @@ -673,6 +710,27 @@
" return {\"masked_\" + k: v.numpy() for k, v in masked_inputs.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"downsampled_dataset = downsampled_dataset.remove_columns([\"word_ids\"])\n",
"eval_dataset = downsampled_dataset[\"test\"].map(\n",
" insert_random_mask,\n",
" batched=True,\n",
" remove_columns=downsampled_dataset[\"test\"].column_names,\n",
")\n",
"eval_dataset = eval_dataset.rename_columns(\n",
" {\n",
" \"masked_input_ids\": \"input_ids\",\n",
" \"masked_attention_mask\": \"attention_mask\",\n",
" \"masked_labels\": \"labels\",\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading

0 comments on commit 2dedfdf

Please sign in to comment.