diff --git a/examples/DVCLive-HuggingFace.ipynb b/examples/DVCLive-HuggingFace.ipynb index 2e5ed571..583bd9bd 100644 --- a/examples/DVCLive-HuggingFace.ipynb +++ b/examples/DVCLive-HuggingFace.ipynb @@ -1,167 +1,320 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install accelerate datasets dvclive evaluate 'transformers[torch]' --upgrade" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "3SJ8SY6ldmsS" + }, + "source": [ + "### How to do Experiment tracking with DVCLive\n", + "\n", + "What you will learn ?\n", + "\n", + "- Fine-tuning a model on a binary text classification task\n", + "- Track Machine Learning experiments with DVCLive\n", + "- Visualize results and create report\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nxiSBytidmsU" + }, + "source": [ + "#### Git & Install Dependencies\n", + "\n", + "- Install accelerate , Datasets , evaluate , transformers and dvclive\n", + "- Start a Git repo . Your experiments will be saved in a commit but hidden in\n", + " order to not clutter your repo.\n", + "- Initialize DVC\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CLRgy2W4dmsU" + }, + "outputs": [], + "source": [ + "!pip install accelerate datasets dvclive evaluate pandas 'transformers[torch]' --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fo0sq84UdmsV" + }, + "outputs": [], + "source": [ + "!git init -q\n", + "!git config --local user.email \"you@example.com\"\n", + "!git config --local user.name \"Your Name\"\n", + "!dvc init -q\n", + "!git commit -m \"DVC init\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T5WYJ31UdmsV" + }, + "source": [ + "### Fine-tuning a model on a text classification task\n", + "\n", + "#### Loading the dataset\n", + "\n", + "We will use [imdb](https://huggingface.co/datasets/imdb) Large Movie Review Dataset. This is a dataset for binary\n", + "sentiment classification containing a set of 25K movie reviews for traning, and\n", + "25K for testing.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "41fP0WCbdmsV" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer\n", + "\n", + "dataset = load_dataset(\"imdb\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V3gDKbbSdmsV" + }, + "source": [ + "#### Preprocessing the data\n", + "\n", + "We use Transformers Tokenizer which transforms the inputs and put them in a format\n", + "the model expects.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uVr5lufodmsV" + }, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-cased\")\n", + "\n", + "def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + "\n", + "small_train_dataset = dataset[\"train\"].shuffle(seed=42).select(range(2000)).map(tokenize_function, batched=True)\n", + "small_eval_dataset = dataset[\"test\"].shuffle(seed=42).select(range(200)).map(tokenize_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g9sELYMHdmsV" + }, + "source": [ + "#### Define evaluation metrics\n", + "\n", + "f1 is a metric for combining precision recall metrics in one unique value , so\n", + "we take this criteria for evaluating the models.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wmJoy5V-dmsW" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import evaluate\n", + "\n", + "metric = evaluate.load(\"f1\")\n", + "\n", + "def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NwFntrIKdmsW" + }, + "source": [ + "### Training and Tracking experiments with DVCLive\n", + "\n", + "Track experiments in DVC by changing a few lines of your Python code.\n", + "Enable experiment tracking using ´save_dvc_exp=True´ and ´log_model=True´\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "referenced_widgets": [ + "425795652e7047eab04fdb8816d85fe7", + "47f9d18c35a04f69b9bfe4e835a98d13", + "eae5029fcd6a49aab13a1aa11bb55a77", + "6b272c5eff1b49aab4f906cc0cea84bf", + "8f86d83b55b04afdb74ced76a4326f98", + "f5a4ea7cfa8a4cbbad83bf3a33db4172", + "cc96d129d3014c1a9fbb598a985c4c88", + "fb0f45a06d47475d83b7b9eec42d4e06", + "2c92c08cd14e46179fb1be7d5e36e2d1", + "1b760f9910934859ba90b92d94460855", + "30cb39221ea6463d9070fd9d0eefbfae", + "fc06215ebdfc4be0bcf8b2a3c227f4de", + "9db967ac8d4048d3bfedc3b2c21e15ba", + "0306ec09003e4271804a17a37a6e3db5", + "d2192a8e3e7b4df09d3cd7904b104519", + "ad9df876a82040f2853f63899c36e5fa", + "e40116d5bdbc43d0b0ffc27476c9eb6a", + "20c15ecc60794e07b70d96083514655b", + "674a922aef3b4d82b5b65241ef5d4de1", + "cec588e36e814237add88c6695d54952", + "1236f4a882fc47389c392fa88f85d6cc", + "3288c1eaa7b640f4bcf74fcdf7b12aaa", + "d7f7f24763b844128f44e7b787e848ff", + "d17ee4b49c184d63afa71fee9f15d4a1", + "92afad02b9064c1ba40396e5bc36931c", + "94d4335888e24e69964197f029f2389d", + "c6472537577a4900a3b190b64285b7a9", + "ab41d610477b47708634c7be45df9bc6", + "b802e2906072462abebd4b7b41fc1ab9", + "87576f1a78664458be6f7c4cbaa806a8", + "6f6a505d2bf84d588dc5a23dc34ef988", + "c9b665b4da694075b02ed09189dee813", + "30fad6f36d9f4f8eb6a1a3f70a93fe31" + ] + }, + "id": "gKKSTh0ZdmsW", + "outputId": "54733639-db04-4b82-f8ac-3d20f81712bb" + }, + "outputs": [], + "source": [ + "from dvclive.huggingface import DVCLiveCallback\n", + "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", + "\n", + "for epochs in (5, 10, 15):\n", + " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-cased\", num_labels=2)\n", + " for param in model.base_model.parameters():\n", + " param.requires_grad = False\n", + "\n", + " training_args = TrainingArguments(\n", + " evaluation_strategy=\"epoch\",\n", + " learning_rate=3e-4,\n", + " logging_strategy=\"epoch\",\n", + " num_train_epochs=epochs,\n", + " output_dir=\"output\",\n", + " overwrite_output_dir=True,\n", + " load_best_model_at_end=True,\n", + " report_to=\"none\",\n", + " save_strategy=\"epoch\",\n", + " weight_decay=0.01,\n", + " )\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " callbacks=[DVCLiveCallback(report=\"notebook\", save_dvc_exp=True, log_model=True)],\n", + " )\n", + " trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l29wqAaDdmsW" + }, + "source": [ + "### Comparing Experiments\n", + "\n", + "We create a dataframe with the experiments in order to visualize it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wwMwHvVtdmsW", + "outputId": "7db5ce6b-4f70-4a5f-fa3e-84218d2c1446" + }, + "outputs": [], + "source": [ + "import dvc.api\n", + "import pandas as pd\n", + "\n", + "columns = [\"Experiment\", \"epoch\", \"eval.f1\"]\n", + "\n", + "df = pd.DataFrame(dvc.api.exp_show(), columns=columns)\n", + "\n", + "df.dropna(inplace=True)\n", + "df.reset_index(drop=True, inplace=True)\n", + "df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TNBGUqoCdmsW", + "outputId": "2a4ebf29-e7e3-40f7-ec56-2ccc857c215f" + }, + "outputs": [], + "source": [ + "!dvc plots diff $(dvc exp list --names-only)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sL5pH4X5dmsW", + "outputId": "168ffdad-baff-4b79-8596-c54a5f4e4c86" + }, + "outputs": [], + "source": [ + "from IPython.display import HTML\n", + "HTML(filename='./dvc_plots/index.html')" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!git init -q\n", - "!git config --local user.email \"you@example.com\"\n", - "!git config --local user.name \"Your Name\"\n", - "!dvc init -q\n", - "!git commit -m \"DVC init\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "from transformers import AutoTokenizer\n", - "\n", - "dataset = load_dataset(\"imdb\")\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-cased\")\n", - "\n", - "def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", - "\n", - "small_train_dataset = dataset[\"train\"].shuffle(seed=42).select(range(2000)).map(tokenize_function, batched=True)\n", - "small_eval_dataset = dataset[\"test\"].shuffle(seed=42).select(range(200)).map(tokenize_function, batched=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import evaluate\n", - "\n", - "metric = evaluate.load(\"f1\")\n", - "\n", - "def compute_metrics(eval_pred):\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tracking experiments with DVCLive" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dvclive.huggingface import DVCLiveCallback\n", - "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", - "\n", - "for epochs in (5, 10, 15):\n", - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-cased\", num_labels=2)\n", - " for param in model.base_model.parameters():\n", - " param.requires_grad = False\n", - "\n", - " training_args = TrainingArguments(\n", - " evaluation_strategy=\"epoch\", \n", - " learning_rate=3e-4,\n", - " logging_strategy=\"epoch\",\n", - " num_train_epochs=epochs,\n", - " output_dir=\"output\", \n", - " overwrite_output_dir=True,\n", - " load_best_model_at_end=True,\n", - " report_to=\"none\",\n", - " save_strategy=\"epoch\",\n", - " weight_decay=0.01,\n", - " )\n", - "\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=small_train_dataset,\n", - " eval_dataset=small_eval_dataset,\n", - " compute_metrics=compute_metrics,\n", - " callbacks=[DVCLiveCallback(report=\"notebook\", save_dvc_exp=True, log_model=True)],\n", - " )\n", - " trainer.train()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Comparing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import dvc.api\n", - "import pandas as pd\n", - "\n", - "columns = [\"Experiment\", \"epoch\", \"eval.f1\"]\n", - "\n", - "df = pd.DataFrame(dvc.api.exp_show(), columns=columns)\n", - "\n", - "df.dropna(inplace=True)\n", - "df.reset_index(drop=True, inplace=True)\n", - "df\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!dvc plots diff $(dvc exp list --names-only)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import HTML\n", - "HTML(filename='./dvc_plots/index.html')" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 }