From 56b2f876e47d31c8f5273bcaa9797bf0cd294382 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 24 Nov 2022 14:35:22 +0100
Subject: [PATCH 1/2] Add distillation trainer example

---
 README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 057523e5..81450b20 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ trainer.push_to_hub("my-awesome-setfit-model")
 # Download from Hub and run inference
 model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model")
 # Run inference
-preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
+preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
 ```
 
 Here is an end-to-end example using `SetFitHead`:
@@ -94,15 +94,14 @@
 from datasets import load_dataset
 from sentence_transformers.losses import CosineSimilarityLoss
 
-from setfit import SetFitModel, SetFitTrainer
+from setfit import SetFitModel, SetFitTrainer, sample_dataset
 
 # Load a dataset from the Hugging Face Hub
 dataset = load_dataset("sst2")
 
 # Simulate the few-shot regime by sampling 8 examples per class
-num_classes = 2
-train_dataset = dataset["train"].shuffle(seed=42).select(range(8 * num_classes))
+train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8)
 eval_dataset = dataset["validation"]
 
 # Load a SetFit model from Hub
@@ -150,7 +149,7 @@ trainer.push_to_hub("my-awesome-setfit-model")
 # Download from Hub and run inference
 model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model")
 # Run inference
-preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
+preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
 ```
 
 Based on our experiments, `SetFitHead` can achieve similar performance to a `scikit-learn` head. We use `AdamW` as the optimizer and scale down learning rates by 0.5 every 5 epochs. For more details about the experiments, please see [here](https://github.com/huggingface/setfit/pull/112#issuecomment-1295773537). We recommend using a large learning rate (e.g. `1e-2`) for `SetFitHead` and a small learning rate (e.g. `1e-5`) for the body in your first attempt.
@@ -246,7 +245,7 @@ def hp_space(trial):  # Training parameters
         "seed": trial.suggest_int("seed", 1, 40),
         "num_iterations": trial.suggest_categorical("num_iterations", [5, 10, 20]),
         "max_iter": trial.suggest_int("max_iter", 50, 300),
-        "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
+        "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]),
     }
 ```
 
@@ -279,6 +278,62 @@ trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
 trainer.train()
 ```
 
+## Compressing a SetFit model with knowledge distillation
+
+If you have access to unlabeled data, you can use knowledge distillation to compress a trained SetFit model into a smaller version. The result is a model that can run inference much faster, with little to no drop in accuracy. Here's an end-to-end example (see our paper for more details):
+
+```python
+from datasets import load_dataset
+from sentence_transformers.losses import CosineSimilarityLoss
+
+from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample_dataset
+
+# Load a dataset from the Hugging Face Hub
+dataset = load_dataset("ag_news")
+
+# Simulate the few-shot regime by sampling 16 examples per class
+train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=16)
+# Create a dataset of unlabeled examples
+train_dataset_student = dataset["train"].shuffle(seed=0).select(range(500))
+eval_dataset = dataset["test"]
+
+# Load teacher model
+teacher_model = SetFitModel.from_pretrained(
+    "sentence-transformers/paraphrase-mpnet-base-v2"
+)
+
+# Create trainer for teacher model
+teacher_trainer = SetFitTrainer(
+    model=teacher_model,
+    train_dataset=train_dataset_teacher,
+    eval_dataset=eval_dataset,
+    loss_class=CosineSimilarityLoss,
+)
+
+# Train teacher model
+teacher_trainer.train()
+
+# Load small student model
+student_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-MiniLM-L3-v2")
+
+# Create trainer for knowledge distillation
+student_trainer = DistillationSetFitTrainer(
+    teacher_model=teacher_model,
+    train_dataset=train_dataset_student,
+    student_model=student_model,
+    eval_dataset=eval_dataset,
+    loss_class=CosineSimilarityLoss,
+    metric="accuracy",
+    batch_size=16,
+    num_iterations=20,
+    num_epochs=1,
+)
+
+# Train student with knowledge distillation
+student_trainer.train()
+```
+
+
 ## Reproducing the results from the paper
 
 We provide scripts to reproduce the results for SetFit and various baselines presented in Table 2 of our paper. Check out the setup and training instructions in the `scripts/` directory.

From a3777d44caa163edf3fe37413849b51b0b5de19f Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Thu, 24 Nov 2022 15:06:49 +0100
Subject: [PATCH 2/2] Tweak

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 81450b20..2c436112 100644
--- a/README.md
+++ b/README.md
@@ -291,10 +291,11 @@ from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample
 # Load a dataset from the Hugging Face Hub
 dataset = load_dataset("ag_news")
 
-# Simulate the few-shot regime by sampling 16 examples per class
-train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=16)
-# Create a dataset of unlabeled examples
+# Create a sample few-shot dataset to train the teacher model
+train_dataset_teacher = sample_dataset(dataset["train"], label_column="label", num_samples=16)
+# Create a dataset of unlabeled examples to train the student
 train_dataset_student = dataset["train"].shuffle(seed=0).select(range(500))
+# Dataset for evaluation
 eval_dataset = dataset["test"]
 
 # Load teacher model
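The distillation example added by these two patches ends right after `student_trainer.train()`, so it never shows what the distilled student buys you. The sketch below is not part of either commit: it reuses the variables from the README example above and assumes that `DistillationSetFitTrainer` inherits `evaluate()` from `SetFitTrainer`, returning a dict keyed by the configured metric (here `"accuracy"`).

```python
# Sketch only (not part of the patch): compare the teacher and the distilled
# student, then run inference with the student. Assumes the variables from the
# README example above are in scope, and that `evaluate()` is inherited from
# SetFitTrainer and returns {"accuracy": ...} when metric="accuracy".
teacher_metrics = teacher_trainer.evaluate()
student_metrics = student_trainer.evaluate()
print(f"Teacher accuracy: {teacher_metrics['accuracy']:.3f}")
print(f"Student accuracy: {student_metrics['accuracy']:.3f}")

# The student is a regular SetFitModel, so inference works as in the other examples
preds = student_model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
```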
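Separately, the context around the third hunk of `[PATCH 1/2]` recommends a large learning rate (e.g. `1e-2`) for `SetFitHead` and a small one (e.g. `1e-5`) for the body, but the hunk itself never shows where those values go. Here is a minimal sketch, assuming a `trainer` built around a `SetFitModel` loaded with `use_differentiable_head=True`, and assuming the freeze/unfreeze workflow and the `learning_rate`/`body_learning_rate` keyword arguments that `SetFitTrainer.train()` exposed for differentiable heads in setfit at the time; treat the exact argument names as assumptions, not confirmed by the patch.

```python
# Sketch only: two-stage training with separate head/body learning rates,
# assuming the 2022-era SetFitTrainer API for differentiable heads.
trainer.freeze()  # freeze the head
trainer.train()   # stage 1: contrastively fine-tune the body only

trainer.unfreeze(keep_body_frozen=False)  # stage 2: train head and body together
trainer.train(
    num_epochs=25,
    learning_rate=1e-2,       # large learning rate for the SetFitHead
    body_learning_rate=1e-5,  # small learning rate for the body
)
```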