In [11]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample_dataset
from setfit.exporters.onnx import export_onnx
from optuna.visualization.matplotlib import plot_param_importances
from sklearn.metrics import f1_score

In [12]:

dataset = load_dataset("sst2")

train_dataset_teacher = sample_dataset(dataset["train"],label_column="label")
train_dataset_student = dataset["train"].shuffle(seed=0).select(range(500))
eval_dataset = dataset["validation"]


Found cached dataset sst2 (/Users/isidoracupara/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)
100%|██████████| 3/3 [00:00<00:00, 337.99it/s]
Loading cached shuffled indices for dataset at /Users/isidoracupara/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5/cache-2459d1a782cafb86.arrow
Loading cached shuffled indices for dataset at /Users/isidoracupara/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5/cache-7a06a0835566eb48.arrow


In [36]:
# model_id = ["sentence-transformers/paraphrase-mpnet-base-v2", "paraphrase-multilingual-MiniLM-L12-v2"]


def make_model_teacher(params=None):
    params = params or {}
    max_iter = params.get("max_iter", 100)
    solver = params.get("solver", "liblinear")
    params = {
        "head_params": {
            "max_iter": max_iter,
            "solver": solver,
        }
    }
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2"
    )

def make_model_student(params=None):
    params = params or {}
    max_iter = params.get("max_iter", 100)
    solver = params.get("solver", "liblinear")
    params = {
        "head_params": {
            "max_iter": max_iter,
            "solver": solver,
        }
    }
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

teacher_model_init = make_model_teacher()
student_model_init = make_model_student()

# make_model = list(map(make_model(), model_id))

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Downloading (…)lve/main/config.json: 100%|██████████| 645/645 [00:00<00:00, 427kB/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [14]:
def hyperparameter_search_function(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_epochs": trial.suggest_int("num_epochs", 1, 5),
        "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16, 32, 64]),
        "seed": trial.suggest_int("seed", 1, 40),
        "num_iterations": trial.suggest_categorical("num_iterations", [5, 10, 20]),
    }

In [15]:
def train_teacher_model():

    # Create trainer for teacher model
    teacher_trainer = SetFitTrainer(
        model_init= teacher_model_init,
        train_dataset=train_dataset_teacher,
        eval_dataset=eval_dataset,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_iterations=20, # The number of text pairs to generate for contrastive learning
        num_epochs=1, #A good rule of thumb is to start with a value that is 3 times the number of features in your data
        # Excerpt from the research paper: "...perform a hyperparameter search on the number of epochs in the range [25,75] and pick the best performing model on a validation split"
        column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
        )

    # Train and evaluate

    best_teacher = teacher_trainer.hyperparameter_search(hyperparameter_search_function, n_trials=15)
    teacher_trainer.apply_hyperparameters(best_teacher.hyperparameters, final_model=True)
    teacher_trainer.train()
    print(f"🐈‍⬛ Teacher traininer metrics: {best_teacher.objective}")
    # plot_param_importances(best_teacher.backend)

    return teacher_trainer

teacher_trainer = train_teacher_model()


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2023-07-24 11:31:46,118] A new study created in memory with name: no-name-5b4ff039-c06a-46c2-9173-b63513e87890
Trial: {'learning_rate': 3.1670840248189388e-06, 'num_epochs': 2, 'batch_size': 8, 'seed': 15, 'num_iterations': 20}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 4088.02it/s]
***** Running training *****
  Num examples = 640
  Num epochs = 2
  Total optimization steps = 160
  Total train batch size = 8
Iteration: 100%|██████████| 80/80 [01:12<00:00,  1.11it/s]
Iteration: 100%|██████████| 80/80 [01:11<00:00,  1.12it/s]
Epoch: 100%|██████████|

🐈‍⬛ Teacher traininer metrics: 0.8704128440366973


In [38]:
# def train_student_model(teacher_model_init):

#     teacher_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
#     student_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

#     # Create trainer for knowledge distillation
#     student_trainer = DistillationSetFitTrainer(
#         teacher_model=teacher_model,
#         train_dataset=train_dataset_student,
#         student_model=student_model,
#         eval_dataset=eval_dataset,
#         # model_init=student_model_init,
#         loss_class=CosineSimilarityLoss,
#         metric="accuracy",
#         batch_size=16,
#         num_iterations=20,
#         num_epochs=1,
#         column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
#     )

#     # Train and evaluate

#     best_student = student_trainer.hyperparameter_search(hyperparameter_search_function, n_trials=15)
#     student_trainer.apply_hyperparameters(best_student.hyperparameters, final_model=True)
#     student_trainer.train()
#     # print(f"🐈‍⬛ Student traininer hyperparameters best: {best_student}")
#     print(f"🐈‍⬛ Student traininer metrics: {best_student.objective}")
#     # plot_param_importances(best_student.backend)

#     return student_trainer

# student_trainer = train_student_model(teacher_model_init)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


RuntimeError: To use hyperparameter search, you need to pass your model through a model_init function.

In [41]:
def train_student_model():

    # Create trainer for knowledge distillation
    student_trainer = SetFitTrainer(
        train_dataset=train_dataset_student,
        eval_dataset=eval_dataset,
        model_init=make_model_student,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_iterations=20,
        num_epochs=1,
        column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
    )

    # Train and evaluate

    best_student = student_trainer.hyperparameter_search(hyperparameter_search_function, n_trials=15)
    student_trainer.apply_hyperparameters(best_student.hyperparameters, final_model=True)
    student_trainer.train()
    print(f"🐈‍⬛ Student traininer hyperparameters best: {best_student}")
    print(f"🐈‍⬛ Student traininer metrics: {best_student.objective}")
    # plot_param_importances(best_student.backend)

    return student_trainer

student_trainer = train_student_model()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
[I 2023-07-24 15:23:45,090] A new study created in memory with name: no-name-9b75bdaa-d0da-4a83-9822-dd41eb2ae010
Trial: {'learning_rate': 2.0525220061066955e-05, 'num_epochs': 5, 'batch_size': 32, 'seed': 32, 'num_iterations': 10}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 10/10 [00:00<00:00, 191.88it/s]
***** Running training *****
  Num examples = 10000
  Num epochs = 5
  Total optimization steps = 1565
  Total train batch size = 32
Iteration: 100%|██████████| 313/313 [09:36<00:00,  1.84s/it]
Iteration: 100%|██████████| 313/313 [09:43<00:00,  1.86s/it]
Iteration: 100%

In [None]:
def train_student_model(teacher_model, student_model):

    student_model = SetFitModel.from_pretrained("paraphrase-multilingual-MiniLM-L12-v2")

    # Create trainer for knowledge distillation
    student_trainer = DistillationSetFitTrainer(
        teacher_model=teacher_model,
        train_dataset=train_dataset_student,
        student_model=student_model,
        eval_dataset=eval_dataset,
        # model_init=make_model_student,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_iterations=20,
        num_epochs=1,
        column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
    )

    # Train and evaluate

    metrics = student_trainer.evaluate
    student_trainer.train()
    print(f"🐈‍⬛ Student traininer metrics: {metrics}")
    # plot_param_importances(best_student.backend)

    return student_trainer

model = train_student_model(teacher_trainer,student_trainer)

In [None]:
def export_model(model):

    output_path = f"model/setfit_model_distilled.onnx"
    export_onnx(model.model_body,
                model.model_head,
                opset=12,
                output_path=output_path)
    message = f"Distilled model exported to onnx format.\n"
    print("~" * len(message) + "\n" + message + "~" * len(message))

export_model(model)