In [None]:
!pip install setfit[optuna]
!pip install datasets

# Imports

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import evaluate
import numpy as np
import random
import os

from sklearn.metrics import accuracy_score, f1_score, classification_report

from huggingface_hub import notebook_login, login
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

from optuna.visualization.matplotlib import plot_param_importances

In [None]:
# Log in to the Hugging Face Hub (login is imported from huggingface_hub above).
# Presumably needed for the optional push_to_hub step later on - confirm.
login()

# Functions

In [None]:
def encode_labels(record, num_classes=6):
    """One-hot encode the integer ``intent`` of a record into ``label``.

    Args:
        record: Mapping with an integer ``intent`` entry; it is updated in place.
        num_classes: Length of the one-hot vector. Defaults to 6, the value
            that was previously hard-coded here.

    Returns:
        The same ``record`` object, with ``record['label']`` set to a list of
        ``num_classes`` ints that is 1 at index ``intent`` and 0 elsewhere.

    Raises:
        IndexError: If ``intent`` is outside ``[0, num_classes)``. Negative
            values are rejected explicitly, because plain list indexing would
            otherwise wrap around and silently mark the wrong class.
    """
    intent = record['intent']
    if not 0 <= intent < num_classes:
        raise IndexError(
            f"intent {intent!r} is outside the valid range [0, {num_classes})"
        )
    onehot_vec = [0] * num_classes
    onehot_vec[intent] = 1
    record['label'] = onehot_vec
    return record

In [None]:
def evaluate_model(ds, model):
    """Print the accuracy and a classification report for ``model`` on ``ds``.

    The ``'text'`` column of ``ds`` is passed to ``model`` (called directly) to
    obtain predictions, which are compared against the ``'label'`` column.
    Nothing is returned; both results are printed.
    """
    predictions = model(ds['text'])
    references = ds['label']

    # Accuracy is rounded to two decimals before printing.
    accuracy = round(accuracy_score(references, predictions), 2)
    print(f'Accuracy: {accuracy}')

    report = classification_report(references, predictions)
    print(report)


In [None]:
def hyperparameter_search_function(trial):
    """Sample one point of the hyperparameter search space from ``trial``.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``
            (presumably an Optuna trial supplied by the trainer - confirm).

    Returns:
        dict with the sampled ``learning_rate``, ``batch_size``,
        ``multi_target_strategy`` and ``num_epochs``.
    """
    # Values are sampled in the same order as the keys of the returned dict.
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch = trial.suggest_categorical("batch_size", [8, 16])
    # Single-choice categorical: the strategy is effectively fixed.
    strategy = trial.suggest_categorical("multi_target_strategy", ["multi-output"])
    epochs = trial.suggest_categorical("num_epochs", [5, 10])

    return dict(
        learning_rate=lr,
        batch_size=batch,
        multi_target_strategy=strategy,
        num_epochs=epochs,
    )

In [None]:
def make_model(params=None):
    """Build a SetFit model from the notebook-level ``model_id``.

    Args:
        params: Optional mapping of hyperparameters. When truthy, its
            ``"multi_target_strategy"`` entry is used; otherwise the strategy
            falls back to ``"one-vs-rest"``.

    Returns:
        The result of ``SetFitModel.from_pretrained`` for ``model_id``.
    """
    if params:
        strategy = params["multi_target_strategy"]
    else:
        strategy = "one-vs-rest"
    return SetFitModel.from_pretrained(model_id, multi_target_strategy=strategy)

# Global Variables

In [None]:
# Checkpoint id passed to SetFitModel.from_pretrained inside make_model.
# Candidates noted by the author:
#   "base" model:   sentence-transformers/all-mpnet-base-v2
#   "robust" model: sentence-transformers/all-MiniLM-L6-v2
model_id = "sentence-transformers/all-mpnet-base-v2"

In [None]:
# When True, the trained model is pushed to the Hub under all_data_model_name
# (see the push_to_hub cell further down); False skips the upload.
update_hf_model = False

# Read Data

In [None]:
# Load the "ialvarenga/acl-arc-revised" dataset; its "train", "test" and "eval"
# splits are read from it in the next cell.
dataset = load_dataset("ialvarenga/acl-arc-revised")

In [None]:
# Pull the train, test and eval splits out of the loaded dataset
train_ds, test_ds, eval_ds = dataset["train"], dataset["test"], dataset["eval"]

In [None]:
# The target column holds a single int per record; encode_labels maps it to a
# one-hot vector so that it is in the right format for the neural network.
train_ds, test_ds, eval_ds = (
    split.map(encode_labels) for split in (train_ds, test_ds, eval_ds)
)

In [None]:
# TODO: make a function to print each dataset's distribution.
# For now: intent counts over train + test + eval combined.
(
    concatenate_datasets([train_ds, test_ds, eval_ds])
    .to_pandas()["intent"]
    .value_counts()
)

# Experiment with all the data

In [None]:
# Hub repository id used both when pushing the trained model and when loading
# it back for evaluation at the end of the notebook.
all_data_model_name = "ialvarenga/setfit-intent-clf-fine-tuned"

In [None]:
# Build the trainer. make_model is supplied as model_init (not a fixed model).
trainer = SetFitTrainer(
    # model and data
    model_init=make_model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # objective / metric: macro-averaged F1
    loss_class=CosineSimilarityLoss,
    metric="f1",
    metric_kwargs={"average": "macro"},
    # training settings
    num_iterations=20,
    use_amp=True,
)

In [None]:
# Run 10 search trials over the space defined by hyperparameter_search_function;
# the returned result object is displayed as the cell output.
best = trainer.hyperparameter_search(hyperparameter_search_function, n_trials=10)
best

In [None]:
# Best-effort visualisation: a plotting failure must not stop the notebook, but
# it is reported instead of being silently discarded.
try:
    plot_param_importances(best.backend)
except Exception as exc:  # not a bare except, so Ctrl-C still interrupts the cell
    print(f"Could not plot hyperparameter importances: {exc!r}")

In [None]:
# Configure the trainer with the hyperparameters of the best search run, then train.
trainer.apply_hyperparameters(best.hyperparameters, final_model=True) # replaces model_init with a fixed model
trainer.train()

In [None]:
# Evaluate the trained model and display the result. The trainer was configured
# with eval_ds as eval_dataset and macro-averaged f1 as its metric.
metrics = trainer.evaluate()
metrics

In [None]:
# Display the batch size currently set on the trainer
# (presumably the value applied from the best search run - confirm).
trainer.batch_size

In [None]:
# Publish the trained model under all_data_model_name only when explicitly
# enabled through the update_hf_model flag.
if update_hf_model:
    trainer.push_to_hub(all_data_model_name)

In [None]:
# NOTE(review): this loads the model stored under all_data_model_name rather than
# reusing the model trained above. When update_hf_model is False nothing is pushed
# in this run, so this is presumably a previously published checkpoint - confirm
# that evaluating it (and not the freshly trained model) is intended.
model = SetFitModel.from_pretrained(all_data_model_name)

In [None]:
# Print accuracy and the classification report for the loaded model on the test split.
evaluate_model(test_ds, model)