In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import torch
import sk2torch

from sklearn.metrics import roc_auc_score

from datasets import Dataset, load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.SentenceTransformer import SentenceTransformer
from setfit import SetFitModel, SetFitTrainer
from tqdm.auto import tqdm

from setfit_ig.html_text_colorizer import WordImportanceColorsSetFit
from setfit_ig.integrated_gradients import integrated_gradients_on_text
from setfit_ig.model_head import SklearnToPyTorchLogisticRegression

from setfit_ig.setfit_extensions import SetFitGrad, SetFitModelWithTorchHead

from IPython.display import HTML

from sklearn.model_selection import train_test_split

In [36]:
# Few-shot setup: draw a small training set and a held-out evaluation set from
# the SST-2 training split, then configure a SetFit trainer on top of a
# pretrained sentence-transformer body.
SEED = 42  # fixes the random split so reruns draw the same examples

data = load_dataset("sst2", split="train")
data = data.train_test_split(
    train_size=20,
    test_size=300,
    seed=SEED,
)

train = data["train"]
test = data["test"]

# Single source of truth for the checkpoint name; reused below instead of
# repeating the string literal.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

model = SetFitModel.from_pretrained(model_name)
trainer = SetFitTrainer(
    model=model,
    train_dataset=train,
    eval_dataset=test,
    loss_class=CosineSimilarityLoss,
    batch_size=15,
    num_epochs=1,
    num_iterations=20,
    # The dataset's text column is called "sentence"; map it to the column
    # names passed to the trainer ("text" / "label").
    column_mapping={"sentence": "text", "label": "label"},
)

Using custom data configuration default
Reusing dataset sst2 (/Users/kostis/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [37]:
# Run training on the few-shot split using the SetFitTrainer configured above.
trainer.train()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 800
  Num epochs = 1
  Total optimization steps = 54
  Total train batch size = 15


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/54 [00:00<?, ?it/s]

In [38]:
# Convert the trained head into its PyTorch counterpart first, then pair it
# with the unchanged sentence-transformer body in the torch-head wrapper.
torch_head = SklearnToPyTorchLogisticRegression(model.model_head)
model_st = SetFitModelWithTorchHead(model_body=model.model_body, model_head=torch_head)

In [40]:
# Score the held-out sentences with both the torch-head wrapper and the
# original SetFit model so the two can be compared.
test_sentences = test["sentence"]  # fetch the column once, reuse for both models

# .cpu() keeps the NumPy conversion working when the model ran on a GPU;
# it is a no-op for tensors that already live on the CPU.
scores_st = model_st.predict_proba(test_sentences).detach().cpu().numpy()

# Column 1 is used as the score for label 1 (presumably P(label == 1) —
# confirm against the SetFit head's class ordering).
scores = model.predict_proba(test_sentences)[:, 1]

In [41]:
# Sanity check: print the ROC AUC of the torch-head wrapper first, then of the
# original SetFit model; the two numbers are expected to agree.
test_labels = test["label"]
for candidate_scores in (scores_st, scores):
    print(roc_auc_score(test_labels, candidate_scores))

0.7312435069334657
0.7312435069334657


In [47]:
# Wrap the torch-head model in SetFitGrad and hand that wrapper to the
# word-importance colouriser; `m` is what the cells below call to render a
# sentence as coloured HTML.
grd = SetFitGrad(model_st)
m = WordImportanceColorsSetFit(grd)

# Colour legend for the rendered text: blue for class 0 and red for class 1.


def return_prediction(N, integration_steps=80):
    """Explain the model's prediction for one held-out sentence.

    Prints the example's true label and the class probability reported by the
    colouriser, then returns the coloured markup for display.

    Args:
        N: Integer row index into the ``test`` dataset.
        integration_steps: Forwarded as ``integration_steps`` to
            ``m.show_colors_for_sentence``. Defaults to 80, the value that
            was previously hard-coded.

    Returns:
        The first element returned by ``m.show_colors_for_sentence``; the
        notebook passes it to ``IPython.display.HTML`` for rendering.
    """
    # Fetch the single row rather than materialising both full columns.
    row = test[N]
    test_text, test_label = row["sentence"], row["label"]
    # Only the markup and the probability are used; the other two return
    # values are discarded.
    colors, _, prob, _ = m.show_colors_for_sentence(
        test_text, integration_steps=integration_steps
    )
    print(test_label)
    print(f"class probability: {prob:1.2f}")
    return colors


# Explain test example 10; HTML(...) as the cell's last expression renders the
# coloured sentence returned by return_prediction.
colors = return_prediction(10)
HTML(colors)

Remember to use:
from IPython.display import HTML
HTML(colored_text)


100%|█████████████████████████████████████████████████████████████████████| 80/80 [00:06<00:00, 11.60it/s]

1
class probability: 0.59





In [48]:
# Explain test example 50 and render its coloured sentence as the cell output.
colors = return_prediction(50)
HTML(colors)

100%|█████████████████████████████████████████████████████████████████████| 80/80 [00:10<00:00,  7.78it/s]

0
class probability: 0.47





In [54]:
# Explain test example 220 and render its coloured sentence as the cell output.
colors = return_prediction(220)
HTML(colors)

100%|█████████████████████████████████████████████████████████████████████| 80/80 [00:08<00:00,  9.62it/s]

0
class probability: 0.46



