In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
import numpy as np
from setfit import SetFitModel, SetFitTrainer

### 1. Binary classification

In [3]:
dataset = load_dataset("joshuapsa/gpt-generated-news-paragraphs")

Downloading readme:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/93.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/540 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/180 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/180 [00:00<?, ? examples/s]

In [2]:
dataset = load_dataset("SetFit/SentEval-CR")

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
train_ds = dataset["train"].shuffle(seed=42).select(range(8 * 2))
print(train_ds['text'][:15])
print(train_ds['label'][:15])

['* slick-looking design and improved interface', "the day finally arrived when i was sure i 'd leave sprint .", 'as for bluetooth , no problems at all .', '2 ) storage capacity', "neither message was answered ( they ask for 24 hours before replying - i 've been waiting 27 days . )", "for a price that 's still less than even the lowest level ipod i was able to get this 40gb monster , and the best part is it works as great as it was advertised to and then some .", 'i bought the player this week and i like it by far .', 'only problem is that is a bit heavy .', 'i love the slim design ; . the weight would only be an issue if it were bulky .', 'it fits into a hand well , it has a removable battery ( this is important ) , great sound quality , fm stereo , recorder , smooth ui , and a feature that most uni pods lack . . . char ! .', 'once a depth is locked , it will jump off a little while working .', 'the thought of not having to buy refills and just using regular bags is awesome ! .', 'whe

In [4]:
test_ds = dataset["test"]

In [5]:
# check the type of the 2 datasets:
print(type(train_ds))

<class 'datasets.arrow_dataset.Dataset'>


In [6]:
# Load SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=20, # Number of text pairs to generate for contrastive learning
    num_epochs=1 # Number of epochs to use for contrastive learning
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [7]:
# Train and evaluate!
trainer.train()
metrics = trainer.evaluate()

Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 640
  Num epochs = 1
  Total optimization steps = 40
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/40 [00:00<?, ?it/s]

***** Running evaluation *****


In [14]:
model.__dict__

{'model_body': SentenceTransformer(
   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
 ),
 'model_head': LogisticRegression(),
 'multi_target_strategy': None,
 'l2_weight': 0.01,
 'normalize_embeddings': False}

In [9]:
preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
preds

tensor([1, 0], dtype=torch.int32)

In [11]:
# model.predict(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
model.predict_proba(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])

tensor([[0.2417, 0.7583],
        [0.7358, 0.2642]], dtype=torch.float64)

### 2. Multi Label Text Classification

In [2]:
model_id = "sentence-transformers/paraphrase-mpnet-base-v2"
dataset_ml = load_dataset("ethos", "multilabel")

In [3]:
#note that there is no "test" nor "validate" for this dataset out from the box
# (since the ethos dataset does not have a test split on the hub).
dataset_ml

DatasetDict({
    train: Dataset({
        features: ['text', 'violence', 'directed_vs_generalized', 'gender', 'race', 'national_origin', 'disability', 'religion', 'sexual_orientation'],
        num_rows: 433
    })
})

In [4]:
# exploring the dataset:
# the text data -> this is a list of strings
print(dataset_ml['train']['text'][:5])

# in this case, the classes are already transformed into one-hot columns
# where each column represents a class label
# e.g., the 1st, 4th and 5th articles are about 'gender'
print(dataset_ml['train']['gender'][:5])

# we can list the classes a.k.a "features":
features = dataset_ml["train"].column_names
features.remove("text")
print(features)

["You should know women's sports are a joke", 'You look like Sloth with deeper Down’s syndrome', 'You look like Russian and speak like Indian. Both are disgusting go kill yourself', 'Women deserve to be abused, I guess.', 'Women are made for making babies and cooking dinner and nothing else!!!']
[1, 0, 0, 1, 1]
['violence', 'directed_vs_generalized', 'gender', 'race', 'national_origin', 'disability', 'religion', 'sexual_orientation']


In [5]:
a = np.arange(10,20)
np.where(a) # returns the indices of all elements in a
np.where(a>15) # returns the indices of the elements in a meeting the condition

(array([6, 7, 8, 9], dtype=int64),)

In [6]:
# To simulate the effect of training on a limited number of examples, 
# let's subsample the training set to have at least 8 labeled examples per feature.
num_samples = 8
samples = np.concatenate(
    [np.random.choice(np.where(dataset_ml["train"][f])[0], num_samples, replace=False) for f in features]
)
print(samples.shape)
samples # this is an array of indices of 8 records per feature


(64,)


array([206, 291, 300,  65, 420,  28, 411, 228,   6, 321,   8, 408, 378,
        90,  67, 353, 101, 360, 406, 250, 388, 378, 423, 353,  35, 206,
        19, 305, 169,  24, 199,  16, 156,  91, 252, 263, 314, 277, 235,
       325, 100, 200, 422, 229, 162, 288,  73, 140, 241, 419, 303, 120,
        38, 189,  99, 308, 176, 133, 397,  77, 427, 222,  36,  90],
      dtype=int64)

In [7]:
# rewritten for better clarity:
# import collections

# sample_list = []
# for f in features:
#     print(f)
#     # get all the indices of the records belonging to this feature:
#     indices = np.where(dataset_ml["train"][f])[0] # using [0] to get the value of the tuple returned from np.where
#     print(indices)
#     sample = np.random.choice(indices,num_samples, replace=False)
#     print('duplicated item in a sample of a class:')
#     print([item for item, count in collections.Counter(list(sample)).items() if count > 1])
#     sample_list.append(sample)
# samples = np.concatenate(sample_list)
# samples
# len(samples) == len(set(samples))

In [8]:
# list = list(samples)
# duplicated_list = list(samples)
# unique_list = list(set(samples))
# print(sorted(duplicated_list))
# print([item for item, count in collections.Counter(duplicated_list).items() if count > 1])

In [9]:
# We encode the classes one-hot values in a single 'label' feature
# returns a dictionary
def encode_labels(record):
    return {"labels": [record[feature] for feature in features]}


# this will act on all the records of the dataset object and create a new "column" called "labels"
# effectively, this is equivalent to dataframe['labels'] = dataframe.apply(lambda row: encode_labels(row))
dataset_ml = dataset_ml.map(encode_labels) 
print(dataset_ml['train']['labels'][0])
print(dataset_ml['train']['gender'][0]) # 'gender is the index #2 = 1

[0, 0, 1, 0, 0, 0, 0, 0]
1


In [10]:
train_dataset = dataset_ml["train"].select(samples) # pass the array of indices above to select
eval_dataset = dataset_ml["train"].select(
    np.setdiff1d(np.arange(len(dataset_ml["train"])), samples)
) # find the difference in 2 arrays: indices of the samples, and indices of the whole dataset

In [11]:
# Note that the multi_target_strategy parameter here signals to both the model and the trainer 
# to expect a multi-labelled dataset.
# Here, we've downloaded a pretrained Sentence Transformer from the Hub 
# and added a logistic classification head to the create the SetFit model.
model_ml = SetFitModel.from_pretrained(model_id, # use any Sentence transformer model
                                    multi_target_strategy="one-vs-rest" 
                                    )
# note that we can load a locally available model by using a
# path to a `directory` containing model weights saved using
# [`~transformers.PreTrainedModel.save_pretrained`], e.g., `../path/to/my_model_directory/`.

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [12]:
# note that the logistic regression head is added by default by SetFitModel class
model_ml.model_head

In [13]:
# now we can initialize the trainer to configure how we want to train the model:
# note `column_mapping` argument: The SetFitTrainer expects the inputs to be found in a "text" and "label" column. 
# This mapping automatically formats the training and evaluation datasets for us.
trainer = SetFitTrainer(
    model=model_ml,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=5,
    column_mapping={"text": "text", "labels": "label"}, # IMPORTANT for SetFit - hardcoded in the source code
    num_epochs=1,
    batch_size=5
)

# trainer = SetFitTrainer(
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     loss_class=CosineSimilarityLoss,
#     # batch_size=16,
#     num_iterations=20, # Number of text pairs to generate for contrastive learning
#     num_epochs=1, # Number of epochs to use for contrastive learning
#     # column_mapping={"text": "text", "labels": "label"}
# )

In [14]:
# with the trainer configured, we can now train the model proper:
# np.random.seed()
trainer.train()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 940
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/188 [00:00<?, ?it/s]

In [44]:
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.3967828418230563}

In [19]:
features

['violence',
 'directed_vs_generalized',
 'gender',
 'race',
 'national_origin',
 'disability',
 'religion',
 'sexual_orientation']

In [43]:
sample_texts = ['men tend to be more interested in things and women more in people, but this may be different across races',
                'Ukraine is in a long war that has led to millions displaced',
                'Christianity is the largest religion in the world'
                ]
preds = model_ml.predict(sample_texts)
pred_probas = model_ml.predict_proba(sample_texts)

print("labels:")
print(features)
print("predicted class label: ")
print([features[int(pred_proba.argmax())] for pred_proba in pred_probas]) # somehow pred does not always give a class prediction -> can be a zeros array(???)

print('details:')
print(preds)
print(pred_probas)

labels:
['violence', 'directed_vs_generalized', 'gender', 'race', 'national_origin', 'disability', 'religion', 'sexual_orientation']
predicted class label: 
['race', 'national_origin', 'religion']
details:
tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0]], dtype=torch.int32)
tensor([[0.0054, 0.2941, 0.1928, 0.3491, 0.0103, 0.0084, 0.0033, 0.0576],
        [0.2756, 0.0135, 0.0365, 0.0146, 0.5041, 0.1338, 0.0457, 0.0099],
        [0.1776, 0.0114, 0.0261, 0.0128, 0.0430, 0.0154, 0.9251, 0.0056]],
       dtype=torch.float64)


### 3. Getting from pre-trained and check details

In [8]:
from setfit import SetFitModel

setfit_model = SetFitModel.from_pretrained("lewtun/setfit-ethos-multilabel-example")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [11]:
# the setfit model can be used as is but we need to know the labels and their sequences which were used to train this particular model
preds = setfit_model(
    [
        "Jewish people often don't eat pork.",
        "Is this lipstick suitable for people with dark skin?"
    ]
)
preds

tensor([[0, 0, 0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)

In [9]:
# behind the scene, the model is of SentenceTransformer class
# when taking the model_body, it has the same methods as a normal ST model such as encoding a text
setfit_model.__dict__

{'model_body': SentenceTransformer(
   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
 ),
 'model_head': OneVsRestClassifier(estimator=LogisticRegression()),
 'multi_target_strategy': None,
 'l2_weight': 0.01,
 'normalize_embeddings': False}

In [17]:
setfit_encodings = setfit_model.model_body.encode("I love apple")
setfit_encodings

array([ 4.93167713e-02,  3.17766182e-02, -4.10573892e-02,  5.57808280e-02,
        9.08400640e-02, -1.77587401e-02, -1.36027306e-01, -1.47274258e-02,
        1.59128204e-01,  1.16944551e-01, -4.62600812e-02,  2.98268616e-01,
       -5.22914454e-02, -8.29667673e-02, -4.44911234e-02, -4.40300219e-02,
        1.66153476e-01,  1.91608459e-01,  4.12492529e-02, -2.89649013e-02,
       -1.03310630e-01,  1.47008151e-01,  1.29550248e-01, -3.77085879e-02,
       -2.10230798e-01, -1.06654964e-01,  9.99176949e-02,  1.21255957e-01,
        1.41623139e-01,  3.66448388e-02,  1.32339492e-01,  1.83584392e-02,
       -2.79984921e-01,  1.19764067e-01,  9.95515436e-02,  1.32437855e-01,
        1.01705149e-01,  3.26256976e-02, -1.57782301e-01,  1.11090027e-01,
        9.38620195e-02,  4.56442982e-02, -1.28066212e-01,  9.21679754e-03,
       -2.83422321e-02, -5.04130051e-02,  1.33230761e-01, -3.63847166e-02,
        1.81325838e-01, -9.41544920e-02, -3.93210240e-02, -3.06797773e-02,
        1.67208649e-02,  

In [14]:
from sentence_transformers import SentenceTransformer
st_model = SentenceTransformer('lewtun/setfit-ethos-multilabel-example')

In [15]:
# the same model is obtained via calling the pretrained model from the SentenceTransformer module directly
st_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [25]:
st_encodings = st_model.encode("I love apple")
print(setfit_encodings.shape)
st_encodings

(768,)


array([ 4.93167713e-02,  3.17766182e-02, -4.10573892e-02,  5.57808280e-02,
        9.08400640e-02, -1.77587401e-02, -1.36027306e-01, -1.47274258e-02,
        1.59128204e-01,  1.16944551e-01, -4.62600812e-02,  2.98268616e-01,
       -5.22914454e-02, -8.29667673e-02, -4.44911234e-02, -4.40300219e-02,
        1.66153476e-01,  1.91608459e-01,  4.12492529e-02, -2.89649013e-02,
       -1.03310630e-01,  1.47008151e-01,  1.29550248e-01, -3.77085879e-02,
       -2.10230798e-01, -1.06654964e-01,  9.99176949e-02,  1.21255957e-01,
        1.41623139e-01,  3.66448388e-02,  1.32339492e-01,  1.83584392e-02,
       -2.79984921e-01,  1.19764067e-01,  9.95515436e-02,  1.32437855e-01,
        1.01705149e-01,  3.26256976e-02, -1.57782301e-01,  1.11090027e-01,
        9.38620195e-02,  4.56442982e-02, -1.28066212e-01,  9.21679754e-03,
       -2.83422321e-02, -5.04130051e-02,  1.33230761e-01, -3.63847166e-02,
        1.81325838e-01, -9.41544920e-02, -3.93210240e-02, -3.06797773e-02,
        1.67208649e-02,  

In [19]:
# we can confirm that the same embeddings can be produced from both pretrained models
# as they are essentially the same thing but retrieved differently
setfit_encodings == st_encodings

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [20]:
# we can test with another small ST model which was finetuned on different dataset(s)
# note that different pretrained models produce different dimensions of embeddings, choose one that is comparable (e.g., 768 here)
# refer to Sentence Transformer website / Pretrained Models
st_model_2 = SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')

Downloading (…)f333f/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d423f333f/README.md:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading (…)423f333f/config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)f333f/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)23f333f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [24]:
# as it is trained on a different dataset, the embeddings produced will not be the same as the previous model
st_encodings_2 = st_model_2.encode("I love apple")
print(st_encodings_2.shape)
st_encodings_2 == st_encodings

(768,)


array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,