In [64]:
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from dataclasses import dataclass
import os
import joblib
import requests

from sentence_transformers import InputExample, SentenceTransformer, losses
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction
from sentence_transformers import losses

from datasets import load_dataset
import evaluate

from setfit.data import create_fewshot_splits
from setfit.modeling import LOSS_NAME_TO_CLASS, SupConLoss, sentence_pairs_generation
import numpy as np
from torch.utils.data import DataLoader
import math

In [2]:
MODEL_HEAD_NAME = "model_head.pkl"

In [71]:
@dataclass
class SetFitModel(PyTorchModelHubMixin):
    def __init__(self, model_body=None, model_head=None):
        super(SetFitModel, self).__init__()
        self.model_body = model_body
        self.model_head = model_head

    def fit(self, x_train, y_train):
        embeddings = self.model_body.encode(x_train)
        self.model_head.fit(embeddings, y_train)

    def predict(self, x_test):
        embeddings = self.model_body.encode(x_test)
        return self.model_head.predict(embeddings)

    def predict_proba(self, x_test):
        embeddings = self.model_body.encode(x_test)
        return self.model_head.predict_proba(embeddings)
    
    def __call__(self, inputs):
        embeddings = self.model_body.encode(inputs)
        return self.model_head.predict(embeddings)

    def _save_pretrained(self, save_directory):
        self.model_body.save(path=save_directory)
        joblib.dump(self.model_head, f"{save_directory}/{MODEL_HEAD_NAME}")
        
    @classmethod
    def _from_pretrained(
        cls,
        model_id,
        revision=None,
        cache_dir=None,
        force_download=None,
        proxies=None,
        resume_download=None,
        local_files_only=None,
        use_auth_token=None,
        **model_kwargs
    ):
        model_body = SentenceTransformer(model_id)
        
        if os.path.isdir(model_id) and MODEL_HEAD_NAME in os.listdir(model_id):
            model_head_file = os.path.join(model_id, MODEL_HEAD_NAME)
        else:
            try:
                model_head_file = hf_hub_download(
                    repo_id=model_id,
                    filename=MODEL_HEAD_NAME,
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    use_auth_token=use_auth_token,
                    local_files_only=local_files_only,
                )
            except requests.exceptions.RequestException:
                print(f"{MODEL_HEAD_NAME} not found on HuggingFace Hub, initialising classification head with random weights.")
                model_head_file = None

        if model_head_file is not None:
            model_head = joblib.load(model_head_file)
        else:
            model_head = LogisticRegression()
        return SetFitModel(model_body=model_body, model_head=model_head)

In [72]:
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights.


In [110]:
model.save_pretrained("dummy-setfit-model")

In [114]:
model.push_to_hub("dummy-setfit-model")

Cloning https://huggingface.co/lewtun/dummy-setfit-model into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/418M [00:00<?, ?B/s]

Upload file model_head.pkl: 100%|##########| 324/324 [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/lewtun/dummy-setfit-model
   922af47..84fb8f1  main -> main



'https://huggingface.co/lewtun/dummy-setfit-model/commit/84fb8f100ddeb5db789640fe893e96481b49d25c'

In [4]:
model = SetFitModel.from_pretrained("lewtun/dummy-setfit-model")

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/324 [00:00<?, ?B/s]

In [5]:
model.model_head

In [67]:
class SetFitTrainer:
    def __init__(
        self,
        model = None,
        train_dataset = None,
        eval_dataset = None,
        compute_metrics = None,
        loss_class = None,
        num_epochs = None,
        learning_rate = None,
        batch_size = None
        ):
        
        self.model = model
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        self.loss_class = loss_class
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        
    def train(self):
        x_train = self.train_dataset["text"]
        y_train = self.train_dataset["label"]

        if self.loss_class is None:
            return

        # sentence-transformers adaptation
        batch_size = self.batch_size
#         if self.loss_class in [
#             losses.BatchAllTripletLoss,
#             losses.BatchHardTripletLoss,
#             losses.BatchSemiHardTripletLoss,
#             losses.BatchHardSoftMarginTripletLoss,
#             SupConLoss,
#         ]:

#             train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)]
#             train_data_sampler = SentenceLabelDataset(train_examples)

#             batch_size = min(self.args.batch_size, len(train_data_sampler))
#             train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True)

#             if self.loss_class is losses.BatchHardSoftMarginTripletLoss:
#                 train_loss = self.loss_class(
#                     model=self.model,
#                     distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance,
#                 )
#             elif self.loss_class is SupConLoss:
#                 train_loss = self.loss_class(model=self.model)
#             else:
#                 train_loss = self.loss_class(
#                     model=self.model,
#                     distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance,
#                     margin=0.25,
#                 )

#             train_steps = len(train_dataloader) * self.args.num_epochs
#         else:
        train_examples = []
        for _ in range(self.num_epochs):
            print("Generating sentence pairs")
            train_examples = sentence_pairs_generation(np.array(x_train), np.array(y_train), train_examples)

        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
        train_loss = self.loss_class(self.model)
        train_steps = len(train_dataloader)

        print(f"{len(x_train)} train samples in total, {train_steps} train steps with batch size {batch_size}")

        warmup_steps = math.ceil(train_steps * 0.1)
        self.model.model_body.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=1,
            steps_per_epoch=train_steps,
            warmup_steps=warmup_steps,
            show_progress_bar=False,
        )

        # Train the final classifier
        # classifier = self.get_classifier(self.model)
        self.model.model_head.fit(x_train, y_train)
        return self.model
    
    def evaluate(self):
        pass
    
    def predict(self):
        pass

## Train

In [77]:
dataset = "SetFit/sst2"

##### We load the "train" and "test" portions of the data

In [25]:
sst2_data = load_dataset(dataset)

Using custom data configuration SetFit--sst2-4811211b52125821
Reusing dataset json (/home/lewis_huggingface_co/.cache/huggingface/datasets/SetFit___json/SetFit--sst2-4811211b52125821/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
train_sst2 = sst2_data["train"]

In [27]:
test_sst2 = sst2_data["test"]

##### Let's now import this fewshot split generating function for the train set.

##### We not sample our data so that we have n number of examples for each class. We start with 16 in this case.

In [32]:
n = 8
fewshot_sst2 = create_fewshot_splits(train_sst2, [n])

##### Create_fewshot_splits has samples 10 different groups of n=16 (per class) data splits.

In [33]:
for name in fewshot_sst2:
    print(name)

train-8-0
train-8-1
train-8-2
train-8-3
train-8-4
train-8-5
train-8-6
train-8-7
train-8-8
train-8-9


##### Let's try our SetFit test on just one run. We'll call it try1. This means we're training our model on just one run of 16 examples of each class.

In [35]:
try1 = 'train-8-0'
fewshot_sst2[try1]

Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 16
})

In [74]:
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights.


In [75]:
trainer = SetFitTrainer(model=model, train_dataset=fewshot_sst2[try1], loss_class=losses.CosineSimilarityLoss, batch_size=16, num_epochs=5)

In [76]:
trainer.train()

Generating sentence pairs
Generating sentence pairs
Generating sentence pairs
Generating sentence pairs
Generating sentence pairs
16 train samples in total, 10 train steps with batch size 16


KeyError: 1