**Description**: Sigmoid loss for text-similarity training, inspired by
[SigLIP](https://arxiv.org/abs/2303.15343). Its benefits are outlined in the SigLIP paper;
chiefly, it makes it easier to increase the batch size and so get more in-batch negatives.

**Usage**: run on a T4 GPU.

Modified from this SentenceTransformers
[script](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/matryoshka/matryoshka_nli.py).

In [None]:
!pip install datasets --upgrade sentence-transformers

In [None]:
!pip uninstall wandb

In [1]:
from datetime import datetime
from typing import Any, Iterable

from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
    util,
)
from sentence_transformers.evaluation import (
    EmbeddingSimilarityEvaluator,
    SimilarityFunction,
)
from sentence_transformers.training_args import BatchSamplers
import torch

In [2]:
# Loss switch: True trains with the custom sigmoid loss defined in this notebook,
# False trains with the stock `losses.MultipleNegativesRankingLoss` for comparison.
USE_CUSTOM = True

In [3]:
# Base checkpoint and the core training hyperparameters.
model_name = "distilroberta-base"
batch_size, num_train_epochs = 128, 1

# Save path of the model: the model name (with "/" made path-safe) plus a launch timestamp,
# so that repeated runs do not share a directory.
output_dir = "output/sigltt_nli_{}-{:%Y-%m-%d_%H-%M-%S}".format(
    model_name.replace("/", "-"), datetime.now()
)

In [4]:
# 1. Define the SentenceTransformer model from the `model_name` checkpoint. If the checkpoint is not
# already a Sentence Transformer model, one is created automatically with "mean" pooling.
model = SentenceTransformer(model_name)
# Optional: cap the maximum sequence length the model processes (left disabled here)
# model.max_seq_length = 75



In [5]:
# 2. Load the "triplet" configuration of the AllNLI dataset (train split):
# https://huggingface.co/datasets/sentence-transformers/all-nli
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")

# Keep only the first 10,000 training samples for this demo.
# Remove this line to train on the full split.
train_dataset = train_dataset.select(range(10_000))

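# Quick demo

Check on random scores that `BCEWithLogitsLoss` with 0/1 targets $y_{ij}$ equals the SigLIP form
$-\frac{1}{b}\sum_{i,j}\log\sigma(z_{ij}\,s_{ij})$, where $z_{ij} = 2y_{ij} - 1 \in \{-1, +1\}$.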

In [6]:
b = 2  # number of anchors in the toy batch
_nn = 3  # extra candidate groups of size b (m = b * (1 + _nn))
m = b * (1 + _nn)  # m = len(candidates)

scores = torch.randn((b, m))
# Targets: the first b columns form an identity block (candidate i is the positive of anchor i),
# every remaining column is a negative for all anchors.
Y = torch.zeros((b, m))
Y[:, :b] = torch.eye(b)

In [7]:
# Binary cross-entropy on the raw scores, summed over every (anchor, candidate) pair
# and then divided by the number of anchors.
loss = torch.nn.BCEWithLogitsLoss(reduction="sum")
summed_bce = loss(scores, Y)
summed_bce / b

tensor(6.7539)

In [8]:
# Same quantity written with log-sigmoid: map the 0/1 targets to -1/+1 signs,
# then sum -log(sigmoid(sign * score)) and divide by the number of anchors.
log_sigmoid = torch.nn.LogSigmoid()
signs = 2 * Y - 1
-log_sigmoid(signs * scores).sum() / b

tensor(6.7539)

# Loss implementation

In [None]:
# 3. Define our training loss
class MultipleNegativesRankingSigmoidLoss(torch.nn.Module):
    """Pairwise sigmoid (SigLIP-style) loss with in-batch negatives.

    Every (anchor, candidate) pair in the batch is scored as an independent binary
    classification problem: the pair (anchor[i], candidate[i]) is the positive and
    all other pairs are negatives. The logits are ``similarity * scale + bias``,
    where ``scale`` and ``bias`` are stored as ``torch.nn.Parameter`` and can
    therefore be updated by an optimizer that receives this module's parameters.
    """

    def __init__(
        self,
        model: SentenceTransformer,
        scale: float = 20.0,
        similarity_fct=util.cos_sim,
        bias: float = -10.0,
    ) -> None:
        """
        Args:
            model: Model used to embed each text column; ``model.device`` also
                decides where ``scale`` and ``bias`` are created.
            scale: Initial multiplier applied to the similarity scores.
            similarity_fct: Callable taking (anchors, candidates) and returning a
                (num_anchors, num_candidates) similarity matrix.
            bias: Initial offset added to the scaled similarity scores.
        """
        super().__init__()
        self.model = model
        self.scale = torch.nn.Parameter(torch.tensor(scale, device=model.device))
        self.similarity_fct = similarity_fct
        self.bias = torch.nn.Parameter(torch.tensor(bias, device=model.device))
        # Summed (not averaged) so that forward() can normalise by the number of anchors only.
        self.bce_loss = torch.nn.BCEWithLogitsLoss(reduction="sum")

    def forward(
        self, sentence_features: Iterable[dict[str, torch.Tensor]], labels: torch.Tensor
    ) -> torch.Tensor:
        """Compute the sigmoid loss for one batch.

        Args:
            sentence_features: One tokenized-feature dict per text column; the first
                column holds the anchors, the second the positives, and any further
                columns hold negatives.
            labels: Unused. It is accepted for interface compatibility; the targets
                are derived from the batch layout instead.

        Returns:
            Scalar tensor: the binary cross-entropy summed over all
            (anchor, candidate) pairs and divided by the number of anchors.

        Raises:
            ValueError: If fewer than two text columns are provided, in which case
                there are no candidates to compare the anchors against.
        """
        # Compute the embeddings and distribute them to anchor and candidates (positive and optionally negatives)
        embeddings = [
            self.model(sentence_feature)["sentence_embedding"]
            for sentence_feature in sentence_features
        ]
        if len(embeddings) < 2:
            raise ValueError(
                "MultipleNegativesRankingSigmoidLoss needs at least two text columns "
                f"(anchor and positive), but got {len(embeddings)}."
            )
        anchors = embeddings[0]  # (batch_size, embedding_dim)
        candidates = torch.cat(
            embeddings[1:]
        )  # (batch_size * (1 + num_negatives), embedding_dim)

        # For every anchor, we compute the similarity to all candidates (positives and negatives),
        # also those belonging to other anchors. This gives us a lot of in-batch negatives.
        scores: torch.Tensor = (
            self.similarity_fct(anchors, candidates) * self.scale
        ) + self.bias
        # (batch_size, batch_size * (1 + num_negatives))

        # candidates[i] is the paired positive of anchors[i], so the target matrix is an identity
        # block over the first b columns followed by zeros for every remaining (negative) column.
        # A separate name is used so the incoming `labels` argument is not clobbered.
        b = len(anchors)
        m = len(candidates)
        targets = torch.concat(
            (
                torch.eye(b, device=scores.device),
                torch.zeros((b, m - b), device=scores.device),
            ),
            dim=1,
        )
        return self.bce_loss(scores, targets) / b

    def get_config_dict(self) -> dict[str, Any]:
        """Return the current scale, bias and similarity-function name as plain Python values."""
        # Callables such as functools.partial objects have no __name__; fall back to their repr
        # instead of raising AttributeError.
        similarity_name = getattr(
            self.similarity_fct, "__name__", repr(self.similarity_fct)
        )
        return {
            "scale": self.scale.item(),
            "similarity_fct": similarity_name,
            "bias": self.bias.item(),
        }

    @property
    def citation(self) -> str:
        """BibTeX entry for the SigLIP paper this loss is based on."""
        return """
@inproceedings{zhai2023sigmoid,
    title={Sigmoid loss for language image pre-training},
    author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
    booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
    pages={11975--11986},
    year={2023}
}
"""

# Training and evaluation demo

In [10]:
# Only enable bf16 mixed precision when a CUDA device is present and reports bf16 support.
# Checking availability first keeps this cell usable on a runtime without CUDA, where the
# support query may not be answerable.
bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
if bf16:
    print("Using mixed precision in bf16")
else:
    print("Not using mixed precision")

Using mixed precision in bf16


In [11]:
# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
    # Output location (the only required argument)
    output_dir=output_dir,
    # Schedule and reproducibility
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    seed=42,
    # Batching: MultipleNegativesRankingLoss-style losses benefit from no duplicate samples in a batch
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Precision and device: bf16 as detected above, never fp16, no MPS
    bf16=bf16,
    fp16=False,
    use_mps_device=False,
)

In [None]:
# 6. Create the trainer
# The USE_CUSTOM flag picks the loss; everything else is shared between the two variants.
if not USE_CUSTOM:
    print("Using OG MNRL")
    train_loss = losses.MultipleNegativesRankingLoss(model)
else:
    print("Using **CUSTOM** Sigmoid loss")
    train_loss = MultipleNegativesRankingSigmoidLoss(model)

trainer = SentenceTransformerTrainer(
    loss=train_loss,
    train_dataset=train_dataset,
    args=args,
    model=model,
)

Using **CUSTOM** Sigmoid loss


In [13]:
# Reset CUDA's peak-memory counters so the peaks read after training start from this point.
torch.cuda.reset_peak_memory_stats()

In [14]:
trainer.train()  # og: output below is from the stock MultipleNegativesRankingLoss (USE_CUSTOM = False)

Step,Training Loss


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=79, training_loss=1.7303571097458466, metrics={'train_runtime': 97.3195, 'train_samples_per_second': 102.754, 'train_steps_per_second': 0.812, 'total_flos': 0.0, 'train_loss': 1.7303571097458466, 'epoch': 1.0})

In [14]:
trainer.train()  # custom: output below is from the custom sigmoid loss (USE_CUSTOM = True)

Step,Training Loss


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=79, training_loss=181.11539507515823, metrics={'train_runtime': 97.373, 'train_samples_per_second': 102.698, 'train_steps_per_second': 0.811, 'total_flos': 0.0, 'train_loss': 181.11539507515823, 'epoch': 1.0})

In [15]:
# Show the scale and bias of the custom loss after training (the stock loss is not inspected here).
if USE_CUSTOM:
    for learned_param in (train_loss.scale, train_loss.bias):
        print(learned_param)

Parameter containing:
tensor(19.9979, device='cuda:0', requires_grad=True)
Parameter containing:
tensor(-10.0021, device='cuda:0', requires_grad=True)


In [16]:
# og: peak CUDA memory, converted from bytes by dividing by 1024**3
bytes_per_gb = 1024**3
peak_memory_allocated = torch.cuda.max_memory_allocated()
peak_memory_reserved = torch.cuda.max_memory_reserved()

print(f"Peak memory allocated: {peak_memory_allocated / bytes_per_gb:.2f} GB")
print(f"Peak memory reserved: {peak_memory_reserved / bytes_per_gb:.2f} GB")

Peak memory allocated: 4.43 GB
Peak memory reserved: 4.99 GB


In [16]:
# custom: peak CUDA memory, converted from bytes by dividing by 1024**3
bytes_per_gb = 1024**3
peak_memory_allocated = torch.cuda.max_memory_allocated()
peak_memory_reserved = torch.cuda.max_memory_reserved()

print(f"Peak memory allocated: {peak_memory_allocated / bytes_per_gb:.2f} GB")
print(f"Peak memory reserved: {peak_memory_reserved / bytes_per_gb:.2f} GB")

Peak memory allocated: 4.43 GB
Peak memory reserved: 4.99 GB


In [17]:
# 7. Evaluate the model performance on the STS Benchmark test dataset
test_dataset = load_dataset("sentence-transformers/stsb", split="test")

# Score each sentence pair with cosine similarity and compare against the dataset's "score" column.
evaluator = EmbeddingSimilarityEvaluator(
    name="sts-test",
    main_similarity=SimilarityFunction.COSINE,
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
)

test_result = evaluator(model)

In [18]:
test_result  # og: STS-B test metrics for the stock MultipleNegativesRankingLoss (USE_CUSTOM = False)

{'sts-test_pearson_cosine': 0.7173359658890913,
 'sts-test_spearman_cosine': 0.7179704210761317}

In [18]:
test_result  # custom: STS-B test metrics for the custom sigmoid loss (USE_CUSTOM = True)

{'sts-test_pearson_cosine': 0.7399968711738987,
 'sts-test_spearman_cosine': 0.7233642739556767}

In [None]:
# # 8. Save the trained & evaluated model locally
# final_output_dir = f"{output_dir}/final"
# model.save(final_output_dir)
# final_output_dir