# Chapter 10 - Creating Text Embedding Models

## Creating an Embedding model

### Data

In [1]:
from datasets import load_dataset

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [2]:
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Model

In [3]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("bert-base-uncased")

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Loss Function

In [4]:
from sentence_transformers import losses

train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

### Evaluation


In [5]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

Downloading data:   0%|          | 0.00/502k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

### Training

In [6]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

In [7]:
from sentence_transformers.trainer import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 1.0672, 'grad_norm': 2.943986177444458, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.9489, 'grad_norm': 2.6952121257781982, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.8943, 'grad_norm': 3.071946859359741, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.8535, 'grad_norm': 3.986969470977783, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.8312, 'grad_norm': 4.4188947677612305, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.8539, 'grad_norm': 4.209220886230469, 'learning_rate': 3.2946001367054005e-05, 'epoch': 0.38}
{'loss': 0.8195, 'grad_norm': 4.817514419555664, 'learning_rate': 2.9528366370471632e-05, 'epoch': 0.45}
{'loss': 0.8066, 'grad_norm': 4.501857757568359, 'learning_rate': 2.611073137388927e-05, 'epoch': 0.51}
{'loss': 0.8008, 'grad_norm': 5.136640548706055, 'learning_rate': 2.2693096377306907e-05, 'epoch': 0.58}
{'loss': 0.8053, 'grad_norm': 4.59669303894043, 'learning_rate': 1.9275461380724537e-05, 'epoch': 0.64}
{'loss': 0.7749, 'grad_norm': 3.2648565769195557, 'learning_rate': 1.5857826384142175e-05, 'epoch': 0.7}
{'loss': 0.7598, 'grad_norm': 6.336752891540527, 'learning_rate': 1.2440191387559808e-05, 'epoch': 0.77}
{'loss': 0.7757, 'grad_norm': 3.5765745639801025, 'learning_rate': 9.022556390977444e-06, 'epoch': 0.83}
{'loss': 0.7509, 'grad_norm': 4.3848090171813965, 'learning_rate': 5.604921394395079e-06, 'epoch': 0.9}
{'loss': 0.777, 'grad_norm': 5.516924858093262, 'learning_

TrainOutput(global_step=1563, training_loss=0.8320150686545931, metrics={'train_runtime': 226.4975, 'train_samples_per_second': 220.753, 'train_steps_per_second': 6.901, 'total_flos': 0.0, 'train_loss': 0.8320150686545931, 'epoch': 1.0})

In [8]:
evaluator(embedding_model)

{'pearson_cosine': 0.4269660344252679,
 'spearman_cosine': 0.4889759904724608,
 'pearson_manhattan': 0.46133627094623064,
 'spearman_manhattan': 0.48610902052240407,
 'pearson_euclidean': 0.4497834418965715,
 'spearman_euclidean': 0.4802397638029282,
 'pearson_dot': 0.3984190595159358,
 'spearman_dot': 0.4082841892345862,
 'pearson_max': 0.46133627094623064,
 'spearman_max': 0.4889759904724608}

## MTEB

In [9]:
from mteb import MTEB
evaluation = MTEB(tasks=["Banking77Classification"])

results = evaluation.run(embedding_model)
results



[MTEBResults(task_name=Banking77Classification, scores=...)]

In [11]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

## Loss Funtions

### Cosine Similarity Loss

In [1]:
from datasets import Dataset, load_dataset

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

mapping = {2: 0, 1: 0, 0: 1}
train_dataset = Dataset.from_dict(
    {
        "sentence1": train_dataset["premise"],
        "sentence2": train_dataset["hypothesis"],
        "label": [float(mapping[label]) for label in train_dataset["label"]]
    }
)

In [2]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [3]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer('bert-base-uncased')

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2309, 'grad_norm': 2.236973762512207, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.1694, 'grad_norm': 1.530463457107544, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.1704, 'grad_norm': 1.2414125204086304, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.1593, 'grad_norm': 1.0040730237960815, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.1524, 'grad_norm': 1.6231491565704346, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.1586, 'grad_norm': 1.2467025518417358, 'learning_rate': 3.291182501708818e-05, 'epoch': 0.38}
{'loss': 0.15, 'grad_norm': 1.0338683128356934, 'learning_rate': 2.9494190020505813e-05, 'epoch': 0.45}
{'loss': 0.1561, 'grad_norm': 1.677746295928955, 'learning_rate': 2.6076555023923443e-05, 'epoch': 0.51}
{'loss': 0.1489, 'grad_norm': 1.4655015468597412, 'learning_rate': 2.2658920027341084e-05, 'epoch': 0.58}
{'loss': 0.148, 'grad_norm': 1.0010393857955933, 'learning_rate': 1.9241285030758715e-05, 'epoch': 0.64}
{'loss': 0.1479, 'grad_norm': 1.3275713920593262, 'learning_rate': 1.5823650034176352e-05, 'epoch': 0.7}
{'loss': 0.1457, 'grad_norm': 1.3998632431030273, 'learning_rate': 1.2406015037593984e-05, 'epoch': 0.77}
{'loss': 0.1472, 'grad_norm': 1.4148309230804443, 'learning_rate': 8.988380041011621e-06, 'epoch': 0.83}
{'loss': 0.1414, 'grad_norm': 1.3700520992279053, 'learning_rate': 5.570745044429255e-06, 'epoch': 0.9}
{'loss': 0.1396, 'grad_norm': 1.1958560943603516, 'lear

TrainOutput(global_step=1563, training_loss=0.1571444806119073, metrics={'train_runtime': 219.6343, 'train_samples_per_second': 227.651, 'train_steps_per_second': 7.116, 'total_flos': 0.0, 'train_loss': 0.1571444806119073, 'epoch': 1.0})

In [4]:
evaluator(embedding_model)

{'pearson_cosine': 0.7248499580867129,
 'spearman_cosine': 0.7273437813071368,
 'pearson_manhattan': 0.7384358702485685,
 'spearman_manhattan': 0.7370732310820323,
 'pearson_euclidean': 0.7382434313913847,
 'spearman_euclidean': 0.7368875067545738,
 'pearson_dot': 0.6676837171957636,
 'spearman_dot': 0.6698132878680747,
 'pearson_max': 0.7384358702485685,
 'spearman_max': 0.7370732310820323}

### Multiple Negative Ranking Loss

In [7]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda x: True if x['label'] == 0 else False)

train_dataset = {"anchor": [], "positive": [], "negative": []}
soft_negatives = mnli["hypothesis"]
random.shuffle(soft_negatives)
for row, soft_negative in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negative)
train_dataset = Dataset.from_dict(train_dataset)
len(train_dataset)

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

16875it [00:00, 26569.27it/s]


16875

In [8]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [9]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/528 [00:00<?, ?it/s]

{'loss': 0.3371, 'grad_norm': 5.291716575622559, 'learning_rate': 4.85e-05, 'epoch': 0.19}
{'loss': 0.1056, 'grad_norm': 5.968145370483398, 'learning_rate': 3.866822429906542e-05, 'epoch': 0.38}
{'loss': 0.0774, 'grad_norm': 4.940764904022217, 'learning_rate': 2.698598130841122e-05, 'epoch': 0.57}
{'loss': 0.0666, 'grad_norm': 0.21554285287857056, 'learning_rate': 1.530373831775701e-05, 'epoch': 0.76}
{'loss': 0.0691, 'grad_norm': 1.5026662349700928, 'learning_rate': 3.6214953271028036e-06, 'epoch': 0.95}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'train_runtime': 100.9323, 'train_samples_per_second': 167.191, 'train_steps_per_second': 5.231, 'train_loss': 0.1275985473484704, 'epoch': 1.0}


TrainOutput(global_step=528, training_loss=0.1275985473484704, metrics={'train_runtime': 100.9323, 'train_samples_per_second': 167.191, 'train_steps_per_second': 5.231, 'total_flos': 0.0, 'train_loss': 0.1275985473484704, 'epoch': 1.0})

In [10]:
evaluator(embedding_model)

{'pearson_cosine': 0.8103973735587551,
 'spearman_cosine': 0.8123781869653769,
 'pearson_manhattan': 0.8244308076276949,
 'spearman_manhattan': 0.8194081775819729,
 'pearson_euclidean': 0.8241744954401639,
 'spearman_euclidean': 0.8189561975709714,
 'pearson_dot': 0.7359319321075328,
 'spearman_dot': 0.7243260390110376,
 'pearson_max': 0.8244308076276949,
 'spearman_max': 0.8194081775819729}

## Fine-tuning

### Supervised

In [12]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [13]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

# train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.1579, 'grad_norm': 3.3383264541625977, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.1103, 'grad_norm': 2.0389392375946045, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.1208, 'grad_norm': 1.8478738069534302, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.116, 'grad_norm': 3.5813465118408203, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.1104, 'grad_norm': 5.544090270996094, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.103, 'grad_norm': 1.4601185321807861, 'learning_rate': 3.291182501708818e-05, 'epoch': 0.38}
{'loss': 0.112, 'grad_norm': 3.89973783493042, 'learning_rate': 2.9494190020505813e-05, 'epoch': 0.45}
{'loss': 0.1008, 'grad_norm': 2.8806612491607666, 'learning_rate': 2.6076555023923443e-05, 'epoch': 0.51}
{'loss': 0.1081, 'grad_norm': 1.9317626953125, 'learning_rate': 2.2658920027341084e-05, 'epoch': 0.58}
{'loss': 0.1015, 'grad_norm': 5.811153411865234, 'learning_rate': 1.9241285030758715e-05, 'epoch': 0.64}
{'loss': 0.0971, 'grad_norm': 3.0193395614624023, 'learning_rate': 1.5823650034176352e-05, 'epoch': 0.7}
{'loss': 0.109, 'grad_norm': 1.5798457860946655, 'learning_rate': 1.2406015037593984e-05, 'epoch': 0.77}
{'loss': 0.1049, 'grad_norm': 2.923222303390503, 'learning_rate': 8.988380041011621e-06, 'epoch': 0.83}
{'loss': 0.1063, 'grad_norm': 1.1207858324050903, 'learning_rate': 5.570745044429255e-06, 'epoch': 0.9}
{'loss': 0.1075, 'grad_norm': 4.140763282775879, 'learning_ra

TrainOutput(global_step=1563, training_loss=0.11024341961548867, metrics={'train_runtime': 79.9944, 'train_samples_per_second': 625.043, 'train_steps_per_second': 19.539, 'total_flos': 0.0, 'train_loss': 0.11024341961548867, 'epoch': 1.0})

In [14]:
evaluator(embedding_model)

{'pearson_cosine': 0.8486568725007678,
 'spearman_cosine': 0.8479664072547248,
 'pearson_manhattan': 0.8506895739059305,
 'spearman_manhattan': 0.8469505097564902,
 'pearson_euclidean': 0.8516261002307633,
 'spearman_euclidean': 0.8479664072547248,
 'pearson_dot': 0.8486568752742518,
 'spearman_dot': 0.8479664072547248,
 'pearson_max': 0.8516261002307633,
 'spearman_max': 0.8479664072547248}

In [15]:
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
evaluator(original_model)



{'pearson_cosine': 0.8696194575405793,
 'spearman_cosine': 0.8671631190200253,
 'pearson_manhattan': 0.8670398996831856,
 'spearman_manhattan': 0.8663946131522758,
 'pearson_euclidean': 0.867871596469861,
 'spearman_euclidean': 0.8671631190200253,
 'pearson_dot': 0.8696194549917766,
 'spearman_dot': 0.8671631197908374,
 'pearson_max': 0.8696194575405793,
 'spearman_max': 0.8671631197908374}

## Augmented SBERT

**Step 1**: Fine-tune a cross-encoder

In [1]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
mapping = {2: 0, 1: 0, 0: 1}

gold_examples = [
    InputExample(texts=[row["premise"], row["hypothesis"]], label=mapping[row["label"]]) for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)

gold = pd.DataFrame(
    {
        'sentence1': dataset['premise'],
        'sentence2': dataset['hypothesis'],
        'label': [mapping[label] for label in dataset['label']]
    }
)

100%|██████████| 50000/50000 [00:01<00:00, 37412.39it/s]


In [2]:
from sentence_transformers.cross_encoder import CrossEncoder

cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1, show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1562 [00:00<?, ?it/s]

**Step 2**: Create a new sentence pairs

In [3]:
silver = load_dataset("glue", "mnli", split="train").select(range(10_000, 50_000))
pairs = list(zip(silver['premise'], silver['hypothesis']))

**Step 3**: Label new sentence pairs with the fine-tuned cross-encoder (silver dataset)

In [4]:
import numpy as np

output = cross_encoder.predict(pairs, apply_softmax=True, show_progress_bar=True)
silver = pd.DataFrame(
    {
        'sentence1': silver['premise'],
        'sentence2': silver['hypothesis'],
        'label': np.argmax(output, axis=1)
    }
)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

**Step 4**: Train a bi-encoder (SBERT) on the extended datatset (gold + silver)

In [5]:
# combine gold and silver
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep='first')
train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [6]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity="cosine"    
)

In [7]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer('bert-base-uncased')
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

# train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2354, 'grad_norm': 1.6784230470657349, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.06}
{'loss': 0.1737, 'grad_norm': 1.5096228122711182, 'learning_rate': 4.6616541353383456e-05, 'epoch': 0.13}
{'loss': 0.1636, 'grad_norm': 1.6863032579421997, 'learning_rate': 4.3198906356801096e-05, 'epoch': 0.19}
{'loss': 0.163, 'grad_norm': 1.2168338298797607, 'learning_rate': 3.978127136021873e-05, 'epoch': 0.26}
{'loss': 0.1565, 'grad_norm': 1.3421708345413208, 'learning_rate': 3.6363636363636364e-05, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.1579, 'grad_norm': 1.5156283378601074, 'learning_rate': 3.2946001367054005e-05, 'epoch': 0.38}
{'loss': 0.1523, 'grad_norm': 1.292264699935913, 'learning_rate': 2.9528366370471632e-05, 'epoch': 0.45}
{'loss': 0.1518, 'grad_norm': 1.2168147563934326, 'learning_rate': 2.611073137388927e-05, 'epoch': 0.51}
{'loss': 0.1482, 'grad_norm': 1.4321715831756592, 'learning_rate': 2.2693096377306907e-05, 'epoch': 0.58}
{'loss': 0.1453, 'grad_norm': 1.3003636598587036, 'learning_rate': 1.9275461380724537e-05, 'epoch': 0.64}
{'loss': 0.1512, 'grad_norm': 1.1824214458465576, 'learning_rate': 1.5857826384142175e-05, 'epoch': 0.7}
{'loss': 0.1456, 'grad_norm': 1.1237597465515137, 'learning_rate': 1.2440191387559808e-05, 'epoch': 0.77}
{'loss': 0.1446, 'grad_norm': 1.3558070659637451, 'learning_rate': 9.022556390977444e-06, 'epoch': 0.83}
{'loss': 0.1458, 'grad_norm': 1.2280879020690918, 'learning_rate': 5.604921394395079e-06, 'epoch': 0.9}
{'loss': 0.1445, 'grad_norm': 1.0700139999389648, 'l

TrainOutput(global_step=1563, training_loss=0.15777476842176127, metrics={'train_runtime': 222.6153, 'train_samples_per_second': 224.594, 'train_steps_per_second': 7.021, 'total_flos': 0.0, 'train_loss': 0.15777476842176127, 'epoch': 1.0})

In [8]:
evaluator(embedding_model)

{'pearson_cosine': 0.7340504370447386,
 'spearman_cosine': 0.7367061543162533,
 'pearson_manhattan': 0.73860755719696,
 'spearman_manhattan': 0.7389655817992703,
 'pearson_euclidean': 0.7384506413601905,
 'spearman_euclidean': 0.7385994420996997,
 'pearson_dot': 0.7008053852029055,
 'spearman_dot': 0.7026937642958971,
 'pearson_max': 0.73860755719696,
 'spearman_max': 0.7389655817992703}

In [9]:
trainer.accelerator.clear()

[]

**Step 5**: Evaluate without the silver dataset

In [10]:
data = pd.concat([gold], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="gold_only_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2354, 'grad_norm': 1.6784230470657349, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.06}
{'loss': 0.1737, 'grad_norm': 1.5096228122711182, 'learning_rate': 4.6616541353383456e-05, 'epoch': 0.13}
{'loss': 0.1636, 'grad_norm': 1.6834665536880493, 'learning_rate': 4.3198906356801096e-05, 'epoch': 0.19}
{'loss': 0.1627, 'grad_norm': 1.2431890964508057, 'learning_rate': 3.978127136021873e-05, 'epoch': 0.26}
{'loss': 0.157, 'grad_norm': 1.2910016775131226, 'learning_rate': 3.6363636363636364e-05, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 0.1576, 'grad_norm': 1.490830421447754, 'learning_rate': 3.2946001367054005e-05, 'epoch': 0.38}
{'loss': 0.1518, 'grad_norm': 1.2361739873886108, 'learning_rate': 2.9528366370471632e-05, 'epoch': 0.45}
{'loss': 0.1512, 'grad_norm': 1.2147853374481201, 'learning_rate': 2.611073137388927e-05, 'epoch': 0.51}
{'loss': 0.1491, 'grad_norm': 1.3431299924850464, 'learning_rate': 2.2693096377306907e-05, 'epoch': 0.58}
{'loss': 0.1456, 'grad_norm': 1.3120659589767456, 'learning_rate': 1.9275461380724537e-05, 'epoch': 0.64}
{'loss': 0.1509, 'grad_norm': 1.154739260673523, 'learning_rate': 1.5857826384142175e-05, 'epoch': 0.7}
{'loss': 0.1456, 'grad_norm': 1.1382807493209839, 'learning_rate': 1.2440191387559808e-05, 'epoch': 0.77}
{'loss': 0.1444, 'grad_norm': 1.3839597702026367, 'learning_rate': 9.022556390977444e-06, 'epoch': 0.83}
{'loss': 0.1455, 'grad_norm': 1.2133880853652954, 'learning_rate': 5.604921394395079e-06, 'epoch': 0.9}
{'loss': 0.1444, 'grad_norm': 1.0826623439788818, 'le

TrainOutput(global_step=1563, training_loss=0.15768708102762585, metrics={'train_runtime': 235.1198, 'train_samples_per_second': 212.649, 'train_steps_per_second': 6.648, 'total_flos': 0.0, 'train_loss': 0.15768708102762585, 'epoch': 1.0})

In [11]:
evaluator(embedding_model)

{'pearson_cosine': 0.7391174644133511,
 'spearman_cosine': 0.7415088480221026,
 'pearson_manhattan': 0.741650970678973,
 'spearman_manhattan': 0.7422510745129687,
 'pearson_euclidean': 0.7415154616689631,
 'spearman_euclidean': 0.7421394641156605,
 'pearson_dot': 0.6962311529189662,
 'spearman_dot': 0.6991862102738733,
 'pearson_max': 0.741650970678973,
 'spearman_max': 0.7422510745129687}

## Unsupervised Learning
### Transfomer-based Denoising AutoEncoder

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

# Create a flat list of sentences
mnli = load_dataset("glue", "mnli", split="train").select(range(25_000))
flat_sentences = mnli["premise"] + mnli["hypothesis"]

# Add noise to our input data
damaged_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))

# Create dataset
train_dataset = {"damaged_sentence": [], "original_sentence": []}
for data in tqdm(damaged_data):
    train_dataset["damaged_sentence"].append(data.texts[0])
    train_dataset["original_sentence"].append(data.texts[1])
train_dataset = Dataset.from_dict(train_dataset)

100%|██████████| 48353/48353 [00:06<00:00, 7395.53it/s]


In [5]:
train_dataset[0]

{'damaged_sentence': 'it Paris in winter.',
 'original_sentence': "it's best to visit Paris in winter."}

In [6]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [7]:
from sentence_transformers import models, SentenceTransformer

word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])



In [8]:
from sentence_transformers import losses

train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder=True
)
train_loss.decoder = train_loss.decoder.to('cuda')

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [9]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/3023 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'loss': 7.1498, 'grad_norm': 10.996194839477539, 'learning_rate': 4.8e-05, 'epoch': 0.03}
{'loss': 4.9324, 'grad_norm': 7.21999454498291, 'learning_rate': 4.8357851522408484e-05, 'epoch': 0.07}
{'loss': 4.697, 'grad_norm': 6.95160436630249, 'learning_rate': 4.6647280191583994e-05, 'epoch': 0.1}
{'loss': 4.5951, 'grad_norm': 7.555102825164795, 'learning_rate': 4.49367088607595e-05, 'epoch': 0.13}
{'loss': 4.5318, 'grad_norm': 6.254870891571045, 'learning_rate': 4.3226137529935e-05, 'epoch': 0.17}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'loss': 4.3859, 'grad_norm': 6.634369850158691, 'learning_rate': 4.15155661991105e-05, 'epoch': 0.2}
{'loss': 4.385, 'grad_norm': 6.556199073791504, 'learning_rate': 3.980499486828601e-05, 'epoch': 0.23}
{'loss': 4.3271, 'grad_norm': 6.527666091918945, 'learning_rate': 3.8094423537461516e-05, 'epoch': 0.26}
{'loss': 4.2473, 'grad_norm': 7.25576114654541, 'learning_rate': 3.638385220663702e-05, 'epoch': 0.3}
{'loss': 4.271, 'grad_norm': 5.7785844802856445, 'learning_rate': 3.467328087581252e-05, 'epoch': 0.33}
{'loss': 4.2067, 'grad_norm': 6.272899150848389, 'learning_rate': 3.296270954498803e-05, 'epoch': 0.36}
{'loss': 4.1692, 'grad_norm': 6.672155857086182, 'learning_rate': 3.1252138214163535e-05, 'epoch': 0.4}
{'loss': 4.1783, 'grad_norm': 6.597072601318359, 'learning_rate': 2.9541566883339038e-05, 'epoch': 0.43}
{'loss': 4.0809, 'grad_norm': 6.397854328155518, 'learning_rate': 2.7830995552514545e-05, 'epoch': 0.46}
{'loss': 4.1572, 'grad_norm': 6.906569480895996, 'learning_rate': 

TrainOutput(global_step=3023, training_loss=4.243638261043587, metrics={'train_runtime': 455.3723, 'train_samples_per_second': 106.183, 'train_steps_per_second': 6.639, 'total_flos': 0.0, 'train_loss': 4.243638261043587, 'epoch': 1.0})

In [10]:
evaluator(embedding_model)

{'pearson_cosine': 0.6755061908100044,
 'spearman_cosine': 0.6907158912611558,
 'pearson_manhattan': 0.6907605506750119,
 'spearman_manhattan': 0.6952479406521168,
 'pearson_euclidean': 0.6915512320822095,
 'spearman_euclidean': 0.6956817042966573,
 'pearson_dot': 0.5117403919131931,
 'spearman_dot': 0.49990595575508096,
 'pearson_max': 0.6915512320822095,
 'spearman_max': 0.6956817042966573}