In [3]:
!pip install datasets
import os
os.environ["WANDB_DISABLED"] = "true"



***Training Embedding Model from Scratch***

In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

In [5]:
# Take the first 50,000 rows of the GLUE MNLI training split and drop the
# "idx" column in a single chained expression.
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Display the row at index 2 to eyeball the columns left after dropping "idx".
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

In [7]:
# Wrap the "bert-base-uncased" checkpoint as a SentenceTransformer model.
embedding_model = SentenceTransformer("bert-base-uncased")



In [8]:
# SoftmaxLoss configured with the model's own embedding size and three labels
# (the MNLI rows loaded above carry an integer "label" column).
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3,
)

In [9]:
# The STS-B validation split is the held-out benchmark for the trained model.
val_sts = load_dataset("glue", "stsb", split="validation")

# Gold labels are divided by 5 before being passed as scores
# (presumably rescaling a 0-5 range to 0-1 -- confirm); cosine similarity is
# selected as the main similarity.
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[raw_score / 5 for raw_score in val_sts["label"]],
    main_similarity="cosine",
)

In [10]:
# Training configuration for the SoftmaxLoss run: one epoch, batches of 32,
# 100 warmup steps, mixed-precision (fp16) training, logging every 100 steps.
# NOTE(review): eval_steps is presumably ignored unless an evaluation strategy
# of "steps" is also configured -- confirm whether in-training evaluation is wanted.
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    # Disable reporting integrations explicitly so the run does not depend on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Combine the model, loss, training data, evaluator and arguments defined in
# the preceding cells into a trainer, then run training.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
    args=args,
)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss


In [None]:
# Run the STS-B evaluator on the trained model; the cell displays its result.
evaluator(embedding_model)

***Cosine similarity loss function***

In [None]:
from datasets import Dataset, load_dataset

In [None]:
# Reload the same 50,000-row MNLI training subset without its "idx" column,
# so this section starts from the raw premise/hypothesis/label rows.
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

In [None]:
# Collapse the three MNLI classes into a binary float target:
# label 0 -> 1.0, labels 1 and 2 -> 0.0
# (in GLUE MNLI, 0 presumably denotes entailment -- confirm).
mapping = {0: 1, 1: 0, 2: 0}

# NOTE: this rebinds `train_dataset` to a dataset without "premise" /
# "hypothesis" columns, so the loading cell must be re-run before this cell
# can be executed a second time.
train_dataset = Dataset.from_dict(
    {
        "sentence1": train_dataset["premise"],
        "sentence2": train_dataset["hypothesis"],
        "label": [float(mapping[original]) for original in train_dataset["label"]],
    }
)

In [None]:
# STS-B validation pairs, used to score the cosine-loss model.
val_sts = load_dataset("glue", "stsb", split="validation")

# Each gold label is divided by 5 before being handed to the evaluator.
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[gold / 5 for gold in val_sts["label"]],
    main_similarity="cosine",
)

In [None]:
# Start again from a fresh "bert-base-uncased" model so this run does not
# reuse the model object trained in the previous section.
embedding_model = SentenceTransformer("bert-base-uncased")

# Cosine-similarity loss bound to that model; the dataset built above supplies
# sentence pairs with a float "label" column.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

In [None]:
# Training configuration for the CosineSimilarityLoss run: one epoch, batches
# of 32, 100 warmup steps, mixed-precision (fp16) training, logging every 100 steps.
# NOTE(review): eval_steps is presumably ignored unless an evaluation strategy
# of "steps" is also configured -- confirm whether in-training evaluation is wanted.
args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    # Disable reporting integrations explicitly so the run does not depend on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

In [None]:
# Trainer for the cosine-similarity run, assembled from the objects defined in
# the cells above.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
    args=args,
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Run the STS-B evaluator on the cosine-loss model; the cell displays its result.
evaluator(embedding_model)

***Supervised fine-tuning***

In [None]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [None]:
# MultipleNegativesRankingLoss (used below) treats the first two columns of
# every row as an (anchor, positive) pair and does not use labels. Feeding it
# raw MNLI rows would therefore train neutral and contradiction pairs as if
# they were positives. Keep only the rows labelled 0 (entailment in GLUE MNLI)
# and drop the "idx" and "label" columns so exactly (premise, hypothesis)
# remains, in that order.
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .filter(lambda row: row["label"] == 0)
    .remove_columns(["idx", "label"])
)

In [None]:
# STS-B validation pairs, used to score the fine-tuned model.
val_sts = load_dataset("glue", "stsb", split="validation")

# Gold labels are divided by 5 before being passed as reference scores;
# cosine similarity is selected as the main similarity.
evaluator = EmbeddingSimilarityEvaluator(
    main_similarity="cosine",
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[reference / 5 for reference in val_sts["label"]],
)

In [None]:
# Load the "sentence-transformers/all-MiniLM-L6-v2" checkpoint as the starting
# point for this section, instead of the "bert-base-uncased" model used above.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Training objective for the fine-tuning run, bound to the model loaded above.
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

In [None]:
# Training configuration for the fine-tuning run: one epoch, batches of 32,
# 100 warmup steps, mixed-precision (fp16) training, logging every 100 steps.
# NOTE(review): eval_steps is presumably ignored unless an evaluation strategy
# of "steps" is also configured -- confirm whether in-training evaluation is wanted.
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    # Disable reporting integrations explicitly so the run does not depend on
    # the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

In [None]:
# Trainer for the fine-tuning run: the model, its loss, the MNLI training
# data, the STS-B evaluator and the arguments from the cells above.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
    args=args,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Run the STS-B evaluator on the fine-tuned model; the cell displays its result.
evaluator(embedding_model)