# Sentence Transformers Encoder

This notebook can be used to fine-tune sentence transformers.

## Install the dependencies

In [1]:
%make install

UsageError: Line magic function `%make` not found.


## Importing the libraries

In [1]:
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.models import Transformer, Pooling, Normalize
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.datasets import NoDuplicatesDataLoader

## Load the data

In [2]:
from src.data.datasets import SquadV2DatasetBuilder

dataset_builder = SquadV2DatasetBuilder()

dataset = dataset_builder.make_encoder_dataset()
dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'query', 'query_id', 'document_id'],
        num_rows: 130319
    })
    test: Dataset({
        features: ['document', 'query', 'query_id', 'document_id'],
        num_rows: 11873
    })
})

## Setup experiment

Define the total number of epochs and the batch size to use during training.

NOTE: we run on GPUs and therefore a high batch size may lead to OOM errors.

In [4]:
num_epochs = 3
train_batch_size = 4

We can either fine-tune a foundational Transformer model (such as `mpnet-base`, or `distilbert-base-cased`), or a SentenceTransformer model (such as `all-mpnet-base-v2`).

In [5]:
# FINE TUNE A PRE-TRAINED S-BERT MODEL
model_ckpt = "sentence-transformers/all-mpnet-base-v2"
model_name = "all-mpnet-base-v2-finetuned-squad-v2"
output_path = f"models/{model_name}"
model = SentenceTransformer(model_ckpt)
print(model)

"""
# FINE TUNE A PRE-TRAINED BERT MODEL
model_ckpt = 'microsoft/mpnet-base'
output_path = "models/mpnet-base-squad-v2"
word_embedding_model = Transformer(model_ckpt)
pooling_model = Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)
normalize_model = Normalize()
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, normalize_model])
print(model)
"""

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)


'\n# FINE TUNE A PRE-TRAINED BERT MODEL\nmodel_ckpt = \'microsoft/mpnet-base\'\noutput_path = "models/mpnet-base-squad-v2"\nword_embedding_model = Transformer(model_ckpt)\npooling_model = Pooling(\n    word_embedding_model.get_word_embedding_dimension(),\n    pooling_mode_mean_tokens=True,\n    pooling_mode_cls_token=False,\n    pooling_mode_max_tokens=False\n)\nnormalize_model = Normalize()\nmodel = SentenceTransformer(modules=[word_embedding_model, pooling_model, normalize_model])\nprint(model)\n'

Prepare the training data, the evaluator, and the loss.

In [6]:
from collections import defaultdict
from src.data.data_models import FeatureNames

train_set = [
    InputExample(texts=[query, document])
    for query, document in zip(dataset["train"][FeatureNames.QUERY.value], dataset["train"][FeatureNames.DOCUMENT.value])
]
train_dataloader = NoDuplicatesDataLoader(train_set, batch_size=train_batch_size)

queries_dict = {q_id: query for q_id, query in zip(dataset["test"][FeatureNames.QUERY_ID.value], dataset["test"][FeatureNames.QUERY.value])}
documents_dict = {c_id: context for c_id, context in zip(dataset["test"][FeatureNames.DOCUMENT_ID.value], dataset["test"][FeatureNames.DOCUMENT.value])}
relevant_docs = defaultdict(list)
for q_id, c_id in zip(dataset["test"][FeatureNames.QUERY_ID.value], dataset["test"][FeatureNames.DOCUMENT_ID.value]):
    relevant_docs[q_id].append(c_id)

evaluator = InformationRetrievalEvaluator(
    queries=queries_dict, #qid => query
    corpus=documents_dict, #cid => doc
    relevant_docs=relevant_docs, #qid => Set[cid]
    show_progress_bar=True,
    name='eval',
)

loss = MultipleNegativesRankingLoss(model=model)
print(loss)

warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data
warmup_steps

MultipleNegativesRankingLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (cross_entropy_loss): CrossEntropyLoss()
)


9773

In [7]:
# How many evaluation steps should I define?
(len(train_set)) // train_batch_size

32579

## Run training

In [None]:
# Train the model
model.fit(
    train_objectives=[(train_dataloader, loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)

In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# FIXME: currently we can't push model to hub due to an issue in sentence-transformers lib
# https://github.com/UKPLab/sentence-transformers/pull/1868
# https://github.com/UKPLab/sentence-transformers/issues/1760

In [15]:
model.save_to_hub(
    repo_name=f"joaobarroca/{model_name}",
    train_datasets=dataset_builder.get_dataset_names(),
    #local_model_path=output_path,
    exist_ok=True,
)

Cloning https://huggingface.co/joaobarroca/all-mpnet-base-v2-finetuned-squad-v2 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/418M [00:00<?, ?B/s]

remote: [31m-------------------------------------------------------------------------[0m        
remote: [31mSorry, your push was rejected during YAML metadata verification: [0m        
remote: [31m- Error: "datasets[2]" with value "MS Marco" is not valid. It should not contain any whitespace. If possible, use a dataset id from the huggingface Hub.[0m        
remote: [31m-------------------------------------------------------------------------[0m        
remote: [32m-------------------------------------------------------------------------[0m        
remote: [32mPlease find the documentation at:[0m        
remote: [32mhttps://huggingface.co/docs/hub/model-cards#model-card-metadata[0m        
remote: [32m[0m        
remote: [32m-------------------------------------------------------------------------[0m        
To https://huggingface.co/joaobarroca/all-mpnet-base-v2-finetuned-squad-v2
 ! [remote rejected] main -> main (pre-receive hook declined)
error: failed to push so

OSError: remote: [31m-------------------------------------------------------------------------[0m        
remote: [31mSorry, your push was rejected during YAML metadata verification: [0m        
remote: [31m- Error: "datasets[2]" with value "MS Marco" is not valid. It should not contain any whitespace. If possible, use a dataset id from the huggingface Hub.[0m        
remote: [31m-------------------------------------------------------------------------[0m        
remote: [32m-------------------------------------------------------------------------[0m        
remote: [32mPlease find the documentation at:[0m        
remote: [32mhttps://huggingface.co/docs/hub/model-cards#model-card-metadata[0m        
remote: [32m[0m        
remote: [32m-------------------------------------------------------------------------[0m        
To https://huggingface.co/joaobarroca/all-mpnet-base-v2-finetuned-squad-v2
 ! [remote rejected] main -> main (pre-receive hook declined)
error: failed to push some refs to 'https://huggingface.co/joaobarroca/all-mpnet-base-v2-finetuned-squad-v2'
