### Script for fine-tuning the multilingual e5 embedding model on German political data

The model is fine-tuned following this reference: https://www.sbert.net/docs/package_reference/losses.html#multiplenegativesrankingloss.

The fine-tuned model can be found at https://huggingface.co/jost/multilingual-e5-base-politics-de.

In [None]:
# Notebook shell command: install (or upgrade, via -U) the sentence-transformers package.
!pip install -U sentence-transformers

In [3]:
import json

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator

###### Load pre-trained checkpoint

In [None]:
# Hugging Face Hub id of the pre-trained checkpoint that is fine-tuned below.
model_id = "intfloat/multilingual-e5-base"

# Instantiate the SentenceTransformer wrapper for that checkpoint.
model = SentenceTransformer(model_name_or_path=model_id)

###### Set training and validation dataset paths

In [5]:
# Locations of the JSON files holding the training and validation splits.
# Each file is later read for its 'corpus', 'queries' and 'relevant_docs' entries.
TRAIN_DATASET_FPATH = "/content/drive/train_dataset.json"
VAL_DATASET_FPATH = "/content/drive/val_dataset.json"

###### Start training

In [1]:
# Training hyper-parameters: number of passes over the training pairs and
# number of (query, passage) pairs per batch.
EPOCHS = 1
BATCH_SIZE = 64

# The files are only read, so open them read-only ('r+' would additionally
# require write permission on the files). Decode explicitly as UTF-8 so that
# non-ASCII characters (e.g. German umlauts) do not depend on the platform's
# default encoding.
with open(TRAIN_DATASET_FPATH, 'r', encoding='utf-8') as f:
    train_dataset = json.load(f)

with open(VAL_DATASET_FPATH, 'r', encoding='utf-8') as f:
    val_dataset = json.load(f)

# --- Build the training objective -------------------------------------------
dataset = train_dataset

# 'corpus' maps node id -> passage text, 'queries' maps query id -> query text,
# and 'relevant_docs' maps query id -> list of relevant node ids.
corpus = dataset['corpus']
queries = dataset['queries']
relevant_docs = dataset['relevant_docs']

# One positive (query, passage) pair per query; only the first relevant node
# of each query is used.
# NOTE(review): the E5 model card asks for "query: " / "passage: " prefixes on
# inputs. No prefix is added here -- confirm the dataset texts already carry them.
examples = [
    InputExample(texts=[query, corpus[relevant_docs[query_id][0]]])
    for query_id, query in queries.items()
]

# MultipleNegativesRankingLoss treats the other passages of a batch as
# negatives for each query, so batches should be shuffled: in file order,
# neighbouring queries may be closely related (presumably generated from the
# same or adjacent passages -- verify) and would then act as false negatives.
loader = DataLoader(
    examples, shuffle=True, batch_size=BATCH_SIZE
)

loss = losses.MultipleNegativesRankingLoss(model)

# --- Evaluation set-up and training run -------------------------------------
# From here on, `dataset`, `corpus`, `queries` and `relevant_docs` refer to the
# validation split.
dataset = val_dataset
corpus, queries, relevant_docs = (
    dataset[key] for key in ('corpus', 'queries', 'relevant_docs')
)

# Retrieval evaluator built from the validation queries, corpus and relevance
# judgements; it is handed to `fit` below.
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

# Fine-tune the model on the training pairs with the ranking loss.
model.fit(
    train_objectives=[(loader, loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    show_progress_bar=True,
    output_path='/content/drive/MyDrive/',  # save the model in Google Drive
)