In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader

# Step 1: Load BGE model
# The module stack is assembled by hand so the pooling strategy is explicit.
# BGE checkpoints are published with [CLS] pooling followed by L2
# normalisation; mean pooling would fine-tune (and later serve) embeddings
# that differ from the ones the pretrained weights were optimised for.
model_name = "BAAI/bge-small-en"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="cls",
)
# Unit-length outputs, matching the released BGE sentence-transformers config.
normalize_model = models.Normalize()
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, normalize_model]
)

2025-05-07 21:27:17.268046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746653237.512925      73 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746653237.583421      73 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Step 2: Instruction prefix for BGE models (recommended by BAAI)
# This is the query-side instruction documented for the English v1 BGE
# checkpoints. It is prepended to queries only; passages are encoded as-is.
instruction = "Represent this sentence for searching relevant passages: "

# Step 3: Prepare training data (query, positive passage)
# In real use, load this from your dataset
train_pairs = [
    ("what is artificial intelligence", "Artificial intelligence is the simulation of human intelligence."),
    ("capital of France", "Paris is the capital of France."),
    ("what is machine learning", "Machine learning is a subset of artificial intelligence focused on data-driven models."),
    ("benefits of exercise", "Regular physical activity improves cardiovascular health and boosts mood."),
    ("python programming language", "Python is a popular programming language known for its readability."),
]

# Each example is [prefixed query, positive passage]; no explicit negatives
# are listed here.
train_examples = [
    InputExample(texts=[instruction + query_text, passage_text])
    for query_text, passage_text in train_pairs
]

In [None]:
# Display the first training example as a quick sanity check of its contents.
train_examples[0]

In [None]:
# Step 4: DataLoader
# drop_last=True discards a trailing partial batch. With 5 examples and
# batch_size=4 the remainder would be a single pair, and a batch of one offers
# no in-batch negatives for the ranking loss below to contrast against.
train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=4,
    drop_last=True,
)

# Step 5: Define the MultipleNegativesRankingLoss
# Only (query, positive) pairs are supplied; the other passages in the same
# batch serve as negatives, which is why the batch must hold several pairs.
train_loss = losses.MultipleNegativesRankingLoss(model)

# Step 6: Train the model

In [None]:
# Fine-tune on the single (dataloader, loss) objective and write the result to
# output_path so it can be reloaded from disk afterwards.
# `evaluation_steps` is deliberately not passed: no `evaluator` is given, so
# there is nothing to run at those steps. Supply both together if periodic
# evaluation is wanted.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=2,
    output_path="./bge-mnr-finetuned",
)

In [None]:
# Validation data: held-out (query, positive passage) pairs.
# NOTE(review): nothing in the cells visible here consumes `val_dataloader`
# or `val_loss_fn` -- confirm whether a later cell uses them.
val_pairs = [
    ("what is artificial intelligence", "AI simulates human intelligence."),
    ("capital of Germany", "Berlin is the capital of Germany."),
    ("benefits of sleep", "Sleep improves mental performance."),
    ("define programming", "Programming is the act of writing code."),
]
# Queries get the same instruction prefix as the training queries.
val_examples = [
    InputExample(texts=[instruction + query_text, passage_text])
    for query_text, passage_text in val_pairs
]

# Fixed order (no shuffling), two pairs per batch.
val_dataloader = DataLoader(val_examples, batch_size=2, shuffle=False)
# A separate loss instance bound to the current `model` object.
val_loss_fn = losses.MultipleNegativesRankingLoss(model)

In [None]:
# Step 7: Use the model (example)
import torch

# Load the model from the directory that training wrote to; this rebinds
# `model` to the reloaded copy.
model = SentenceTransformer("./bge-mnr-finetuned")

# The query carries the instruction prefix; the candidate documents do not.
query = instruction + "capital of France"
docs = [
    "Paris is the capital of France.",
    "Berlin is in Germany.",
    "Apples are fruits.",
]

# Compute embeddings and similarities
doc_embs = model.encode(docs, convert_to_tensor=True)
query_emb = model.encode(query, convert_to_tensor=True)

# One cosine score per document -- presumably the single query embedding is
# broadcast against the stacked document embeddings; verify shapes if changed.
cos_sim = torch.nn.functional.cosine_similarity(query_emb, doc_embs)
for position, doc in enumerate(docs):
    score = cos_sim[position]
    print(f"Doc: {doc} \nScore: {score.item():.4f}\n")