In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ["WANDB_MODE"] = "disabled"

In [5]:
! pip install -U sentence-transformers accelerate datasets -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m119.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import random, json
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses

In [4]:
OUT_DIR = "/content/drive/MyDrive/rag"
CSV_PATH = f"{OUT_DIR}/rag_finetune_dataset.csv"
METADATA_PATH = f"{OUT_DIR}/metadata.json"
OUTPUT_PATH = f"{OUT_DIR}/bge-m3-reranker-finetuned"

RERANKER_MODEL = "BAAI/bge-reranker-base"
EPOCHS = 1
BATCH_SIZE = 4
LR = 2e-5

In [None]:
df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=["question", "chunk_idx", "source"])

with open(METADATA_PATH, "r", encoding="utf-8") as f:
    docs = json.load(f)

from collections import defaultdict
chunks_by_source = defaultdict(list)
for d in docs:
    chunks_by_source[d["metadata"]["source"]].append(d["text"])

train_set = []
all_chunks = []

for _, row in df.iterrows():
    source = row["source"]
    idx = int(row["chunk_idx"])

    if source not in chunks_by_source:
        continue
    if idx >= len(chunks_by_source[source]):
        continue

    pos_chunk = chunks_by_source[source][idx]

    train_set.append(
        InputExample(
            texts=[row["question"], pos_chunk],
            label=1.0
        )
    )
    all_chunks.append(pos_chunk)

for _, row in df.iterrows():
    source = row["source"]
    idx = int(row["chunk_idx"])

    pos_chunk = chunks_by_source[source][idx]

    neg_chunk = random.choice(all_chunks)
    if neg_chunk == pos_chunk:
        continue

    train_set.append(
        InputExample(
            texts=[row["question"], neg_chunk],
            label=0.0
        )
    )

print("Total reranker training pairs:", len(train_set))

Total reranker training pairs: 10365


In [None]:
train_dataloader = DataLoader(
    train_set,
    shuffle=True,
    batch_size=BATCH_SIZE
)

In [None]:
reranker = CrossEncoder(
    RERANKER_MODEL,
    num_labels=1,
    max_length=512
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
reranker.fit(
    train_dataloader=train_dataloader,
    epochs=EPOCHS,
    warmup_steps=100,
    optimizer_params={"lr": LR},
    show_progress_bar=True,
    output_path=OUTPUT_PATH
)

Token indices sequence length is longer than the specified maximum sequence length for this model (690 > 512). Running this sequence through the model will result in indexing errors
  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss
500,0.5907
1000,0.483
1500,0.4929
2000,0.4688
2500,0.4141


In [None]:
reranker.save(OUTPUT_PATH)