In [None]:
# Install the packages this notebook needs. sentence_transformers and datasets are imported
# in the cells below; accelerate is not imported directly here (presumably needed by the
# trainer; confirm before removing).
!pip install sentence_transformers datasets accelerate

# Baseline: evaluate the pretrained model

In [None]:
# Upgrade `datasets` to its newest release (-U); the first cell already installed the package.
!pip install -U datasets

In [None]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset

In [None]:
# Load the three Hub datasets: document chunks, questions, and question -> chunk links.
corpus_ds = load_dataset("khanglt0004/vietnamese_legal_chunks", split="train")
queries_ds = load_dataset("khanglt0004/questions", split="train")
relevant_docs_data = load_dataset("khanglt0004/links", split="train")

# Every id is normalised to str so the three dictionaries agree on key type.
# Query ids in particular must match the keys of `relevant_docs`, otherwise the
# lookups between the two (evaluation and training-pair construction) find nothing.
corpus = {  # cid => document text
    str(cid): text for cid, text in zip(corpus_ds["id"], corpus_ds["text"])
}
queries = {  # qid => question text
    str(qid): question
    for qid, question in zip(queries_ds["qid"], queries_ds["question"])
}

# qid => set of relevant cids
relevant_docs = {}
for qid, chunk_id in zip(relevant_docs_data["q_id"], relevant_docs_data["chunk_id"]):
    relevant_docs.setdefault(str(qid), set()).add(str(chunk_id))

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

# Load the Vietnamese bi-encoder checkpoint that the following cells evaluate and train.
model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder")

# Embedding sizes to evaluate, ordered from largest to smallest.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Data shared by every per-dimension evaluator.
ir_data = dict(queries=queries, corpus=corpus, relevant_docs=relevant_docs)

# One retrieval evaluator per size; embeddings are truncated to `size` before scoring.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        name=f"dim_{size}",
        truncate_dim=size,
        score_functions={"cosine": cos_sim},
        **ir_data,
    )
    for size in matryoshka_dimensions
]

# Chain the per-dimension evaluators into a single evaluator object.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [None]:
# Score `model` with every per-dimension evaluator, then list each metric with its value.
results = evaluator(model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

# Training

In [None]:
import pandas as pd
from datasets import Dataset

def prepare_training_dataset(queries, corpus, relevant_docs):
    """Build the (anchor, positive) pairs used for training.

    Each relevant document of each query contributes one row: the query text
    as ``anchor`` and the document text as ``positive``.

    Args:
        queries: Mapping of query id to query text.
        corpus: Mapping of document id to document text; ids are looked up
            after conversion with ``str``.
        relevant_docs: Mapping of query id to an iterable of relevant document ids.

    Returns:
        The result of ``Dataset.from_dict`` with the columns ``anchor`` and
        ``positive``, in the iteration order of ``relevant_docs``.

    Raises:
        KeyError: If a query id is missing from ``queries`` or a document id
            is missing from ``corpus``.
    """
    matched = [
        (queries[qid], corpus[str(cid)])
        for qid, cids in relevant_docs.items()
        for cid in cids
    ]
    columns = {
        "anchor": [question for question, _ in matched],
        "positive": [passage for _, passage in matched],
    }
    return Dataset.from_dict(columns)

# Build the training pairs from the three dictionaries and display the resulting dataset.
pairs = prepare_training_dataset(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)
pairs

In [None]:
# Show the first row to check it holds an "anchor" question and its "positive" passage.
pairs[0]

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes the loss is computed at, largest first.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# The ranking loss is the inner objective; MatryoshkaLoss wraps it for the listed sizes.
inner_train_loss = MultipleNegativesRankingLoss(model=model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [None]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Define the training arguments.
#
# Evaluation and checkpointing share one interval. With eval_strategy="steps" and no
# eval_steps, the trainer falls back to logging_steps, so the full five-dimension
# retrieval evaluation would run every 10 optimizer steps while checkpoints are only
# written every 500. load_best_model_at_end also needs the two intervals to line up.
EVAL_AND_SAVE_STEPS = 500

args = SentenceTransformerTrainingArguments(
    output_dir="sample",                        # checkpoints and the final model are written here
    num_train_epochs=1,                         # number of epochs
    per_device_train_batch_size=8,              # train batch size per device
    gradient_accumulation_steps=4,              # effective batch size of 8 * 4 = 32 per device
    per_device_eval_batch_size=4,               # evaluation batch size per device
    #gradient_checkpointing=True,
    warmup_ratio=0.1,                           # warm up over the first 10% of training
    learning_rate=2e-5,                         # peak learning rate
    lr_scheduler_type="cosine",                 # cosine learning-rate schedule
    optim="adamw_torch_fused",                  # fused AdamW optimizer
    #tf32=True,                                  # use tf32 precision
    bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="steps",                      # evaluate every `eval_steps` steps
    eval_steps=EVAL_AND_SAVE_STEPS,             # explicit, instead of inheriting logging_steps
    save_steps=EVAL_AND_SAVE_STEPS,             # checkpoint at the same steps that are evaluated
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,                # load the best checkpoint when training ends
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",  # ndcg@10 of the 768-dimension evaluator
)

In [None]:
from sentence_transformers import SentenceTransformerTrainer
# Wire the model, training arguments, loss, training pairs and evaluator into the trainer.
trainer_inputs = dict(
    model=model,
    args=args,
    loss=train_loss,
    train_dataset=pairs,
    evaluator=evaluator,
)
trainer = SentenceTransformerTrainer(**trainer_inputs)

In [None]:
# Run fine-tuning with the settings in `args`; checkpoints go to the output directory.
trainer.train()

# Write the trainer's current model to the output directory (with
# load_best_model_at_end=True in `args`, this is expected to be the best checkpoint).
trainer.save_model(args.output_dir)

# Re-evaluate the fine-tuned model

In [None]:
from sentence_transformers import SentenceTransformer
import torch
# Reload the saved model from the training output directory, on the GPU when one is available.
eval_device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model = SentenceTransformer(args.output_dir, device=eval_device)

# Run the same evaluators on the reloaded model and list each metric with its value.
results = evaluator(fine_tuned_model)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)

# Push model to hub

In [None]:
from huggingface_hub import login

import os

# Do not paste the access token into the notebook, where it can leak through commits or
# shared copies. Read it from the HF_TOKEN environment variable instead; when the variable
# is unset or empty, token=None makes login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None, add_to_git_credential=True)

In [None]:
# Push the trainer's current model to the hub under the name "ltk_embedding".
# Requires the login performed in the previous cell.
trainer.model.push_to_hub("ltk_embedding")