In [1]:
import os
import torch
from tqdm import tqdm
from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

from datasets import load_dataset, concatenate_datasets
from datasets import Dataset

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from transformers import T5ForConditionalGeneration, T5Tokenizer
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():
    print("CUDA is available.")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch will use CPU.")

CUDA is available.
Number of CUDA devices: 1
Current CUDA device name: NVIDIA GeForce RTX 4050 Laptop GPU


## Data Preparation

In [3]:
# --- Configuration ---
DOCUMENTS_FOLDER = "data/raw/"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
QUESTIONS_PER_CHUNK = 3

#### LLM Setup

In [4]:
load_dotenv()

True

In [5]:
try:
    print("Setting up LLM for question generation...")    
    # LLM
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=1,
        max_retries=2,
    )
    
except Exception as e:
    print(f"Error setting up the OpenAI model: {e}")
    print("Please ensure you have `langchain-openai` installed and your OPENAI_API_KEY is set.")
    exit()

Setting up LLM for question generation...


In [6]:
# Prompt 
label_prompt = PromptTemplate.from_template("""
You are an AI assistant tasked with generating a single, realistic question-answer pair based on a given document. The question should be something a user might naturally ask when seeking information contained in the document.

Given: {chunk}

Instructions:
1. Analyze the key topics, facts, and concepts in the given document, choose one to focus on.
2. Generate 3 similar questions that a user might ask to find the information in this document that does NOT contain any company name.
3. Use natural language and occasionally include typos or colloquialisms to mimic real user behavior in the question.
4. Ensure the question is semantically related to the document content WITHOUT directly copying phrases.
5. Make sure that all of the questions are similar to eachother. I.E. All asking about a similar topic/requesting the same information.

Output Format:
Return a JSON object with the following structure:
```json
{{
  "question_1": "Generated question text",
  "question_2": "Generated question text",
  ...
}}
```

Be creative, think like a curious user, and generate your 3 similar questions that would naturally lead to the given document in a semantic search. Ensure your response is a valid JSON object containing only the questions.

""")

In [7]:
def generate_questions(chunk: str) -> list:
    """
    Generates a specified number of questions for a given text chunk using an OpenAI model.
    """
    # Use the model to generate the questions

    label_chain = label_prompt | llm | JsonOutputParser()
    response = label_chain.invoke({"chunk":chunk})
    
    # The response content is a single string. We'll split it into a list of questions.
    questions = list(response.values())
    
    return questions

#### Documents processing

In [8]:
# 1. Load documents from the specified folder
loader = DirectoryLoader(DOCUMENTS_FOLDER, glob="*.txt",
                         loader_cls=lambda path: TextLoader(path, encoding="utf-8"))

try:
    documents = loader.load()
    if not documents:
        print(f"No '.txt' files found in the '{DOCUMENTS_FOLDER}' folder.")
    print(f"Successfully loaded {len(documents)} document(s).")
except Exception as e:
    print(f"An error occurred during document loading: {e}")

Successfully loaded 1 document(s).


In [9]:
# 2. Chunk the documents using the RecursiveCharacterTextSplitter
print("Splitting documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks.")

Splitting documents into chunks...
Created 191 chunks.


In [10]:
# 3. For each chunk, create 2-5 questions using the LLM
print("Generating questions for each chunk...")
dataset_list = []
for i, chunk in enumerate(tqdm(chunks)):
    try:
        questions = generate_questions(chunk.page_content)
        
        # 4. Store the questions and chunks in the desired format
        # Each chunk is paired with each of its generated questions
        for q in questions:
            dataset_list.append({
                "chunk_id": f"chunk_{i+1}",
                "chunk": chunk.page_content,
                "question": q
            })
    except Exception as e:
        print(f"\nSkipping chunk {i+1} due to an error during question generation: {e}")
        continue

Generating questions for each chunk...


 49%|██████████████████████████████████▌                                    | 93/191 [10:29<11:03,  6.77s/it]


KeyboardInterrupt: 

In [18]:
# Convert the list of dictionaries to a Hugging Face Dataset
print("\nCreating Hugging Face Dataset...")
if not dataset_list:
    print("No data was generated. The dataset will be empty.")

try:
    dataset = Dataset.from_list(dataset_list)
    # Add an id column to the dataset
    dataset = dataset.add_column("id", range(len(dataset)))
    
    print("\nDataset created successfully!")
    print(dataset)
    
    # You can save the dataset locally if you wish, for example:
    # dataset.save_to_disk("my_generated_dataset")
    # print("\nDataset saved to 'my_generated_dataset' folder.")

except Exception as e:
    print(f"An error occurred while creating the Hugging Face Dataset: {e}")


Creating Hugging Face Dataset...

Dataset created successfully!
Dataset({
    features: ['chunk_id', 'chunk', 'question', 'id'],
    num_rows: 279
})


In [19]:
# Shuffle Dataset
dataset = dataset.shuffle()

# Split Dataset Into a 90/10 Train/Test split
dataset = dataset.train_test_split(test_size=0.1)

# Save Datasets to Disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Creating json from Arrow format: 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 24.20ba/s]
Creating json from Arrow format: 100%|████████████████████████████████████████| 1/1 [00:00<00:00, 499.80ba/s]


13378

## Base Model Evaluation & Matryoshka Dimensions

In [3]:
# Hugging Face model ID
model_id = "sentence-transformers/all-MiniLM-L6-v2"

# Loading via SentenceTransformer
model = SentenceTransformer(
    model_id, device="cuda" if torch.cuda.is_available() else "cpu"
)

In [4]:
# Load train and test datasets from their respective JSON files
# These contain pairs of questions (anchors) and text chunks (positives)
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

# Combine train and test datasets into a single corpus
# This ensures we have all possible text chunks available for retrieval evaluation
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# Convert datasets into dictionary format required by the InformationRetrievalEvaluator
# corpus: maps corpus IDs to their text chunks (documents)
# Format: {corpus_id: text_chunk}
corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["chunk"])
)

# queries: maps query IDs to their questions
# Format: {query_id: question_text}
queries = dict(
    zip(test_dataset["id"], test_dataset["question"])
)

# Create a mapping between queries and their relevant documents
# This tells the evaluator which documents are correct matches for each query
relevant_docs = {}
for q_id, global_chunk_id in zip(test_dataset["id"], test_dataset["chunk_id"]):
    # Initialize empty list for each query if not already present
    if q_id not in relevant_docs:
        relevant_docs[q_id] = []

    # Find all corpus entries that share the same global_chunk_id
    # This handles cases where multiple questions can refer to the same text chunk
    matching_corpus_ids = [
        cid for cid, chunk in zip(corpus_dataset["id"], corpus_dataset["chunk_id"])
        if chunk == global_chunk_id
    ]
    # Add the matching corpus IDs to the relevant documents for this query
    relevant_docs[q_id].extend(matching_corpus_ids)

In [13]:
# Dimensions of interest
matryoshka_dimensions = [384] # Important: large to small

# Create empty list to hold evaluators
matryoshka_evaluators = []

# Create an evaluator for each above dimension
for dim in matryoshka_dimensions:
    # Define the evaluator
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to the respective dimension
        score_functions={"cosine": cos_sim},
        accuracy_at_k = [1, 3],
        precision_recall_at_k = [1, 3],
        mrr_at_k = [3],
        ndcg_at_k = [3],
        map_at_k = [3]
    )
    # Add to list
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
# Able to run all our dimension specific InformationRetrievalEvaluators sequentially.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [14]:
# Evaluate the model
base_results = evaluator(model)

# Print header
print("\nBase Model Evaluation Results")
print("-" * 85)
print(f"{'Metric':15} {'384d':>12}")
print("-" * 85)

# List of metrics to display
metrics = [
    'ndcg@3',
    'mrr@3',
    'map@3',
    'accuracy@1',
    'accuracy@3',
    'precision@1',
    'precision@3',
    'recall@1',
    'recall@3',
]

# Print each metric
for metric in metrics:
    values = []
    for dim in matryoshka_dimensions:
        key = f"dim_{dim}_cosine_{metric}"
        values.append(base_results[key])

    # Highlight NDCG@10
    metric_name = f"=={metric}==" if metric == "ndcg@3" else metric
    print(f"{metric_name:15}", end="  ")
    for val in values:
        print(f"{val:12.4f}", end=" ")
    print()

# Print sequential score
print("-" * 85)
print(f"{'seq_score:'} {base_results['sequential_score']:1f}")


Base Model Evaluation Results
-------------------------------------------------------------------------------------
Metric                  384d
-------------------------------------------------------------------------------------
==ndcg@3==             0.6786 
mrr@3                  0.6786 
map@3                  0.6786 
accuracy@1             0.6786 
accuracy@3             0.6786 
precision@1            0.6786 
precision@3            0.6786 
recall@1               0.2262 
recall@3               0.6786 
-------------------------------------------------------------------------------------
seq_score: 0.678571


### Training

In [15]:
# load model with SDPA for using Flash Attention 2
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="ModernBERT Embed base Legal Matryoshka",
    ),
)

In [16]:
# Initial Loss
base_loss = MultipleNegativesRankingLoss(model)

# Matryoshka Loss Wrapper
train_loss = MatryoshkaLoss(
    model, base_loss, matryoshka_dims=matryoshka_dimensions
)

In [27]:
# Training Arguments
args = SentenceTransformerTrainingArguments(
    output_dir="saved-model", # output directory and hugging face model ID
    num_train_epochs=4,                                        # number of epochs
    per_device_train_batch_size=32,                            # train batch size
    gradient_accumulation_steps=16,                            # for a global batch size of 512
    per_device_eval_batch_size=16,                             # evaluation batch size
    warmup_ratio=0.1,                                          # warmup ratio
    learning_rate=2e-5,                                        # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                                # use cosine learning rate scheduler
    optim="adamw_torch_fused",                                 # use fused adamw optimizer
    tf32=True,                                                 # use tf32 precision
    bf16=True,                                                 # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,                 # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                                     # evaluate after each epoch
    save_strategy="epoch",                                     # save after each epoch
    logging_steps=10,                                          # log every 10 steps
    save_total_limit=3,                                        # save only the last 3 models
    load_best_model_at_end=True,                               # load the best model when training ends
    metric_for_best_model="eval_dim_384_cosine_map@3",       # Optimizing for the best ndcg@10 score for the 128 dimension
    report_to="none"                                           # Turning off training logging for now, input 'wandb' etc. if desired.
)

In [28]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.select_columns(
        ["chunk", "question"]
    ),  # training dataset
    loss=train_loss,
    evaluator=evaluator,
)

In [29]:
# Start training
trainer.train()

# Save the best model based on our eval_dim_128_cosine_ndcg@10 criteria
trainer.save_model()

Column 'question' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['question', 'answer'])


Epoch,Training Loss,Validation Loss,Dim 384 Cosine Accuracy@1,Dim 384 Cosine Accuracy@3,Dim 384 Cosine Precision@1,Dim 384 Cosine Precision@3,Dim 384 Cosine Recall@1,Dim 384 Cosine Recall@3,Dim 384 Cosine Ndcg@3,Dim 384 Cosine Mrr@3,Dim 384 Cosine Map@3,Sequential Score
1,No log,No log,0.678571,0.678571,0.678571,0.678571,0.22619,0.678571,0.678571,0.678571,0.678571,0.678571
2,No log,No log,0.75,0.75,0.75,0.75,0.25,0.75,0.75,0.75,0.75,0.75
3,No log,No log,0.75,0.75,0.75,0.75,0.25,0.75,0.75,0.75,0.75,0.75
4,No log,No log,0.75,0.75,0.75,0.75,0.25,0.75,0.75,0.75,0.75,0.75


### Evaluating Trained Model

In [30]:
fine_tuned_model = SentenceTransformer(
    args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
)

# Evaluate the model
ft_results = evaluator(fine_tuned_model)

In [31]:
# Evaluate the model
# Print header
print("\nFine-tuned Model Evaluation Results")
print("-" * 85)
print(f"{'Metric':15} {'384d':>12}")
print("-" * 85)

# List of metrics to display
metrics = [
    'ndcg@3',
    'mrr@3',
    'map@3',
    'accuracy@1',
    'accuracy@3',
    'precision@1',
    'precision@3',
    'recall@1',
    'recall@3',
]

# Print each metric
for metric in metrics:
    values = []
    for dim in matryoshka_dimensions:
        key = f"dim_{dim}_cosine_{metric}"
        values.append(ft_results[key])

    # Highlight NDCG@10
    metric_name = f"=={metric}==" if metric == "ndcg@3" else metric
    print(f"{metric_name:15}", end="  ")
    for val in values:
        print(f"{val:12.4f}", end=" ")
    print()

# Print sequential score
print("-" * 85)
print(f"{'seq_score:'} {ft_results['sequential_score']:1f}")


Fine-tuned Model Evaluation Results
-------------------------------------------------------------------------------------
Metric                  384d
-------------------------------------------------------------------------------------
==ndcg@3==             0.7500 
mrr@3                  0.7500 
map@3                  0.7500 
accuracy@1             0.7500 
accuracy@3             0.7500 
precision@1            0.7500 
precision@3            0.7500 
recall@1               0.2500 
recall@3               0.7500 
-------------------------------------------------------------------------------------
seq_score: 0.750000
