# Dependencies and Configurations

In [None]:
%pip install -q sentence_transformers ollama

In [None]:
import os

# Turn off Weights & Biases logging before any training code runs.
os.environ['WANDB_MODE'] = 'disabled'

# Load HF_ACCESS_TOKEN from the environment; falls back to an empty string when unset.
# Set this locally: export HF_ACCESS_TOKEN="your_token_here"
# NOTE(review): the token is not referenced in the cells visible here - confirm it is still needed.
HF_ACCESS_TOKEN = os.environ.get('HF_ACCESS_TOKEN', '')

# Number of rows in the train / evaluation splits (absolute counts, not fractions).
TRAIN_SAMPLE_SIZE = 70
EVAL_SAMPLE_SIZE = 30

# Encoder and Retrieval (DPR) Configuration
DPR_MODEL_NAME = 'distilbert-base-uncased' # Base checkpoint the DPR encoder is fine-tuned from

DPR_BATCH_SIZE = 8 # Batch size for DPR training
DPR_NUM_EPOCHS = 1 # Number of epochs for DPR training
DPR_K = 5  # Number of passages to retrieve (NOTE(review): the runners below hard-code k=5 instead of reading this)

# Note: Using distilbert-base-uncased instead of all-MiniLM-L6-v2 to avoid data leakage,
# since all-MiniLM-L6-v2 was trained on msmarco data

# LLM Configuration
# Ollama model tag; used as the default model in call_ollama and pulled with ollama.pull.
LLM_MODEL_NAME = "qwen2.5:0.5b"


In [None]:
OLLAMA_HOST = 'http://localhost:11434'

# Utils

In [None]:
import textwrap

def text_wrapped(text, width=150):
    """Re-flow ``text`` into one string whose lines are at most ``width`` characters long."""
    wrapper = textwrap.TextWrapper(width=width)
    return wrapper.fill(text)

# Dataset

## Loading Data

In [None]:
from datasets import load_dataset

def extract_rag_sample(row):
    """
    Turn one MS MARCO row into a RAG-ready record.

    Passages flagged with ``is_selected == 1`` become positives, every other
    non-empty passage becomes a negative. Empty passage texts are ignored.

    Args:
        row (dict): A single example with 'query', optional 'answers', and a
            'passages' mapping holding 'passage_text' and 'is_selected'.
    Returns:
        dict | None: A dictionary with keys 'query', 'positives', 'negatives'
        and 'answers', or None when the query, the passage texts or the labels
        are missing/empty, or when no passage is marked as selected.
    """
    question = row.get("query")
    gold_answers = row.get("answers", [])
    passage_info = row.get("passages", {})

    passage_texts = passage_info.get("passage_text")
    selection_flags = passage_info.get("is_selected")

    # All three pieces are required to build a usable sample.
    if not (question and passage_texts and selection_flags):
        return None

    # Pair each text with its label once, discarding empty texts up front.
    labelled = [(text, flag) for text, flag in zip(passage_texts, selection_flags) if text]
    selected = [text for text, flag in labelled if flag == 1]
    rejected = [text for text, flag in labelled if not (flag == 1)]

    if not selected:
        return None

    return {
        "query": question,
        "positives": selected,
        "negatives": rejected,
        "answers": gold_answers
    }


def load_msmarco_dataset(sample_size=100_000):
    """
    Load the first `sample_size` rows of the MS MARCO v1.1 train split and
    convert them into RAG-ready samples.

    Rows for which `extract_rag_sample` returns None (missing query/passages or
    no selected passage) are dropped. The remaining rows are updated with the
    extracted 'query', 'positives', 'negatives' and 'answers' fields; the other
    original columns are kept.

    Args:
        sample_size (int): Number of leading rows of the train split to load
            (before unusable rows are dropped).
    Returns:
        The filtered and mapped dataset.
    """
    dataset = load_dataset(
        "microsoft/ms_marco",
        "v1.1",
        split=f"train[:{sample_size}]",
    )

    # Drop unusable rows *before* mapping. A filter placed after `.map` receives the
    # row dict (never None), so it cannot remove rows the extractor rejected, and
    # `.map` itself expects the function to return a dict for every row.
    usable = dataset.filter(lambda row: extract_rag_sample(row) is not None)

    return usable.map(extract_rag_sample)


In [None]:
# Load 100 rows more than the two splits need (presumably headroom for rows that
# turn out to be unusable - TODO confirm the margin is sufficient).
dataset = load_msmarco_dataset(sample_size=TRAIN_SAMPLE_SIZE + EVAL_SAMPLE_SIZE + 100)

# Fixed seed keeps the split reproducible; both sizes are absolute row counts,
# so rows beyond TRAIN_SAMPLE_SIZE + EVAL_SAMPLE_SIZE end up in neither split.
split_dataset = dataset.train_test_split(seed=42, test_size=EVAL_SAMPLE_SIZE, train_size=TRAIN_SAMPLE_SIZE)
train_ds = split_dataset['train']
eval_ds = split_dataset['test']

print(f"Training set size: {len(train_ds)}")
print(f"Evaluation set size: {len(eval_ds)}")

Training set size: 70
Evaluation set size: 30


In [None]:
# Keep only the first TRAIN_SAMPLE_SIZE / EVAL_SAMPLE_SIZE rows of each split.
# NOTE(review): the split above was already requested with these exact sizes, so this looks like a no-op safeguard.
train_ds = train_ds.select(range(TRAIN_SAMPLE_SIZE))
eval_ds = eval_ds.select(range(EVAL_SAMPLE_SIZE))

In [None]:
# Retrieval corpus: every positive and negative passage of every loaded row
# (train, eval and unused rows alike).
all_passages = (
    passage
    for row in dataset
    for passage in (*row["positives"], *row["negatives"])
)

# De-duplicate while keeping first-seen order. A set would also remove duplicates,
# but its iteration order for strings changes between kernel restarts, which makes
# corpus indices (and tie-breaking in retrieval) non-reproducible.
corpus = list(dict.fromkeys(all_passages))
print(f"Corpus size: {len(corpus)}")

Corpus size: 1594


# Implementation

## Prompts

In [None]:
def prompt_with_context(query, passages):
    """
    Build the context-grounded prompt sent to the LLM.

    Args:
        query (str): The user question.
        passages (list): Passage texts; they are joined with a blank line between them.
    Returns:
        str: The prompt, ending with an empty "Answer:" section for the model to complete.
    """
    prompt_lines = [
        "You are a helpful assistant. Considering the information provided in the following context, answer the question below.",
        "",
        "Context:",
        "\n\n".join(passages),
        "",
        "Question:",
        f"{query}",
        "",
        "Answer:",
        "",  # the prompt ends with a newline after "Answer:"
    ]
    return "\n".join(prompt_lines)


In [None]:
def prompt_no_context(query):
    """
    Build the context-free prompt: the system line, the question, and an
    empty "Answer:" section for the model to complete.

    Args:
        query (str): The user question.
    Returns:
        str: The prompt text.
    """
    prompt_lines = [
        "You are a helpful assistant.",
        "",
        "Question:",
        f"{query}",
        "",
        "Answer:",
        "",  # the prompt ends with a newline after "Answer:"
    ]
    return "\n".join(prompt_lines)

## Ollama Setup

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%####################################################               82.6%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
import threading
import subprocess
import time

def run_ollama_serve():
    """
    Spawn `ollama serve` as a child process.

    The call does not wait for the server process; it returns right after the
    process has been started.
    """
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

# Popen inside run_ollama_serve does not block, so this thread ends as soon as
# the server process has been spawned.
thread = threading.Thread(target=run_ollama_serve)
thread.start()

# Fixed pause before the following cells talk to the server.
# NOTE(review): this is not a readiness check - if the server needs more than
# 2 seconds to come up, the next cells may fail; consider polling instead.
time.sleep(2)


In [None]:
import ollama
from IPython.display import display, Markdown

def call_ollama(prompt, model=LLM_MODEL_NAME, verbose=False):
    """
    Send a single-turn user message to an Ollama model and return its reply text.

    Args:
        prompt (str): The prompt to send to the LLM.
        model (str): The name of the LLM model to use.
        verbose (bool): When True, render the prompt and the (wrapped) response as Markdown.

    Returns:
        str: The content of the LLM's response.
    """
    user_turn = {'role': 'user', 'content': prompt}
    reply = ollama.chat(model, messages=[user_turn])
    answer = reply['message']['content']

    if verbose:
        display(Markdown(f"**Prompt**:\n '{prompt}'"))
        display(Markdown(text_wrapped(f"**Ollama response:**\n '{answer}'")))

    return answer

In [None]:
ollama.pull(LLM_MODEL_NAME)

ProgressResponse(status='success', completed=None, total=None, digest=None)

Let's check if we can prompt the LLM:

In [None]:
call_ollama("hello, ollama!")

"Hello Ollama! I'm Qwen, an artificial intelligence language model designed to assist you with various tasks and inquiries. How can I help you today? Is there anything specific you would like to discuss or ask about?"

## RAG with Dense Passage Retrieval (DPR)

### Training Data

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random


def build_dpr_training_data(dataset, n_negatives=10, seed=None):
    """
    Build DPR training data with one positive and n_negatives negatives per query (1-1-N).

    Negatives are truncated or padded so every example carries the same number of texts:
    - When a row has at least n_negatives negatives, n_negatives of them are sampled
      without replacement.
    - When a row has fewer than n_negatives negatives, all of them are used and the
      remainder is sampled from them with replacement (duplicates).
    - When a row has no negatives at all (and n_negatives > 0) there is nothing to pad
      with, so the row is skipped instead of raising.

    The rows of `dataset` are never modified; negatives are copied before padding.

    Args:
        dataset: Iterable of rows with 'query', 'positives', and 'negatives' fields.
        n_negatives: Number of negatives to use per query.
        seed: Optional seed for reproducible sampling. When None, the global
            `random` module state is used.

    Returns:
        List of InputExample objects for training, each with texts
        [query, positive, negative_1, ..., negative_n].
    """
    # A dedicated generator keeps seeded runs reproducible without touching global state.
    rng = random.Random(seed) if seed is not None else random

    examples = []
    for row in dataset:
        pos = row["positives"][0]  # use only one positive passage
        negatives = list(row["negatives"])  # copy so padding never mutates the row

        if len(negatives) >= n_negatives:
            # Enough negatives: sample without replacement
            negatives = rng.sample(negatives, k=n_negatives)
        elif not negatives:
            # Nothing to sample the padding from; the row cannot be brought to a fixed length
            continue
        else:
            # Too few negatives: keep all of them and pad with duplicates
            padding = rng.choices(negatives, k=n_negatives - len(negatives))
            negatives.extend(padding)

        examples.append(InputExample(texts=[row["query"], pos, *negatives]))

    return examples


### Model Implementation and Training

In [None]:
from sentence_transformers import SentenceTransformer, models
import os

def build_dpr_model(model_name):
    """
    Assemble a sentence encoder: a transformer followed by mean pooling over tokens.

    Args:
        model_name (str): Name of the transformer model to use.
    Returns:
        SentenceTransformer: The constructed DPR model.
    """
    # Inputs longer than 256 tokens are cut off by the transformer module.
    encoder = models.Transformer(model_name, max_seq_length=256)

    embedding_dim = encoder.get_word_embedding_dimension()
    mean_pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)

    return SentenceTransformer(modules=[encoder, mean_pooler])


In [None]:
# Model & loss
# Build the encoder from the configured base checkpoint and attach the ranking
# loss to that same model instance, so training updates `retriever`.
retriever = build_dpr_model(DPR_MODEL_NAME)
train_loss = losses.MultipleNegativesRankingLoss(retriever)

In [None]:
# DataLoader
# n_negatives is left at its default here. drop_last=True discards a final
# batch that is smaller than DPR_BATCH_SIZE.
train_examples = build_dpr_training_data(train_ds)
train_loader = DataLoader(train_examples, batch_size=DPR_BATCH_SIZE, shuffle=True, drop_last=True)

In [None]:
# Fit
# Single training objective (loader + loss). Warm-up is 10% of the number of
# batches in one pass over train_loader, rounded down; "./dpr_retriever" is
# passed as the output path for the trained model.
retriever.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=DPR_NUM_EPOCHS,
    warmup_steps=int(len(train_loader) * 0.1),
    output_path="./dpr_retriever")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


### Encoding and Retrieval

In [None]:
def encode_corpus(corpus, model, batch_size=32):
    """
    Embed every passage of the corpus with the given model.

    Args:
        corpus: List of passage texts to encode.
        model: SentenceTransformer model to use for encoding.
        batch_size: Batch size for encoding.

    Returns:
        Tensor of encoded passage embeddings.
    """
    encode_options = {
        "batch_size": batch_size,
        "convert_to_tensor": True,   # return a tensor rather than a list/array
        "show_progress_bar": True,
    }
    return model.encode(corpus, **encode_options)

corpus_embeddings = encode_corpus(corpus, retriever)

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
import torch

def dpr_retrieve(query, corpus, corpus_embeddings, model, k=10, with_scores=False):
    """
    Retrieve top-k passages from the corpus for the given query using DPR.

    Passages are ranked by the dot product between their embedding and the
    query embedding, highest score first.

    Args:
        query (str): The input query.
        corpus (list): List of passage texts in the corpus.
        corpus_embeddings (Tensor): Precomputed embeddings of the corpus passages,
            in the same order as `corpus`.
        model: SentenceTransformer model used for encoding.
        k (int): Number of top passages to retrieve. Values larger than the corpus
            size are clamped to the corpus size; values below 0 are treated as 0.
        with_scores (bool): Whether to return scores along with passages.
    Returns:
        list: Top-k retrieved passages, or (passage, score) tuples if with_scores is True.
    """

    q_emb = model.encode(query, convert_to_tensor=True)
    scores = torch.matmul(corpus_embeddings, q_emb)

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = max(0, min(k, len(corpus), scores.shape[0]))
    top_k = torch.topk(scores, k=k)

    indices = top_k.indices.tolist()

    if not with_scores:
        return [corpus[i] for i in indices]

    values = top_k.values.tolist()
    return [(corpus[i], score) for i, score in zip(indices, values)]

### Evaluation with Mean Reciprocal Rank (MRR)

In [None]:
def reciprocal_rank(retrieved_passages, positive_set):
    """
    Compute the Reciprocal Rank (RR) of one ranked result list.

    Args:
        retrieved_passages (list): Ranked list of (passage, score) tuples, best first.
        positive_set (set): Set of positive passages for the query.
    Returns:
        float: 1 / rank of the first positive passage (ranks start at 1),
        or 0.0 when no retrieved passage is positive.
    """
    hit_ranks = (
        position
        for position, (text, _score) in enumerate(retrieved_passages, start=1)
        if text in positive_set
    )
    first_hit = next(hit_ranks, None)
    return 0.0 if first_hit is None else 1.0 / first_hit

def evaluate_mrr(dataset, retriever, corpus, k=10, n_samples=100, embeddings=None):
    """
    Evaluate Mean Reciprocal Rank (MRR) over the dataset.

    Args:
        dataset: Dataset with 'query' and 'positives' fields.
        retriever: SentenceTransformer model used for retrieval.
        corpus: List of passage texts in the corpus.
        k (int): Number of top passages to retrieve.
        n_samples (int): Maximum number of samples to evaluate. Clamped to the
            dataset size, so the default works on datasets with fewer rows.
        embeddings (Tensor | None): Precomputed embeddings of `corpus`, in the same
            order. When None, the notebook-level `corpus_embeddings` is used, which
            is only valid if it was computed from this `corpus` with this `retriever`.
    Returns:
        float: The Mean Reciprocal Rank (MRR) value; 0.0 when there is nothing to evaluate.
    """
    if embeddings is None:
        # Fall back to the embeddings cached at notebook level.
        embeddings = corpus_embeddings

    # Selecting past the end of the dataset raises, so never ask for more rows than exist.
    n_eval = max(0, min(n_samples, len(dataset)))
    if n_eval == 0:
        return 0.0

    rrs = []

    for sample in dataset.select(range(n_eval)):
        positives = set(sample["positives"])

        retrieved = dpr_retrieve(
            sample["query"],
            corpus,
            embeddings,
            retriever,
            k=k,
            with_scores=True
        )

        rrs.append(reciprocal_rank(retrieved, positives))

    return sum(rrs) / len(rrs)


Let's see how MRR behaves for different values of k:

In [None]:
# Same evaluation set, retriever and corpus for all three runs; only the cut-off k changes.
mrr_score_5 = evaluate_mrr(eval_ds, retriever, corpus, k=5, n_samples=EVAL_SAMPLE_SIZE)
print(f"MRR@5 over {EVAL_SAMPLE_SIZE} samples: {mrr_score_5:.4f}")

mrr_score_15 = evaluate_mrr(eval_ds, retriever, corpus, k=15, n_samples=EVAL_SAMPLE_SIZE)
print(f"MRR@15 over {EVAL_SAMPLE_SIZE} samples: {mrr_score_15:.4f}")

mrr_score_30 = evaluate_mrr(eval_ds, retriever, corpus, k=30, n_samples=EVAL_SAMPLE_SIZE)
print(f"MRR@30 over {EVAL_SAMPLE_SIZE} samples: {mrr_score_30:.4f}")

MRR@5 over 30 samples: 0.1961
MRR@15 over 30 samples: 0.2200
MRR@30 over 30 samples: 0.2264


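We can see that MRR increases as we increase k, the number of passages retrieved. Each query contributes the reciprocal rank of its first relevant passage within the top k, or 0 if no relevant passage appears there. Widening k increases the chance that a relevant passage is retrieved: a passage that was "missing" at k=5 might be found at, e.g., rank 12, which changes that query's score from 0 to 1/12. Since a larger k can only turn zeros into positive values and never changes the rank of a passage that was already found, MRR@k never decreases as k grows, and the gains get smaller because late hits contribute little.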

# (WIP) Custom Reranker

## Runners

In [None]:
def run_baseline(sample, verbose=False):
    """
    Ask the LLM the sample's question without any supporting context.

    Args:
        sample (dict): A sample containing the 'query'.
        verbose (bool): Whether to display prompt and response.

    Returns:
        str: The LLM's response.
    """
    return call_ollama(prompt_no_context(sample["query"]), verbose=verbose)


def run_oracle(sample, k=5, verbose=False):
    """
    Ask the LLM the sample's question with the ground-truth passages as context.

    Args:
        sample (dict): A sample containing the 'query' and 'positives'.
        k (int): Maximum number of ground-truth passages to use (the first k positives).
        verbose (bool): Whether to display prompt and response.

    Returns:
        str: The LLM's response.
    """
    gold_passages = sample["positives"][:k]
    question = sample["query"]
    return call_ollama(prompt_with_context(question, gold_passages), verbose=verbose)


def run_rag_dpr(sample, retriever, corpus, k=5, verbose=False):
    """
    Answer the sample's question with passages retrieved by DPR as context.

    Retrieval uses the notebook-level `corpus_embeddings`, which must
    correspond to `corpus`.

    Args:
        sample (dict): A sample containing the 'query'.
        retriever: The DPR retriever model.
        corpus (list): The passage corpus.
        k (int): Number of passages to retrieve.
        verbose (bool): Whether to display prompt and response.

    Returns:
        str: The LLM's response.
    """
    question = sample["query"]
    top_passages = dpr_retrieve(question, corpus, corpus_embeddings, retriever, k=k)
    return call_ollama(prompt_with_context(sample["query"], top_passages), verbose=verbose)


def run_rag_dpr_reranker(sample):
    """
    Placeholder for the RAG runner that adds a reranking step (work in progress).

    Args:
        sample (dict): A sample; presumably containing the 'query' like the
            other runners - TODO confirm once implemented.

    Raises:
        NotImplementedError: Always; the reranker is not implemented yet.
    """
    raise NotImplementedError


# Full Pipeline

## Run with single sample

First, let's check that each approach works on a single sample.

In [None]:
sample = train_ds[2]

In [None]:
print("=== BASELINE ===")
response_baseline = run_baseline(sample, verbose=True)

print("\n=== ORACLE ===")
response_upperline = run_oracle(sample, verbose=True)

print("\n=== RAG DPR ===")
response_rag_dpr = run_rag_dpr(sample, retriever, corpus, verbose=True)

=== BASELINE ===


**Prompt**:
 'You are a helpful assistant.

Question:
cortical functions of the brain

Answer:
'

**Ollama response:**  'Cortical functions of the brain refer to the functions that are primarily associated with neural structures in the cortex,
which is the outermost layer of the cerebral hemispheres (left and right sides). Cerebral cortex is responsible for processing sensory information,
controlling movement, and controlling motor responses. Some key features of cortical functions include:  1. Processing visual information: The primary
function of the visual cortex is to process visual stimuli from the outside world.  2. Motor control: The somatosensory cortex plays a crucial role in
regulating muscle movements based on sensory feedback from the body.  3. Language processing: Broca's area and Wernicke's area are involved in
language production and comprehension, respectively.  4. Emotional regulation: Areas like the anterior cingulate cortex and amygdala play important
roles in emotional responses and processing.  5. Motor learning: The basal ganglia, particularly the caudate nucleus, is linked to cognitive control
through the striatum and the medial prefrontal cortex.  6. Pain perception: The pain threshold region of the parieto-occipital cortex controls our
perception of pain.  7. Sensory integration: The primary somatosensory cortex in humans assists in integrating sensory information from different
parts of the body, which is essential for fine motor skills and spatial orientation.  8. Visual field processing: Damage to the occipito-temporal
cortex can affect visual processing abilities.  9. Pain perception: Many areas of the cerebral cortex are involved in pain sensations, including the
primary somatosensory cortex and the brainstem.  10. Sensory integration: The parieto-occipital cortex is responsible for integrating sensory
information from different parts of the body to form a coherent perception.  Cortical functions are further subdivided into several subcategories such
as language areas, motor regions, visual fields, pain processing, and sensory integration. Understanding these cortical functions can provide valuable
insights into brain function and how various aspects of cognition and behavior interact with each other.'


=== ORACLE ===


**Prompt**:
 'You are a helpful assistant. Considering the information provided in the following context, answer the question below.

Context:
Cerebral Cortex: The cerebral cortex covers the outer portion (1.5mm to 5mm) of the cerebrum. It is the layer of the brain often referred to as gray matter. The cortex (thin layer of tissue) is gray because nerves in this area lack the insulation that makes most other parts of the brain appear to be white. Most information processing occurs in the cerebral cortex. The cerebral cortex is divided into lobes that each have a specific function. Function: The cerebral cortex is involved in several functions of the body including: 1  Determining Intelligence. 2  Determining Personality. 3  Motor Function. 4  Planning and Organization.

Question:
cortical functions of the brain

Answer:
'

**Ollama response:**  'Based on the provided context, the cortical functions of the brain include:  1. Determining Intelligence. 2. Determining
Personality. 3. Motor Function. 4. Planning and Organization.  These functions are often referred to as "cortex-related functions" or "cortex-related
processes," indicating their involvement in how the brain processes information related to these cognitive domains.'


=== RAG DPR ===


**Prompt**:
 'You are a helpful assistant. Considering the information provided in the following context, answer the question below.

Context:
Introduction. The forebrain (proencephalon) is the largest part of the brain, most of which is cerebrum. Other important structures found in the forebrain include the thalamus, the hypothalamus and the limbic system. Limbic System. The Limbic system is made up of parts of the brain bordering the corpus collosum. The Limbic system contains areas of cerebral cortex, the cingulate gyrus (dorsally), the parahippocampus gyrus (ventrally), the amygdala, parts of the hypothalamus (mamillary body) and the hippocampus.

Dopamine production. Dopamine is produced in several areas of the brain, including the substantia nigra and the ventral tegmental area. It is a neurohormone that is released by the hypothalamus.

The subcortical structures of the forebrain are located deeper in the brain and include the basal ganglia, hippocampus and amygdala. The basal ganglia, groups of nerve cells, are responsible for the coordination of movement. Both the hippocampus and amygdala are parts of the limbic system. The cerebrum includes the cerebral cortex, which is made up of four lobes: frontal, temporal, parietal and occipital. Imagining, planning and reasoning are some of the functions of the frontal lobe. Hearing and smelling, as well as forming memories and retrieving them later, are functions of the temporal lobes.

The brain is made of three main parts: the forebrain, midbrain, and hindbrain. The forebrain consists of the cerebrum, thalamus, and hypothalamus (part of the limbic system). The midbrain consists of the tectum and tegmentum. The hindbrain is made of the cerebellum, pons and medulla. Often the midbrain, pons, and medulla are referred to together as the brainstem

The Cerebrum: The cerebrum or cortex is the largest part of the human brain, associated with higher brain function such as thought and action. The cerebral cortex is divided into four sections, called lobes: the frontal lobe, parietal lobe, occipital lobe, and temporal lobe. The forebrain consists of the cerebrum, thalamus, and hypothalamus (part of the limbic system). The midbrain consists of the tectum and tegmentum. The hindbrain is made of the cerebellum, pons and medulla.

Question:
cortical functions of the brain

Answer:
'

**Ollama response:**  'The corticofugal function refers to the cerebral cortex being associated with higher brain function such as thought and action.
It is made up of four sections known as lobes: the frontal lobe, parietal lobe, occipital lobe, and temporal lobe.  The hypothalamus plays a
significant role in regulating various bodily functions, including temperature, sleep patterns, hunger, thirst, sexual arousal, body temperature
regulation, water intake, etc. It also affects emotions and behavior. Additionally, the thalamus is involved in processing sensory information from
the external environment, such as sound, touch, taste, sight, pain, etc., to form cognitive and motor responses.  The limbic system is a complex
network of brain structures that includes parts responsible for storing and regulating emotional memories, behaviors, feelings, emotions, arousal,
perception, learning, motivation, and social interactions. The amygdala, part of the hippocampus and hypothalamus, plays important roles in processing
negative stimuli (such as fear) and managing stress responses.  The basal ganglia are a group of nuclei located within the cerebrum that play a
crucial role in motor control, behavior modification and cognitive functions. They also play a significant role in regulating attention, motivation,
learning, social behaviors, reward, reinforcement, decision making and sensory processing.'

## (WIP) Full Pipeline

In [None]:
import pandas as pd

def evaluate_methods(
    dataset,
    retriever,
    corpus,
    n_samples=100
):
    """
    Run the baseline, RAG-DPR and oracle pipelines on the first samples of a
    dataset and collect their answers side by side.

    Args:
        dataset: Dataset with 'query', 'answers' and 'positives' fields.
        retriever: The DPR retriever model.
        corpus (list): The passage corpus.
        n_samples (int): Maximum number of samples to run. Clamped to the dataset
            size, so the default works on datasets with fewer rows.

    Returns:
        pandas.DataFrame: One row per sample with columns 'query', 'gold_answers',
        'baseline', 'rag_dpr' and 'upperline' (the oracle answer).
    """
    # Selecting past the end of the dataset raises, so never ask for more rows than exist.
    n_eval = max(0, min(n_samples, len(dataset)))

    results = []

    for sample in dataset.select(range(n_eval)):
        gold_answers = sample["answers"]

        baseline_out = run_baseline(sample)
        dpr_out = run_rag_dpr(sample, retriever, corpus)
        upper_out = run_oracle(sample)

        results.append({
            "query": sample["query"],
            "gold_answers": gold_answers,
            "baseline": baseline_out,
            "rag_dpr": dpr_out,
            "upperline": upper_out
        })

    # Build the frame once from the collected records.
    return pd.DataFrame(results)


In [None]:
EVAL_SAMPLE_SIZE

In [None]:
results = evaluate_methods(
    dataset=eval_ds,
    retriever=retriever,
    corpus=corpus,
    n_samples=10
)

In [None]:
results.to_csv("evaluation_results.csv", index=False)