# Lets join all together: retrieval system, fine-tuned model and a query pipeline.

In [2]:
import json
from typing import List, Tuple, Callable, Any
import random

import faiss
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

In [3]:
class FaissStorage:
    """
    FaissStorage is a concrete implementation of the Storage abstract base class.
    It uses the Faiss library to store and query vectors efficiently.
    """

    def __init__(
            self,
            dimension: int = 800,
            index = None,
    ):
        """
        Initializes the FaissStorage with a specified vector dimension.

        Args:
            dimension: The dimensionality of the vectors to be stored.
        """
        self.dimension = dimension
        if index is None:
            index = faiss.IndexFlatL2(dimension)
        self.index = index

    def store(self, key: str, data):
        """
        Stores a vector associated with a given key.

        Args:
            key: A unique identifier for the vector (e.g., a string).
            data: The vector to be stored. Must be a list of floats with length equal to 'dimension'.
        """
        if len(data[0]) != self.dimension:
            raise ValueError(f"Data must have {self.dimension} dimensions.")

        # Convert the data to a numpy array and ensure it's in the correct format
        data = np.array(data, dtype='float32')

        self.index.add(data)  # Add the vector to the index

    def query(self, key: str, k: int) -> (List, List):
        """
        Retrieves the vector associated with a given key.

        Args:
            key: The identifier of the vector to retrieve.

        Returns:
            A tuple containing the distances and indices of the nearest vectors.
        """
        # In this simple implementation, we don't actually use the key for retrieval.
        # In a real-world scenario, you would need to maintain a mapping from keys to indices.

        # For demonstration purposes, we'll return the first vector in the index
        if self.index.ntotal == 0:
            return None

        distances, indices = self.index.search(np.array([[0] * self.dimension], dtype='float32'), k)

        return distances[0].tolist(), indices[0].tolist()

    def export(self, file_path: str):
        """
        Exports the Faiss index to a file.

        Args:
            file_path: The path where the index will be saved.
        """
        faiss.write_index(self.index, file_path)

    def load(self, file_path: str):
        """
        Loads a Faiss index from a file.

        Args:
            file_path: The path from where the index will be loaded.
        """
        self.index = faiss.read_index(file_path)

# Load original dataset
As faiss index is built on top of a dataset, (without store the dataset contents in the index) we need to load the original dataset to be able to query it.

In [4]:
class FromJsonDataset(Dataset):
    def __init__(self, json_file):
        self.raw_content = ""
        with open(json_file, "r") as f:
            self.raw_content = f.read()

        self.data = json.loads(self.raw_content)

    def __len__(self):
        return len(self.data["id"])

    def __getitem__(self, idx: int):
        return  {
            "title": self.data["title"][idx],
            "content": self.data["content"][idx],
            "contents": self.data["contents"][idx],
            "PMID": self.data["PMID"][idx],
            "id": self.data["id"][idx],
        }

class ContentsDataset(Dataset):
    def __init__(self, dataset: FromJsonDataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        item = self.dataset[idx]
        return item["contents"]

In [5]:
STORAGE_FILE_PATH = "./outputs/pubmed_500K.index"
storage = FaissStorage()
storage.load(STORAGE_FILE_PATH)

In [6]:
augmented_data = FromJsonDataset(json_file="./data/pubmed_500K.json")
augmented_data = ContentsDataset(augmented_data)

In [7]:
augmented_data[0]  # Check the first item in the dataset

"[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)]. (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost."

In [8]:
storage.index.ntotal  # Check how many vectors are stored in the index

490001

In [9]:
def build_tokenizer_function(
        tokenizer
) -> callable:
    return lambda query: tokenizer(
            query,
            max_length=800,
            truncation=True,
            padding="max_length",
            add_special_tokens=False,
            return_tensors="np",
        )["input_ids"].astype("float32")


querier_type = Callable[[str, int], Tuple[List, List]]

def build_querier(
        storage: FaissStorage,
        original_dataset: Dataset,
        tokenizer_fn: callable,
) -> querier_type:
    """
    Build a query function that can be used to query the storage system.

    Args:
        storage (Storage): The storage system to use for querying.
        original_dataset (Dataset): The original dataset to use for querying.
        tokenizer_fn (callable): The function to use for tokenizing the query.

    Returns:
        callable: A function that takes a query string and returns the results.
    """
    def querier(query: str, k: int = 10) -> (List, List):
        return perform_query(storage, original_dataset, tokenizer_fn, query, k)

    return querier

def perform_query(
        storage: FaissStorage,
        original_dataset: Dataset,
        tokenizer_fn: callable,
        query: str,
        k: int,
) -> (List, List):
    query_vector = tokenizer_fn(query)
    distances, indices = storage.query(query_vector, k)

    data = []
    for index in indices:
        data.append(original_dataset[index])

    return distances, data



In [10]:
def load_tokenizer():
    """
    Load a Llama model from Hugging Face Hub.
    :param model_name: The name of the model to load.
    :return: The loaded model.
    """
    # Load the model
    model = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    model.pad_token = model.eos_token
    model.pad_token_id = model.eos_token_id

    return model

rag_tokenizer = load_tokenizer()

In [11]:
tokenizer_fn = build_tokenizer_function(rag_tokenizer)

querier = build_querier(storage, augmented_data, tokenizer_fn)

In [12]:
# test the querier

question = "What is the role of IL-6 in the immune response?"
querier(question, k=5)  # Get the top 5 results for the question

([171539365888.0,
  224323829760.0,
  226616164352.0,
  232985919488.0,
  234069327872.0],
 ["3,3'-Diiodothyronine production, a major pathway of peripheral iodothyronine metabolism in man. 3,3'-Diiodothyronine (3,3'-T(2)) has been detected in human serum and in thyroglobulin. However, no quantitative assessment of its clearance rate (CR), production rate (PR), or of the importance of extrathyroidal sources of 3,3'-T(2) relative to direct thyroidal secretion is yet available. This study examines these parameters in seven euthyroid subjects, and in eight athyreotic subjects (H) eumetabolic due to thyroxine therapy (HT(4)) (n = 5) or triiodothyronine replacement (HT(3)) (n = 3). A highly specific radioimmunoassay for the measurement of 3,3'-T(2) in whole serum was developed. Serum 3,3'-T(2) concentrations were (mean +/- SD) 6.0+/-1.0 ng/100 ml in 13 normal subjects, 9.0+/-4.6 ng/100 ml in 25 hyperthyroid patients, and 2.7+/-1.1 ng/100 ml in 17 hypothyroid patients. The values in each of 

# Lets load the model and the tokenizer from pretrained

In [13]:
# Load the PEFT config to find the base model
peft_model_path = "./outputs/fine-tuning/trainer"
peft_config = PeftConfig.from_pretrained(peft_model_path)

# Load the base model (this must match the model used during fine-tuning)
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=True,  # Load the model in 8-bit mode for efficiency
)

# Load the PEFT model
model = PeftModel.from_pretrained(base_model, peft_model_path)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./outputs/fine-tuning/tokenizer")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# How are we going to use the model for predict A, B, C or D?

Nice, we are going to perform a forward pass on the model, and pick the most likely answer based on the logits returned by the model. The model will return a probability distribution over the possible answers, and we will pick the one with the highest probability.

In [14]:
ForwardType = Callable[[str, List[str]], Any]

def enhanced_forward(
        llm,
        tokenizer,
        augmenter: Callable[[str, int], Tuple[List[int], List[str]]],
        k_augmentations: int,
        prompt_builder: Callable[[str, List[str], List[str]], str],
        question: str,
        options: List[str],
        device: str,
        num_iterations: int = 1,
):
    """
    Performs multiple forward passes, appending the most probable token each time,
    and returns average probabilities across all tokens.

    Args:
        llm: The language model to use for generating the response.
        tokenizer: The tokenizer associated with the language model.
        augmenter (Callable): A function that takes a query string and returns the first k_augmentations in a tuple of
            distances and items.
        k_augmentations (int): The number of augmentations to generate.
        prompt_builder (Callable): A function that builds the prompt for the language model.
            It takes the augmented information, question and options, and returns a formatted string.
        question (str): The question to ask.
        options (List[str]): The list of options to choose from.
        device (str): The device to run the model on ('cpu' or 'cuda').
        num_iterations (int): Number of forward passes to perform (default: 3).

    Returns:
        Tuple containing:
            - List of generated tokens
            - Average probabilities tensor across all tokens
    """
    llm.eval()
    # Get augmented items
    _, items = augmenter(question, k_augmentations)

    # Generate the initial prompt
    prompt = prompt_builder(question, options, items)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True).to(device)

    generated_token_ids =  input_ids.clone()
    probss = []
    generated_tokens = []

    with torch.no_grad():
        for i in range(num_iterations):
            # Prepare model inputs
            # For most autoregressive models, only input_ids are strictly necessary for the forward pass
            # if attention_mask is not explicitly handled or modified in the loop.
            # However, it's good practice to pass it if the model uses it.
            # As we append tokens, the attention_mask also needs to be extended.
            #attention_mask = torch.ones_like(generated_token_ids).to(device)

            # Get model outputs
            with torch.no_grad():
                outputs = llm(generated_token_ids)

            # Get the logits for the last token in the sequence
            # outputs.logits is typically of shape (batch_size, sequence_length, vocab_size)
            next_token_logits = outputs.logits[:, -1, :] # Get logits for the very last token

            # Apply softmax to get probabilities (optional, as argmax works on logits directly)
            probs = torch.softmax(next_token_logits, dim=-1)
            probss.append(probs)
            # Get the predicted token ID (greedy decoding)
            next_token_id = torch.argmax(probs, dim=-1).unsqueeze(-1)

            # Append the predicted token ID to the generated sequence
            generated_token_ids = torch.cat([generated_token_ids, next_token_id], dim=1)

            # Check if the generated token is an end-of-sequence (EOS) token
            if tokenizer.eos_token_id is not None and next_token_id.item() == tokenizer.eos_token_id:
                break

            # Append the generated token ID to the list
            generated_tokens.append(next_token_id.item())

    # Concatenate all probabilities
    probs = torch.cat(probss, dim=0)
    # Average the probabilities across all iterations
    avg_probs = torch.mean(probs, dim=0)

    return generated_tokens, avg_probs

def build_enhanced_forwarder(
    llm,
    tokenizer,
    augmenter: Callable[[str, int], Tuple[List[int], List[str]]],
    k_augmentations: int,
    prompt_builder: Callable[[str, List[str], List[str]], str],
    num_iterations: int,
    device: str,
) -> Callable[[str, List[str]], Tuple[List[int], torch.Tensor]]:
    """
    Builds an enhanced forward function that can be used to generate responses from the language model.

    Returns:
        Callable: A function that takes a question and a list of options and returns the generated response.
    """
    def forward_fn(question: str, options: List[str]) -> Tuple[List[int], torch.Tensor]:
        return enhanced_forward(
            llm=llm,
            tokenizer=tokenizer,
            augmenter=augmenter,
            k_augmentations=k_augmentations,
            prompt_builder=prompt_builder,
            question=question,
            options=options,
            num_iterations=num_iterations,
            device=device,
        )

    return forward_fn

def _from_logits(
        tokenizer,
        logits: torch.Tensor,
        options: list[int],
) -> int:
    """
    Picks an option from a list of options based on the given logits.

    Args:
        tokenizer: The tokenizer to use for encoding the options.
        logits (list[float]): The logits for each option.
        options (list[int]): encoded options to choose from.

    Returns:
        int: The index of the chosen option.
    """
    # Convert logits to probabilities
    probs = torch.softmax(logits, dim=0).tolist()



    # Calculate the score for each option
    scores = []
    for _, option in enumerate(options):
        score = probs[option]
        scores.append(score)

    # Choose the option with the highest score
    chosen_option = scores.index(max(scores))

    return chosen_option

def build_from_logits(
    tokenizer,
    options: list[str],
) -> Callable[[torch.Tensor], int]:
    """
    Builds a function that can be used to pick an option from a list of options based on the given logits.

    Args:
        tokenizer: The tokenizer to use for encoding the options.
        options (list[str]): The list of options to choose from.

    Returns:
        Callable[[torch.Tensor], int]: A function that takes logits and returns the index of the chosen option.
    """
    # Encode the options
    options = [
        tokenizer.encode(option, add_special_tokens=False)[0]
        for option in options
    ]

    return lambda model_out: _from_logits(tokenizer, model_out, options)

possible_answers = [" A", " B", " C", " D"]

picker = build_from_logits(
    tokenizer,
    options=possible_answers,
)

In [15]:
# the same prompt used in the fine-tuning
def prompt(
    question: str,
    options: List[str],
    augmented_items: List[str],
) -> str:
    context = "\n".join(augmented_items)

    options_str = "\n".join(
        [f"{chr(65 + i)}. {option}" for i, option in enumerate(options)]
    )

    """
    Generates a prompt for the language model based on the question, options, and augmented items.

    Args:
        question (str): The question to ask.
        options (List[str]): The list of options to choose from.
        augmented_items (List[str]): The augmented items to include in the prompt.

    Returns:
        str: The formatted prompt string.
    """
    prompt = f"""You are an expert AI specializing in multiple-choice questions.
Your task is to analyze the provided context, question, and options, then identify the single best answer.
Respond with only the capital letter (A, B, C, or D) corresponding to your choice.

Context:
{context}

Question:
{question}

Options:
{options}

Answer:"""
    return prompt

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lo

In [17]:
forward = build_enhanced_forwarder(
    base_model,
    tokenizer,
    querier,
    k_augmentations=10,
    prompt_builder=prompt,
    num_iterations=1,
    device=device,
)

def forward_and_get_last_logit(
    question,
    options,
):
    tokens, logits =  forward(
        question,
        options=options,
    )

    return logits

# Lets evaluate the model

We are going to load the evaluation dataset and perform the entire rag pipeline in order to get the best answer for each question.

In [18]:
def evaluate(
    forward_fn: ForwardType,
    picker_fn: Callable,
    eval_dataset: Dataset,
    log_each: int = 100,
) -> float:
    """
    Evaluate the model on the evaluation dataset.

    Args:
        forward_fn: The forward function to use for generating the response.
        picker_fn: The function to use for picking the best option.
        eval_dataset: The evaluation dataset.
        log_each: The number of samples to log accuracy for.

    Returns:
        The accuracy of the model on the evaluation dataset.
    """
    correct = 0
    total = len(eval_dataset)

    for i in range(total):
        item = eval_dataset[i]
        question = item["question"]
        options = item["options"]
        answer_idx = item["answer_idx"]

        # Get the model's response
        response = forward_fn(question, options)

        # Pick the best option
        picked_idx = picker_fn(response)

        if picked_idx == answer_idx:
            correct += 1

        if i % 10 == 0:
            print(f"Right answer: {answer_idx}, picked: {picked_idx}")
        if i % log_each == 0:
            # print current accuracy
            print(f"Accuracy at {i}: {correct / (i + 1):.2f}")

    return correct / total

In [19]:
class TrainAndEval(Dataset):
    """
    This loads a map which contains:
    - "id"
    - "excerpt"
    - "question"
    - "statement": the correct option
    - "distractors"
    """
    def __init__(self, file_path: str):
        self.file_path = file_path
        self._raw_data = []
        with open(file_path, "r") as f:
            for line in f:
                self._raw_data.append(line.strip())

    def __len__(self):
        return len(self._raw_data)

    def __getitem__(self, idx: int):
        return json.loads(self._raw_data[idx])

In [20]:
class EvalWithAnswers(Dataset):
    """
    A dataset that takes a TrainAndEval dataset and adds the statement to the distractors
    to create a multiple choice question. The statement is inserted at a random position
    in the distractors.

    Returns two item keys: options and answer_idx.
    """
    def __init__(self, dataset: TrainAndEval):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        item = self.dataset[idx]

        options = item["distractors"]
        # insert the statement at any random position in the options
        index = random.randint(0, len(options))
        options.insert(index, item["statement"])

        item["options"] = options
        item["answer_idx"] = index

        return item


In [21]:
evaluationData = TrainAndEval("./data/pubmed_QA_eval.json")
evaluateWithAnswers = EvalWithAnswers(evaluationData)

In [22]:
evaluateWithAnswers[0]  # Check the first item in the evaluation dataset

{'id': 'pubmed23n0012_5208',
 'excerpt': 'Temporal changes in medial basal hypothalamic LH-RH correlated with plasma LH during the rat estrous cycle and following electrochemical stimulation of the medial preoptic area in pentobarbital-treated proestrous rats. In the present studies we have simultaneously measured changes in medial basal hypothalamic (MBH) leutenizing hormone-releasing hormone (LH-RH) and in plasma LH by radioimmunoassay in female rats at various hours during the 4-day estrous cycle and under experimental conditions known to alter pituitary LH secretion. In groups of rats decapitated at 12.00 h and 15.00 h on estrus and diestrus, plasma LH remained at basal levels (5-8 ng/ml) and MBH-LH-RH concentrations showed average steady state concentrations of 2231 +/- 205 pg/mg. On the day of proestrus hourly measurements of MBH-LH-RH between 12.00 h and 21.00 h suggested rhythmic rises and falls in the decapeptide concomitant with rises and falls in plasma LH. In a second group

In [23]:
accuracy = evaluate(
    forward_fn=forward_and_get_last_logit,
    picker_fn=picker,
    eval_dataset=evaluateWithAnswers,
)

print(f"Accuracy: {accuracy:.2f}")

Right answer: 3, picked: 1
Accuracy at 0: 0.00


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.20 GiB. GPU 0 has a total capacity of 5.65 GiB of which 1.43 GiB is free. Including non-PyTorch memory, this process has 4.19 GiB memory in use. Of the allocated memory 1.75 GiB is allocated by PyTorch, and 2.33 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Lets predict the answers for the contest

In [None]:
def predict(
    forward_fn: ForwardType,
    picker_fn: Callable,
    eval_dataset: EvalWithAnswers,
    log_each: int = 100,
) -> List[Tuple[int, int]]:
    """
    Evaluate the model on the evaluation dataset.

    Args:
        forward_fn: The forward function to use for generating the response.
        picker_fn: The function to use for picking the best option.
        eval_dataset: The evaluation dataset.
        log_each: The number of samples to log accuracy for.

    Returns:
        The id and the picked option of the model on the evaluation dataset.
    """
    total = len(eval_dataset)
    responses = []
    for i in range(total):
        item = eval_dataset[i]
        question = item["question"]
        options = item["option"]

        # Get the model's response
        response = forward_fn(question, options)

        # Pick the best option
        picked_idx = picker_fn(response)

        responses.append((i, picked_idx))

        if i != 0 and i% log_each == 0:
            print(f"Processed {i/total}%")

    return responses

In [None]:
class TestQuestions(Dataset):
    """
    Dataset for the test questions in the competition.
    Format:
        id
        question
        option: list of options
    """
    def __init__(self, file_path: str):
        self.file_path = file_path
        self._raw_data = []
        with open(file_path, "r") as f:
            for line in f:
                self._raw_data.append(line.strip())

    def __len__(self):
        return len(self._raw_data)

    def __getitem__(self, idx: int):
        return json.loads(self._raw_data[idx])

In [None]:
test_data = TestQuestions("/data/pubmed_QA_test_questions.json")
responses = predict(
    forward_fn=forward_and_get_last_logit,
    picker_fn=picker,
    eval_dataset=test_data,
)

responses

In [None]:
import pandas as pd

responses_with_ids = []

for i in range(len(responses)):
    responses_with_ids.append((test_data[i]["id"], responses[i][1]))
dataset = pd.DataFrame(responses_with_ids, columns=["ID", "answer"])
dataset.head()
dataset.to_csv("predictions.csv", index=False)