In [1]:
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
!pip install --upgrade transformers accelerate bitsandbytes
!pip install -q "langchain>=0.2.10,<1.0.0" "langchain-community>=0.2.10" \
                "langchain-text-splitters>=0.2.0" "chromadb>=0.5.5" \
                "sentence-transformers>=2.2.2" "pypdf>=4.2.0"

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m124.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, bitsandbytes
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.

In [2]:
!pip install faiss-cpu pdfplumber

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

# Imports

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import random
import numpy as np
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import faiss
import pdfplumber
import math
import re
import json

# Getting the documents and make a vector DB
Make sure you have the data folder in your drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
DATA_DIR = "/content/extracted"

In [6]:
import zipfile

zip_path = "/content/drive/MyDrive/data.zip"        # path to your zip file
extract_path = DATA_DIR  # where to extract

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

## Create the Vector DB

In [7]:
DATA_DIR = os.path.join(DATA_DIR, "data/files/no_take_home")
print(os.listdir(DATA_DIR))

['doc7.pdf', 'doc2.pdf', 'doc1.pdf', 'doc8.pdf', 'doc5.pdf', 'doc6.pdf', 'doc9.pdf', 'doc4.pdf', 'doc10.pdf', 'doc3.pdf']


In [8]:
QA_DIR = "/content/extracted/data/qa_pairs"

In [9]:
PERSIST_DIR = "./RAG_CARDIOLOGY"
COLLECTION = "hallucination_eval"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

Gather all the documents and merge the pages of a document into 1.

In [10]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

documents = []
for d in os.listdir(DATA_DIR):
    path = os.path.join(DATA_DIR, d)
    loader = PyPDFLoader(path)
    pages = loader.load()

    # merge pages into one big Document
    full_text = "\n".join([p.page_content for p in pages])
    documents.append(Document(page_content=full_text, metadata={"source": path}))

print(len(documents), "combined documents")


10 combined documents


split the document into chunks

In [11]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)
chunks = splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

Created 8380 chunks


In [12]:
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

create the vector DB

In [13]:
vs = Chroma.from_documents(
    documents=chunks,
    embedding=emb,
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION,
    collection_metadata={"hnsw:space": "cosine"}
)
vs.persist()
print("Persisted at:", os.path.abspath(PERSIST_DIR))
print("Vector count:", vs._collection.count())

Persisted at: /content/RAG_CARDIOLOGY
Vector count: 8380


  vs.persist()


# Evaluation 1: Baseline model
## Step 1: Load the model
After experimentation with 1B, 3B, 7B models we found that for our experiments a 13B model is necessary to create good answers for the baseline. We will use Llama-2-13b-chat-hf model as the main model and the 3B as student.

In [14]:
from google.colab import userdata
key = userdata.get('hf_key')
model_name = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16"
)

# Explicitly tell the tokenizer to use the SentencePiece model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, use_auth_token=key)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=key
)

print(model.device)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

cuda:0


## Step 2: setup RAG pipeline

In [15]:
def get_RAG(vector_db, query, k=3):
  rag_docs = vector_db.similarity_search(query, k=k)
  return rag_docs

def format_RAG_docs(docs):
  return "\n\n".join([
    f"Source: {doc.metadata['source']}\n{doc.page_content}"
    for doc in docs
  ])

def create_prompt(question, k, vector_db):
  rag_docs = get_RAG(vector_db, question, k=k)
  formatted_rag_docs = format_RAG_docs(rag_docs)
  prompt = f"""Using the following retrieved passages, answer the medical question concisely (1-2 sentences).
Format your answer exactly as:

Answer: <your concise answer>
Confidence: <X%>
Citation: <document/source>

Question: {question}

Passages:
{formatted_rag_docs}

Answer:
  """
  return prompt


# Step 3: Setup experiment for baseline

In [16]:
from transformers import GenerationConfig
def get_answer_and_token_probs(model, tokenizer, prompt):
  # Encode prompt
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

  # Generation configuration
  gen_config = GenerationConfig(
      min_new_tokens=30,
      max_new_tokens=150,
      do_sample=False,  # Greedy to make logprobs deterministic

  )
  # Generate tokens step by step to capture probabilities
  generated_ids = inputs["input_ids"]
  logprobs_list = []

  with torch.no_grad():
      for _ in range(gen_config.max_new_tokens):
          # Get model logits
          outputs = model(generated_ids)
          logits = outputs.logits  # shape: [batch, seq_len, vocab_size]
          next_token_logits = logits[:, -1, :]  # last token

          # Compute probabilities
          probs = torch.softmax(next_token_logits, dim=-1)
          topk_probs, _ = torch.topk(probs, k=10, dim=-1)

          # Store top-10 token probabilities and ids
          logprobs_list.append(topk_probs.cpu().tolist())

          # Greedy: choose the highest probability token
          next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
          generated_ids = torch.cat([generated_ids, next_token], dim=-1)

          # Stop if EOS token
          if next_token.item() == tokenizer.eos_token_id:
              break

  # Decode final answer
  generated_text = tokenizer.decode(generated_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
  return generated_text, logprobs_list

In [17]:
qa_objects = os.listdir(QA_DIR)
qa_object = [qa for qa in qa_objects if "doc1.json" in qa][0]
print(qa_object)

qa_doc1.json


In [18]:
import pandas as pd
def experiment_1(model, tokenizer, vector_db, doc_dir, k):
  results = []
  doc_names = os.listdir(doc_dir)
  doc_names = [doc.split(".")[0] for doc in doc_names]

  qa_objects = os.listdir(QA_DIR)
  for doc in doc_names:
    print(doc)
    qa_object = [qa for qa in qa_objects if doc + ".json" in qa][0]
    with open(os.path.join(QA_DIR, qa_object), "r") as f:
      data = json.load(f)

    questions = data["questions"]
    answers = data["answers"]

    for i, question in enumerate(questions):
      print(f"Question {i+1}/{len(questions)}")
      prompt = create_prompt(question, k, vector_db)
      generated_text, logprobs_list = get_answer_and_token_probs(model, tokenizer, prompt)

      results.append({
          "id": f"{doc}_question_{i}",
          "response" : generated_text,
          "answer": answers[i],
          "logprobs": logprobs_list
      })

  df = pd.DataFrame(results).set_index("id")
  return df

print(DATA_DIR)
print(os.listdir(DATA_DIR))
df = experiment_1(model, tokenizer, vs, DATA_DIR, 3)




/content/extracted/data/files/no_take_home
['doc7.pdf', 'doc2.pdf', 'doc1.pdf', 'doc8.pdf', 'doc5.pdf', 'doc6.pdf', 'doc9.pdf', 'doc4.pdf', 'doc10.pdf', 'doc3.pdf']
doc7
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc2
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc1
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc8
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc5
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc6
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc9
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc4
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc10
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc3
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5


In [19]:
df.to_csv("/content/drive/MyDrive/baseline_results.csv")

# Experiment 2: RAG with document held out

It is basically the same experiment as 1 but with the document from which we retrieve the question held out

In [25]:
def delete_doc_from_vector_db(vector_db, doc):
  path = os.path.join(DATA_DIR, doc)
  chunks = vector_db.get(where={"source": path})
  vector_db.delete(where={"source": path})
  return chunks

def insert_chunks_into_vector_db(vector_db, chunks):
  docs_to_add = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(chunks['documents'], chunks['metadatas'])
  ]
  vector_db.add_documents(
      documents=docs_to_add,
      ids=chunks['ids']  # your existing IDs
  )

In [24]:
docs_to_add = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(chunks['documents'], chunks['metadatas'])
]

# Add them to your vector store with the corresponding IDs
vs.add_documents(
    documents=docs_to_add,
    ids=chunks['ids']  # your existing IDs
)

print(vs.get(where={"source": path_test}))

{'ids': ['a98aaa8b-5b05-4d54-bce9-066e90c65e1c', 'a21b6a6d-fd4e-4e1d-a66a-b190ab91bb13', 'e4ac4b6f-7029-48ce-b955-d6ce745610f5', 'c8a83112-a66a-468a-9304-2d246541e154', '69a464b5-62c8-4fae-9dc8-de9d5ed5f39e', '84d3e2c2-53a9-41d6-b93e-a61769111d17', 'a5cdd279-de39-4cb8-88a7-6161cd760c24', '87fd6917-353c-4e2d-a97b-fa0c656aec7f', 'e8f1ccdd-e902-4600-ae21-787aefa13d5d', '770f2e02-8dac-428b-a467-891330277be0', '33c8bd17-e005-444e-8cde-f4d430e8757b', '68ea313a-1b6b-4602-9755-db34c5242080', '80163133-dbdc-41ce-993e-94ca445fcdf9', '9020b080-fdac-480d-9cca-497528571827', '84c13bc8-4fdd-421f-9e2c-39a56f66cb19', 'b70c7d2f-579f-43b9-8dc0-d46036350fb4', '7926c253-5a06-4128-9d6e-132b121aecdc', '48e5ffe6-436e-4c57-9f95-e5dbb8cfbe50', '8c4dad1c-36a2-40a2-88af-185e843cf118', '8f82859f-78ee-4acf-987b-c2d05050713a', 'db525a97-601c-46fd-bbf8-cfe4628b130d', '41a028bc-763f-4192-b5ff-60457f8fdd61', 'a94a5e06-e6ab-46d1-96c6-b36585c932aa', 'd7bbf5f4-6bde-448e-9992-2a93f3f9fb2f', '8017c2ab-2663-4891-9b8d-a04e56

In [27]:
def experiment_2(model, tokenizer, vector_db, doc_dir, k):
  results = []
  doc_names = os.listdir(doc_dir)
  doc_names = [doc.split(".")[0] for doc in doc_names]

  qa_objects = os.listdir(QA_DIR)
  for doc in doc_names:
    print(doc)
    qa_object = [qa for qa in qa_objects if doc + ".json" in qa][0]
    with open(os.path.join(QA_DIR, qa_object), "r") as f:
      data = json.load(f)

    questions = data["questions"]
    answers = data["answers"]

    # delete the relevant doc out of the RAG
    chunks = delete_doc_from_vector_db(vector_db, doc + ".pdf")

    for i, question in enumerate(questions):
      print(f"Question {i+1}/{len(questions)}")
      prompt = create_prompt(question, k, vector_db)
      generated_text, logprobs_list = get_answer_and_token_probs(model, tokenizer, prompt)

      results.append({
          "id": f"{doc}_question_{i}",
          "response" : generated_text,
          "answer": answers[i],
          "logprobs": logprobs_list
      })
    # reinsert the doc after
    insert_chunks_into_vector_db(vector_db, chunks)

  df = pd.DataFrame(results).set_index("id")
  return df

In [None]:
df = experiment_2(model, tokenizer, vs, DATA_DIR, 3)

doc7
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc2
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc1
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc8
Question 1/5
Question 2/5
Question 3/5
Question 4/5
Question 5/5
doc5
Question 1/5
Question 2/5


In [None]:
df.to_csv("/content/drive/MyDrive/held_out_rag_results.csv")

#TODO
#TODO
#TODO SAVE TO CSV IN DRIVE

# Experiment 3: Wrong RAG
## Step 1: setup wrong RAG

In [None]:
DERM_DIR = "/content/dermatology/"

In [None]:
zip_path = "/content/drive/MyDrive/dermatology.zip"        # path to your zip file
extract_path = DERM_DIR  # where to extract

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
DERM_DIR = os.path.join(DERM_DIR, "dermatology/dermatology")
print(os.listdir(DERM_DIR))

In [None]:
PERSIST_DIR = "./RAG_DERMATOLOGY"
COLLECTION = "hallucination_eval"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

In [None]:
documents = []
for d in os.listdir(DERM_DIR):
    path = os.path.join(DERM_DIR, d)
    loader = PyPDFLoader(path)
    pages = loader.load()

    # merge pages into one big Document
    full_text = "\n".join([p.page_content for p in pages])
    documents.append(Document(page_content=full_text, metadata={"source": path}))

print(len(documents), "combined documents")


In [None]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)
chunks = splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

In [None]:
vs_derm = Chroma.from_documents(
    documents=chunks,
    embedding=emb,
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION,
    collection_metadata={"hnsw:space": "cosine"}
)
vs_derm.persist()
print("Persisted at:", os.path.abspath(PERSIST_DIR))
print("Vector count:", vs_derm._collection.count())

In [None]:
df = experiment_1(model, tokenizer, vs_derm, DATA_DIR, 3)

In [None]:
df.to_csv("/content/drive/MyDrive/derm.csv")