In [1]:
!pip install -q transformers accelerate sentence-transformers faiss-cpu pdfplumber pandas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.0/488.0 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import re
import json
import math
from typing import List, Dict, Any

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import faiss
import pdfplumber

# Pick the torch device once for the whole notebook: GPU when torch can see
# one, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)



Using device: cpu


In [3]:
# Install the Hugging Face Hub client via a shell escape (unpinned version).
!pip install -q huggingface_hub
from huggingface_hub import login

# Interactive authentication step; in the recorded run this rendered a login
# widget. Later cells pass an auth flag to `from_pretrained`, so this cell is
# expected to have been completed first.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Colab-only step: mount Google Drive at /content/drive so that the PDF folder
# referenced by the loading cell is reachable from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def clean_pdf_text(text: str) -> str:
    """
    Flatten raw PDF-extracted text into one whitespace-normalised line.

    Three passes are applied, in this order:
    1. A hyphen followed by a line break (with optional surrounding
       whitespace) is dropped, re-joining words split across lines
       ('car-\\ndiology' -> 'cardiology').
    2. Every remaining newline becomes a space.
    3. Runs of whitespace collapse to a single space and the ends are trimmed.
    """
    rejoined = re.sub(r"-\s*\n\s*", "", text)
    single_line = rejoined.replace("\n", " ")
    return re.sub(r"\s+", " ", single_line).strip()


def load_pdf_documents(folder: str, domain: str) -> List[Dict[str, Any]]:
    """
    Read every PDF file in `folder` and extract its text with pdfplumber.

    Args:
        folder: Directory containing the PDF files (made absolute internally).
        domain: Label stored on each document and used as the doc_id prefix,
            e.g. 'cardio' or 'derm'.

    Returns:
        One dict per PDF, in sorted filename order:
          {
            'doc_id': e.g. 'cardio_000',
            'domain': the `domain` argument,
            'filename': filename.pdf,
            'text': cleaned text
          }
        doc_ids are numbered consecutively over the PDFs only, so unrelated
        files sitting in the same folder neither create gaps nor shift the ids.

    Raises:
        ValueError: if `folder` is not an existing directory.
    """
    folder = os.path.abspath(folder)
    if not os.path.isdir(folder):
        raise ValueError(f"Folder {folder} does not exist. Create it and add PDF files.")

    # Select the PDFs *before* numbering them; enumerating the raw directory
    # listing would let non-PDF entries consume doc_id numbers.
    pdf_names = sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(folder, name))
    )

    docs = []
    for i, fname in enumerate(pdf_names):
        path = os.path.join(folder, fname)

        # extract_text() may return None for pages without a text layer.
        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        cleaned_text = clean_pdf_text("\n".join(page_texts))

        docs.append({
            "doc_id": f"{domain}_{i:03d}",
            "domain": domain,
            "filename": fname,
            "text": cleaned_text
        })

    return docs

# Load the cardiology corpus from the mounted Google Drive folder. The path is
# absolute and specific to one Drive layout, so it must be edited to run this
# notebook elsewhere. The dermatology corpus is currently disabled.
cardio_docs = load_pdf_documents("/content/drive/MyDrive/Homeworks/LLM/Project/Data/docs_test", domain="cardio")
#derm_docs   = load_pdf_documents("./data/dermatology", domain="derm")

# Quick sanity check on how many PDFs were picked up (3 in the recorded run).
print("Cardiology PDFs loaded:", len(cardio_docs))
#print("Dermatology PDFs loaded:", len(derm_docs))

Cardiology PDFs loaded: 3


In [10]:
def split_into_word_chunks(text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping word chunks of at most `chunk_size` words.

    Consecutive chunks share `overlap` words, so a sentence cut at one chunk
    boundary is still seen whole in the neighbouring chunk. The window advances
    by `chunk_size - overlap` words and stops as soon as a chunk reaches the
    end of the text, so the last chunk is never a pure repeat of the tail of
    the previous one.

    Args:
        text: Input text; it is split on whitespace.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunk strings (words re-joined with single spaces); empty when
        `text` contains no words.

    Raises:
        ValueError: if `chunk_size` or `overlap` is out of range. An overlap
            that is not smaller than the chunk size would never advance the
            window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # This window already covers the final word: any further window would
        # contain only words that are present in this chunk.
        if end >= len(words):
            break
    return chunks


def group_chunks_to_fixed_number(chunks: List[str], target_num: int = 4) -> List[str]:
    """
    Merge adjacent chunks so a document ends up with exactly `target_num`
    chunks whenever it has more than that many to begin with.

    The chunks are partitioned into `target_num` contiguous groups whose sizes
    differ by at most one (earlier groups take the extra chunk), and each group
    is joined with single spaces. This keeps both the number and the size of
    the final chunks consistent across documents.

    Args:
        chunks: Ordered chunk strings for one document.
        target_num: Desired number of output chunks (must be >= 1).

    Returns:
        `chunks` itself when it already has `target_num` items or fewer,
        otherwise a new list with exactly `target_num` merged strings.

    Raises:
        ValueError: if `target_num` is smaller than 1.
    """
    if target_num < 1:
        raise ValueError("target_num must be a positive integer")

    n = len(chunks)
    if n <= target_num:
        return chunks

    # Balanced partition: `extra` groups get one chunk more than `base`.
    base, extra = divmod(n, target_num)
    merged = []
    start = 0
    for group_idx in range(target_num):
        size = base + 1 if group_idx < extra else base
        merged.append(" ".join(chunks[start:start + size]))
        start += size
    return merged


def chunk_document(doc: Dict[str, Any],
                   chunk_size_words: int = 200,
                   overlap_words: int = 50,
                   target_num_chunks: int = 4) -> List[Dict[str, Any]]:
    """
    Break one document into chunk records.

    The document text is first cut into overlapping word windows, which are
    then merged down towards `target_num_chunks` pieces. Every piece becomes a
    dict with the keys:
      'doc_id', 'domain', 'chunk_id', 'chunk_index', 'text'
    where 'chunk_id' is '<doc_id>_chunk<index>' and 'chunk_index' is the
    zero-based position of the piece within the document.
    """
    pieces = group_chunks_to_fixed_number(
        split_into_word_chunks(
            doc["text"],
            chunk_size=chunk_size_words,
            overlap=overlap_words,
        ),
        target_num=target_num_chunks,
    )

    return [
        {
            "doc_id": doc["doc_id"],
            "domain": doc["domain"],
            "chunk_id": f"{doc['doc_id']}_chunk{position}",
            "chunk_index": position,
            "text": piece,
        }
        for position, piece in enumerate(pieces)
    ]


# Chunk every cardiology document and pool the resulting chunk records into
# one flat list that the vector index is built from.
cardio_chunks = []
for d in cardio_docs:
    cardio_chunks += chunk_document(d)

# Dermatology chunking is disabled for now:
# derm_chunks = []
# for d in derm_docs:
#     derm_chunks.extend(chunk_document(d))

print("Cardio chunks:", len(cardio_chunks))
#print("Derm chunks:", len(derm_chunks))

Cardio chunks: 12


In [6]:
# IMPORTANT: keep embedder on CPU to save GPU memory for the LLM
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
embedding_dim = embedder.get_sentence_embedding_dimension()
embedding_dim

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

384

In [12]:
class VectorIndex:
    """
    Simple FAISS index wrapper with metadata.

    Embeddings are L2-normalised and stored in an inner-product index, so the
    returned scores are cosine similarities. Row `i` of the FAISS index always
    corresponds to `id_to_meta[i]`.

    Relies on the module-level `embedder` (a SentenceTransformer) for encoding.
    """
    def __init__(self, embedding_dim: int):
        """Create an empty inner-product index for vectors of `embedding_dim`."""
        self.index = faiss.IndexFlatIP(embedding_dim)  # inner product
        # Chunk dicts in insertion order; position == FAISS row id.
        self.id_to_meta: List[Dict[str, Any]] = []

    def add_documents(self, chunks: List[Dict[str, Any]]):
        """
        Embed the 'text' of each chunk dict and append it to the index.

        An empty `chunks` list is a no-op: there is nothing to encode, and the
        index and metadata must stay aligned.
        """
        if not chunks:
            return

        texts = [c["text"] for c in chunks]
        embeddings = embedder.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=True,
            normalize_embeddings=True
        )
        self.index.add(embeddings.astype(np.float32))
        self.id_to_meta.extend(chunks)

    def similarity_search(self, query: str, k: int = 4) -> List[Dict[str, Any]]:
        """
        Return up to `k` chunk dicts most similar to `query`, best first.

        Each result is a shallow copy of the stored chunk dict with an extra
        'score' key (float similarity), so stored metadata is never mutated.
        Returns an empty list when `k` is not positive or nothing is indexed.
        """
        if k <= 0 or not self.id_to_meta:
            return []

        query_embedding = embedder.encode(
            [query],
            convert_to_numpy=True,
            normalize_embeddings=True
        )
        scores, row_ids = self.index.search(query_embedding.astype(np.float32), k)

        results = []
        for score, row_id in zip(scores[0], row_ids[0]):
            # FAISS pads with -1 when fewer than k vectors are available.
            if row_id < 0:
                continue
            meta = self.id_to_meta[int(row_id)].copy()
            meta["score"] = float(score)
            results.append(meta)
        return results


# Build the cardiology vector index from the chunk records. `cardio_index` is
# the module-level index the retrieval helpers query.
cardio_index = VectorIndex(embedding_dim)
cardio_index.add_documents(cardio_chunks)

# The dermatology index is disabled, so `derm_index` is NOT defined by this
# notebook as it stands.
#derm_index = VectorIndex(embedding_dim)
#derm_index.add_documents(derm_chunks)

# Sanity check: the metadata list has one entry per indexed chunk.
print("Cardio index size:", len(cardio_index.id_to_meta))
#print("Derm index size:", len(derm_index.id_to_meta))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cardio index size: 12


In [13]:
import gc

# Clean up any leftover GPU allocations from earlier cells
gc.collect()
# The CUDA cache is only touched when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# NOTE: this message is printed unconditionally, including on CPU-only runs
# where no GPU cache was cleared.
print("GPU cache cleared after indexing.")

GPU cache cleared after indexing.


In [14]:
model_name = "meta-llama/Llama-2-7b-chat-hf"  # change to another instruct model if needed

# `token=True` reuses the credential stored by the earlier `login()` call. It
# replaces the deprecated `use_auth_token=True` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# Half-precision weights, with accelerate deciding device placement.
# NOTE: the recorded run logs "`torch_dtype` is deprecated! Use `dtype`
# instead!"; the older spelling is kept because it is still accepted.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Some models don't have a pad token set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model and tokenizer loaded.")



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model and tokenizer loaded.


In [15]:
def build_context_string(chunks: List[Dict[str, Any]]) -> str:
    """
    Render retrieved chunks as one context block for the prompt.

    Each chunk contributes its bracketed 'chunk_id' on one line (so the model
    can cite it), its 'text' on the next, and a trailing newline; the
    per-chunk blocks are then joined with a newline, leaving a blank line
    between chunks. An empty list yields an empty string.
    """
    return "\n".join(
        f"[{c['chunk_id']}]\n{c['text']}\n"
        for c in chunks
    )

In [16]:
def build_rag_prompt(question: str, retrieved_chunks: List[Dict[str, Any]]) -> str:
    """
    Assemble the retrieval-augmented prompt for one question.

    The prompt contains, in order: the cardiology-assistant role, the answering
    rules (context-only, 2-3 sentences, trailing "Confidence" and "Sources"
    lines), the rendered context for `retrieved_chunks`, the question, and a
    final "Answer:" cue for the model to continue from.
    """
    context = build_context_string(retrieved_chunks)
    return (
        "You are a careful medical assistant specialized in cardiology.\n"
        "\n"
        "You must follow these rules:\n"
        "- Use ONLY the information in the context below.\n"
        "- If the answer is not in the context, say you cannot answer based on the documents.\n"
        "- Answer in 2-3 sentences.\n"
        "- At the end, give:\n"
        '  - "Confidence: X" where X is an integer from 0 to 100.\n'
        '  - "Sources: [chunk_id1, chunk_id2, ...]" listing the IDs of the chunks used.\n'
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Now provide:\n"
        "Answer:"
    )

In [17]:
def build_no_rag_prompt(question: str) -> str:
    """
    Assemble the closed-book prompt for one question.

    No external context is supplied: the model is told to rely on its own
    knowledge, answer in 2-3 sentences, and finish with a "Confidence" line and
    an empty "Sources: []" line. The prompt ends with an "Answer:" cue.
    """
    prompt = (
        "You are a careful medical assistant specialized in cardiology.\n"
        "\n"
        "Answer the following question based only on your own knowledge.\n"
        "Answer in 2-3 sentences.\n"
        "At the end, give:\n"
        '- "Confidence: X" where X is an integer from 0 to 100\n'
        '- "Sources: []" because you are not using external documents.\n'
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Now provide:\n"
        "Answer:"
    )
    return prompt

In [18]:
def parse_model_output(raw_text: str):
    """
    Extract answer, confidence, and sources from the model output.

    Expected pattern:
      Answer: ...
      Confidence: 78
      Sources: [cardio_000_chunk0, cardio_000_chunk1]

    Parsing rules:
    - If the text contains a line starting with "Answer:" (colon required),
      everything before the first such line is ignored. This keeps an echoed
      prompt (whose rules include "Answer in 2-3 sentences." and the context
      chunks) out of the parsed answer.
    - The answer is the text after the "Answer:" marker plus the following
      lines, up to the first "Confidence"/"Sources" line. A bare "Answer:"
      line followed by the answer on the next line is supported.
    - Text after the first "Confidence"/"Sources" line is never added to the
      answer.
    - The first line yielding a value wins for both confidence and sources;
      surrounding quotes are removed from source IDs.
    - When there is no "Answer:" marker, leading lines are treated as the
      answer.

    Args:
        raw_text: Decoded model text (with or without the prompt in front).

    Returns:
        dict with 'answer' (str, possibly empty), 'confidence' (int or None)
        and 'sources' (list of str, possibly empty).
    """
    lines = [ln.strip() for ln in raw_text.splitlines() if ln.strip()]

    answer_marker = re.compile(r"^answer\s*:\s*", flags=re.IGNORECASE)

    # Drop everything in front of the first explicit "Answer:" marker.
    first_marker = next(
        (i for i, ln in enumerate(lines) if answer_marker.match(ln)),
        None,
    )
    if first_marker is not None:
        lines = lines[first_marker:]

    answer_lines = []
    confidence = None
    sources = []
    collecting_answer = True  # switched off by the first metadata line

    for line in lines:
        lower = line.lower()
        if lower.startswith("confidence"):
            collecting_answer = False
            if confidence is None:
                m = re.search(r"(\d+)", line)
                if m:
                    confidence = int(m.group(1))
        elif lower.startswith("sources"):
            collecting_answer = False
            if not sources:
                m = re.search(r"\[(.*?)\]", line)
                if m:
                    candidates = (p.strip().strip("\"'") for p in m.group(1).split(","))
                    sources = [p for p in candidates if p]
        elif collecting_answer:
            # Strip a leading "Answer:" label; other lines are kept verbatim.
            text = answer_marker.sub("", line)
            if text:
                answer_lines.append(text)

    answer = " ".join(answer_lines).strip()
    return {
        "answer": answer,
        "confidence": confidence,
        "sources": sources
    }

In [19]:
def generate_with_scores(prompt: str,
                         max_new_tokens: int = 128,
                         temperature: float = 0.0,
                         top_p: float = 0.9):
    """
    Run generation and capture token-level probability gaps.

    Uses the module-level `tokenizer`, `model` and `device`. Sampling is only
    enabled when `temperature` > 0; otherwise decoding is greedy and the
    sampling knobs are passed as None.

    Args:
        prompt: Full prompt text.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature; 0.0 selects greedy decoding.
        top_p: Nucleus-sampling threshold, used only when sampling.

    Returns dict:
      {
        'full_text': full decoded output (prompt followed by the completion),
        'parsed': {answer, confidence, sources} parsed from the completion only,
        'token_gaps': list of p1-p2 for each generated token,
        'mean_gap': average p1-p2 (None if nothing was generated)
      }
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    do_sample = temperature > 0.0

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature if do_sample else None,
            top_p=top_p if do_sample else None,
            output_scores=True,
            return_dict_in_generate=True
        )

    generated_ids = output.sequences[0]
    full_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The returned sequence starts with the prompt tokens. Parse only what the
    # model produced, otherwise the prompt's own rule lines and context chunks
    # end up in the parsed answer.
    completion = tokenizer.decode(
        generated_ids[prompt_len:], skip_special_tokens=True
    ).strip()
    # The prompt ends with the "Answer:" cue, so the completion usually starts
    # mid-answer; restore the label the parser keys on.
    if not re.match(r"answer\s*:", completion, flags=re.IGNORECASE):
        completion = "Answer: " + completion

    token_gaps = []
    for score_tensor in output.scores:
        # Softmax in float32: half-precision scores lose resolution here.
        probs = torch.softmax(score_tensor[0].float(), dim=-1)
        top2 = torch.topk(probs, k=2)
        p1 = top2.values[0].item()
        p2 = top2.values[1].item()
        token_gaps.append(p1 - p2)

    parsed = parse_model_output(completion)
    mean_gap = float(np.mean(token_gaps)) if token_gaps else None

    return {
        "full_text": full_text,
        "parsed": parsed,
        "token_gaps": token_gaps,
        "mean_gap": mean_gap
    }

In [20]:
# EXAMPLE structure – replace with your real questions and gold info
# Evaluation set: one record per question. The held-out retrieval condition
# reads `gold_chunk_ids` to know which chunks to remove, so those IDs must
# match the chunk IDs produced by the chunking step.
questions_data = [
    {
        "question_id": "q1",
        "question": "What are the main side effects of drug X in patients with heart failure?",
        "gold_answer": "Side effects include ... (fill from the cardio doc).",
        "gold_doc_id": "cardio_000",
        "gold_chunk_ids": ["cardio_000_chunk1"]  # list of relevant chunks
    },
    # Add many more questions, ideally 3–5 per cardiology document
]

# One row per question; the experiment loop iterates over this frame.
questions_df = pd.DataFrame(questions_data)
questions_df

Unnamed: 0,question_id,question,gold_answer,gold_doc_id,gold_chunk_ids
0,q1,What are the main side effects of drug X in pa...,Side effects include ... (fill from the cardio...,cardio_000,[cardio_000_chunk1]


In [21]:
def retrieve_baseline_cardio(question: str, k: int = 4):
    """Return the top-`k` cardiology chunks for `question` from the module-level `cardio_index`."""
    hits = cardio_index.similarity_search(question, k=k)
    return hits


def retrieve_wrong_derm(question: str, k: int = 4):
    """
    Return the top-`k` dermatology chunks for `question` (wrong-domain condition).

    Reads the module-level `derm_index`. NOTE: the code that would build
    `derm_index` is commented out in the visible notebook, so calling this
    raises NameError until that index is created.
    """
    hits = derm_index.similarity_search(question, k=k)
    return hits


def retrieve_heldout_cardio(question_row, k: int = 4):
    """
    Retrieve cardiology chunks for a question with its gold chunks withheld.

    `question_row` must provide 'question' and 'gold_chunk_ids'. The index is
    over-fetched (10 * k candidates) so that, after dropping every candidate
    whose 'chunk_id' is a gold chunk, up to `k` near-miss chunks remain. This
    simulates 'almost-right' documents that lack the direct answer.
    """
    excluded = set(question_row["gold_chunk_ids"])
    candidates = cardio_index.similarity_search(question_row["question"], k=10 * k)

    kept = []
    for hit in candidates:
        if hit["chunk_id"] in excluded:
            continue
        kept.append(hit)
    return kept[:k]

In [22]:
def run_condition_for_question(question_row,
                               condition: str,
                               temperature: float = 0.0,
                               k: int = 3) -> Dict[str, Any]:
    """
    Execute a single (question, condition) model call and package the outcome.

    Supported conditions:
      - "baseline_cardio": plain retrieval from the cardiology index.
      - "heldout_cardio": retrieval with the question's gold chunks removed.
      - "no_rag": no retrieval; closed-book prompt.
    Any other value (including "wrong_derm", whose branch is disabled) raises
    ValueError.

    Returns a flat dict with the question id, condition, temperature, prompt,
    retrieved chunk/doc ids, the raw and parsed model output, and the
    token-gap statistics.
    """
    qid = question_row["question_id"]
    question_text = question_row["question"]

    if condition == "no_rag":
        retrieved = []
        prompt = build_no_rag_prompt(question_text)
    else:
        if condition == "baseline_cardio":
            retrieved = retrieve_baseline_cardio(question_text, k=k)
        elif condition == "heldout_cardio":
            retrieved = retrieve_heldout_cardio(question_row, k=k)
        else:
            # A "wrong_derm" branch (retrieve_wrong_derm + RAG prompt) would go
            # here; it is currently disabled.
            raise ValueError(f"Unknown condition: {condition}")
        prompt = build_rag_prompt(question_text, retrieved)

    gen = generate_with_scores(prompt, max_new_tokens=256, temperature=temperature)
    parsed = gen["parsed"]

    return {
        "question_id": qid,
        "condition": condition,
        "temperature": temperature,
        "prompt": prompt,
        "retrieved_chunk_ids": [hit["chunk_id"] for hit in retrieved],
        "retrieved_doc_ids": [hit["doc_id"] for hit in retrieved],
        "model_full_output": gen["full_text"],
        "model_answer": parsed["answer"],
        "model_confidence": parsed["confidence"],
        "model_sources": parsed["sources"],
        "token_gap_mean": gen["mean_gap"],
        "token_gaps": gen["token_gaps"],  # you can drop this column when saving to CSV
    }

In [None]:
# Conditions to evaluate. "wrong_derm" is deliberately left out: its branch in
# run_condition_for_question is commented out (it would raise
# "Unknown condition: wrong_derm" and abort the loop), and no dermatology
# index is built in this notebook. Re-add it once both are enabled.
conditions = ["baseline_cardio", "heldout_cardio", "no_rag"]
temperatures = [0.0]  # you can later add 0.7 for a subset

# One result dict per (question, condition, temperature) combination.
results = []

for _, row in questions_df.iterrows():
    for cond in conditions:
        for temp in temperatures:
            print(f"Running {row['question_id']} | {cond} | T={temp}")
            res = run_condition_for_question(row, condition=cond, temperature=temp)
            results.append(res)

# Build the frame once from the accumulated records.
results_df = pd.DataFrame(results)
results_df.head()

Running q1 | baseline_cardio | T=0.0
