In [None]:
!pip install sentence-transformers faiss-cpu pypdf langchain

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf-6.3.0-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.12.0 pypdf-6.3.0


In [None]:
# Mount Google Drive at /content/drive; the document paths used later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ──────────────────────────────────────────────────────────────
# CELL 3: Full Classes (DocumentProcessor + DPRRetriever + NLIVerifier)
# ──────────────────────────────────────────────────────────────
import os
import json
import numpy as np
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import faiss
import torch
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# ======================================================================
# 1. DOCUMENT PROCESSOR (default chunk_size = 300; Cell 5 passes chunk_size=400)
# ======================================================================
class DocumentProcessor:
    """Read PDF or plain-text files and cut their text into overlapping chunks."""

    def __init__(self, chunk_size=300, chunk_overlap=50):
        """Create the text splitter.

        Args:
            chunk_size: forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
            chunk_overlap: forwarded to RecursiveCharacterTextSplitter as ``chunk_overlap``.
        """
        splitter_options = dict(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Tried in order: paragraph, line, sentence end, word, then single characters.
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def load_document(self, file_path: str) -> Dict:
        """Return ``{"source": <file name>, "content": <full text>}`` for one file.

        Files whose extension is ``.pdf`` (case-insensitive) are read page by page
        with PdfReader; every other file is read as UTF-8 text, ignoring
        undecodable bytes.
        """
        extension = os.path.splitext(file_path)[1].lower()
        if extension != ".pdf":
            with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
                content = handle.read()
        else:
            page_texts = []
            for page in PdfReader(file_path).pages:
                # A page without extractable text contributes an empty string.
                page_texts.append(page.extract_text() or "")
            content = "\n".join(page_texts)
        return {"source": os.path.basename(file_path), "content": content}

    def process_documents(self, file_paths: List[str]) -> List[Dict]:
        """Load and split every file, returning one flat list of chunk records.

        Each record has the keys ``text`` (stripped chunk), ``source`` (file name)
        and ``chunk_id`` (position of the chunk within its own document).
        """
        records = []
        for file_path in file_paths:
            print(f"Loading: {file_path}")
            document = self.load_document(file_path)
            pieces = self.text_splitter.split_text(document["content"])
            records.extend(
                {
                    "text": piece.strip(),
                    "source": document["source"],
                    "chunk_id": position,
                }
                for position, piece in enumerate(pieces)
            )
        return records

# --------------------------------------------------------------
#  2. DPR RETRIEVER – dense passage retrieval (DPR embeddings + FAISS index)
# --------------------------------------------------------------
from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
import torch, faiss, os, json, numpy as np
from typing import List, Dict

class DPRRetriever:
    """Dense passage retriever: DPR embeddings stored in a FAISS inner-product index.

    DPR is a dual-encoder model. Passages are embedded with the *context*
    encoder and queries with the matching *question* encoder; the two encoders
    are trained jointly so that their inner product ranks relevant passages
    first. Passages are therefore encoded with ``model_name`` and queries with
    ``question_model_name``.
    """

    def __init__(
        self,
        model_name: str = "facebook/dpr-ctx_encoder-multiset-base",
        question_model_name: str = "facebook/dpr-question_encoder-multiset-base",
    ):
        """Load the context encoder; the question encoder is loaded on first use.

        Args:
            model_name: checkpoint of the DPR context (passage) encoder.
            question_model_name: checkpoint of the DPR question encoder that
                was trained together with ``model_name``.
        """
        self.model_name = model_name
        self.question_model_name = question_model_name
        print(f"Loading DPR context encoder: {model_name}")
        self.tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(model_name)
        self.model     = DPRContextEncoder.from_pretrained(model_name)
        self.model.eval()
        # Loaded lazily by _ensure_question_encoder() so index building does not pay for it.
        self.question_tokenizer = None
        self.question_model = None
        self.index = None
        self.chunks = []
        self.embeddings = None

    # --------------------------------------------------------------
    #  Lazily load the question (query) encoder
    # --------------------------------------------------------------
    def _ensure_question_encoder(self):
        """Load the question encoder and its tokenizer if not already present."""
        if self.question_model is not None and self.question_tokenizer is not None:
            return
        # Local import: the notebook's import cell only pulls in the context-encoder classes.
        from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizerFast

        print(f"Loading DPR question encoder: {self.question_model_name}")
        self.question_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained(
            self.question_model_name
        )
        self.question_model = DPRQuestionEncoder.from_pretrained(self.question_model_name)
        self.question_model.eval()

    # --------------------------------------------------------------
    #  Build FAISS index from list of chunks
    # --------------------------------------------------------------
    def build_index(self, chunks: List[Dict], batch_size: int = 16):
        """Embed every chunk's ``"text"`` and build an exact inner-product index.

        Args:
            chunks: non-empty list of dicts with at least a ``"text"`` key.
            batch_size: number of chunks encoded per forward pass.

        Raises:
            ValueError: if ``chunks`` is empty or ``batch_size`` is not positive.
        """
        if not chunks:
            raise ValueError("build_index() needs at least one chunk")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        texts = [c["text"] for c in chunks]
        all_emb = []

        print(f"Encoding {len(texts)} chunks (max 512 tokens)…")
        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i+batch_size]
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"
                )
                outputs = self.model(**inputs)
                # DPR embeddings are not L2-normalised: scores are raw inner
                # products, which is why an IndexFlatIP index is used below.
                emb = outputs.pooler_output.cpu().numpy()
                all_emb.append(emb)

                print(f"  → {min(i+batch_size, len(texts))}/{len(texts)}", end="\r")

        embeddings = np.vstack(all_emb).astype("float32")
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Publish the new state only once everything succeeded, so chunks,
        # embeddings and index can never get out of step with each other.
        self.chunks = chunks
        self.embeddings = embeddings
        self.index = index
        print(f"\nFAISS index built – {self.index.ntotal} vectors")

    # --------------------------------------------------------------
    #  Save index + metadata
    # --------------------------------------------------------------
    def save_index(self, path: str = "index"):
        """Write ``faiss.index``, ``chunks.json`` and ``embeddings.npy`` into ``path``.

        Raises:
            RuntimeError: if no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("No index available – call build_index() or load_index() first")
        os.makedirs(path, exist_ok=True)
        faiss.write_index(self.index, os.path.join(path, "faiss.index"))
        with open(os.path.join(path, "chunks.json"), "w", encoding="utf-8") as f:
            json.dump(self.chunks, f)
        np.save(os.path.join(path, "embeddings.npy"), self.embeddings)
        print(f"Index saved to {path}/")

    # --------------------------------------------------------------
    #  Load a previously saved index
    # --------------------------------------------------------------
    def load_index(self, path: str = "index"):
        """Restore index, chunk metadata and embeddings written by ``save_index``.

        The encoders loaded by ``__init__`` are reused; nothing is re-downloaded.

        Raises:
            ValueError: if the number of chunks does not match the number of
                vectors in the index (retrieval would return wrong passages).
        """
        index = faiss.read_index(os.path.join(path, "faiss.index"))
        with open(os.path.join(path, "chunks.json"), encoding="utf-8") as f:
            chunks = json.load(f)
        embeddings = np.load(os.path.join(path, "embeddings.npy"))

        if len(chunks) != index.ntotal:
            raise ValueError(
                f"Corrupt index in {path}/: {len(chunks)} chunks but {index.ntotal} vectors"
            )

        self.index = index
        self.chunks = chunks
        self.embeddings = embeddings
        print(f"Index loaded from {path}/ ({self.index.ntotal} vectors)")

    # --------------------------------------------------------------
    #  Retrieve top-k passages for a query
    # --------------------------------------------------------------
    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return up to ``top_k`` chunks ranked by inner product with the query.

        Args:
            query: natural-language question.
            top_k: maximum number of passages to return.

        Returns:
            List of ``{"text", "source", "score"}`` dicts, best match first. It
            is shorter than ``top_k`` when the index holds fewer vectors.

        Raises:
            RuntimeError: if no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("No index available – call build_index() or load_index() first")

        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return []

        self._ensure_question_encoder()
        with torch.no_grad():
            inputs = self.question_tokenizer(
                [query],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            )
            q_emb = self.question_model(**inputs).pooler_output.cpu().numpy().astype("float32")

        distances, indices = self.index.search(q_emb, k)
        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads missing hits with index -1; without this guard that
            # would silently return the *last* chunk via negative indexing.
            if idx < 0:
                continue
            chunk = self.chunks[int(idx)]
            results.append({
                "text":   chunk["text"],
                "source": chunk["source"],
                "score":  float(dist)
            })
        return results
# ======================================================================
# 3. NLI VERIFIER
# ======================================================================
class NLIVerifier:
    """Check generated sentences against retrieved context with an MNLI classifier."""

    # NOTE(review): this map is only applied when the pipeline emits generic
    # "LABEL_n" ids instead of class names. It assumes the TextAttack MNLI
    # convention (0=contradiction, 1=entailment, 2=neutral) — confirm against
    # the model card or a few known premise/hypothesis pairs, or pass an
    # explicit ``label_map`` to the constructor.
    DEFAULT_LABEL_MAP = {
        "label_0": "contradiction",
        "label_1": "entailment",
        "label_2": "neutral",
    }

    def __init__(
        self,
        label_map: Dict[str, str] = None,
        entail_threshold: float = 0.80,
        pass_rate: float = 0.70,
    ):
        """Load the NLI pipeline (on GPU 0 when CUDA is available, else CPU).

        Args:
            label_map: lower-cased raw pipeline label -> NLI class name. Labels
                missing from the map are kept as-is. ``None`` (default) uses
                ``DEFAULT_LABEL_MAP``.
            entail_threshold: a sentence is verified only when its label is
                ``"entailment"`` with a score strictly above this value.
            pass_rate: a response is verified when at least this fraction of
                its sentences is verified.
        """
        print("Loading RoBERTa-base-MNLI…")
        self.nli = pipeline(
            "text-classification",
            model="textattack/roberta-base-MNLI",
            tokenizer="textattack/roberta-base-MNLI",
            device=0 if torch.cuda.is_available() else -1
        )
        self.label_map = dict(self.DEFAULT_LABEL_MAP if label_map is None else label_map)
        self.entail_threshold = entail_threshold
        self.pass_rate = pass_rate

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Split on sentence-ending punctuation followed by whitespace, or on newlines.

        Splitting on every "." would cut decimals such as "1.25 million" in two.
        """
        import re  # local import: `re` is not imported at the top of the notebook

        pieces = re.split(r"(?<=[.!?])\s+|\n+", text)
        # Drop fragments without any letter or digit (e.g. a stray ".").
        return [p.strip() for p in pieces if any(ch.isalnum() for ch in p)]

    def verify(self, premise: str, hypothesis: str) -> Dict:
        """Classify one (premise, hypothesis) pair.

        Returns:
            ``{"label", "score", "verified"}`` where ``label`` is the lower-cased,
            mapped class name and ``verified`` is True only for an entailment
            whose score exceeds ``entail_threshold``.
        """
        # Pass the pair as text/text_pair so the tokenizer inserts the model's
        # own separator tokens; a literal "[SEP]" is plain text to RoBERTa.
        # truncation=True keeps over-long inputs within the model's limit.
        raw = self.nli({"text": premise, "text_pair": hypothesis}, truncation=True)
        # The pipeline returns a dict for a single dict input and a list otherwise.
        result = raw[0] if isinstance(raw, (list, tuple)) else raw

        label = str(result["label"]).lower()
        label = self.label_map.get(label, label)
        score = result["score"]

        return {
            "label": label,
            "score": score,
            "verified": bool(label == "entailment" and score > self.entail_threshold)
        }

    def verify_response(self, response: str, context: List[Dict]) -> Dict:
        """Verify every sentence of ``response`` against the concatenated context.

        Args:
            response: generated answer text.
            context: retrieved chunks; only each chunk's ``"text"`` is used.

        Returns:
            ``{"verified", "entailment_score", "details"}``. ``entailment_score``
            is the fraction of verified sentences and ``details`` holds one
            ``verify`` result per sentence. A response without any sentence is
            reported as verified with score 1.0.
        """
        combined = " ".join([c["text"] for c in context])
        sentences = self._split_sentences(response)

        if not sentences:
            return {"verified": True, "entailment_score": 1.0, "details": []}

        results = [self.verify(combined, s) for s in sentences]
        entail_rate = sum(r["verified"] for r in results) / len(results)

        return {
            "verified": entail_rate >= self.pass_rate,
            "entailment_score": entail_rate,
            "details": results
        }


In [None]:
# ──────────────────────────────────────────────────────────────
# CELL 4: Your Document Paths
# ──────────────────────────────────────────────────────────────
# Source documents on the mounted Google Drive; this list is the input to the
# chunking step in the next cell.
document_paths = [
    "/content/drive/MyDrive/Tb_Documents/Global Tuberculosis Report 2024 by WHO.pdf",
    "/content/drive/MyDrive/Tb_Documents/Management_of_Tuberculosis_(4th_Edition) by MHO.pdf"
]

# Verify files exist
for p in document_paths:
    # Stop here, naming the missing path, before any document is loaded.
    assert os.path.exists(p), f"File not found: {p}"
print("All files found!")

All files found!


In [None]:
# ──────────────────────────────────────────────────────────────
# CELL 5: Build Index
# ──────────────────────────────────────────────────────────────
# Split the documents into chunks. chunk_size=400 overrides the class default;
# chunk_overlap is left at its default.
processor = DocumentProcessor(chunk_size=400)
chunks = processor.process_documents(document_paths)
print(f"Total chunks: {len(chunks)}")

# Embed every chunk, build the FAISS index, then persist it to the ./index directory.
retriever = DPRRetriever()
retriever.build_index(chunks)
retriever.save_index("index")

Loading: /content/drive/MyDrive/Tb_Documents/Global Tuberculosis Report 2024 by WHO.pdf
Loading: /content/drive/MyDrive/Tb_Documents/Management_of_Tuberculosis_(4th_Edition) by MHO.pdf
Total chunks: 1256
Loading DPR context encoder: facebook/dpr-ctx_encoder-multiset-base


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

Encoding 1256 chunks (max 512 tokens)…
  → 1256/1256
FAISS index built – 1256 vectors
Index saved to index/


In [None]:
# ──────────────────────────────────────────────────────────────
# CELL 6: Download Index
# ──────────────────────────────────────────────────────────────
# Bundle the saved index directory into a single archive.
!zip -r index.zip index/
from google.colab import files
# Hand index.zip to the browser for download.
files.download("index.zip")
# NOTE(review): this message prints as soon as files.download() returns — confirm
# that this really means the browser has finished saving the file.
print("DOWNLOAD COMPLETE! Unzip in VS Code.")

  adding: index/ (stored 0%)
  adding: index/faiss.index (deflated 7%)
  adding: index/chunks.json (deflated 72%)
  adding: index/embeddings.npy (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

DOWNLOAD COMPLETE! Unzip in VS Code.
