In [None]:
!pip install torch transformers faiss-cpu biopython

In [None]:
!pip install faiss-cpu --no-cache-dir
!pip install sentence-transformers biopython

In [None]:
!pip install --upgrade protobuf==3.20.3

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub before downloading the
# meta-llama checkpoint used later in this notebook.
# SECURITY: an access token used to be pasted here in a comment. Never keep a
# token in the notebook: revoke any token that was shared this way and supply
# a fresh one through the HF_TOKEN environment variable. When HF_TOKEN is not
# set, login() falls back to its usual prompt / cached-credential behaviour.
login(token=os.environ.get("HF_TOKEN") or None, new_session=False)

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

HF_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
# Local directory the weights are written to. It has to be the directory that
# LLMClient loads from by default ("/kaggle/working/Llama-2-7b-hf"); the former
# value "/meta-llama/Llama-2-7b-chat-hf" pointed at the filesystem root and was
# never read back by any cell in this notebook.
MODEL_DIR = "/kaggle/working/Llama-2-7b-hf"

os.makedirs(MODEL_DIR, exist_ok=True)

# Download tokenizer (the fast tokenizer implementation is requested).
tokenizer = AutoTokenizer.from_pretrained(
    HF_MODEL_ID,
    use_fast=True
)

# Download model in float16; device_map=None leaves device placement to the
# caller, so nothing is dispatched to an accelerator here.
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_ID,
    torch_dtype=torch.float16,
    device_map=None,
    low_cpu_mem_usage=True,
)

# Persist tokenizer and weights to MODEL_DIR so they can be loaded from local
# disk afterwards.
tokenizer.save_pretrained(MODEL_DIR)
model.save_pretrained(MODEL_DIR)

print("Model downloaded and stored at:", MODEL_DIR)


In [None]:
import torch
import faiss
import numpy as np
from typing import List, Dict, Any

from sentence_transformers import SentenceTransformer
from Bio import Entrez

# Contact e-mail registered on the Bio.Entrez module for its NCBI requests.
# NOTE(review): this is a personal address committed to the notebook - consider
# loading it from configuration instead of hard-coding it.
Entrez.email = "khushpatel1080@gmail.com"


# Sentence-Transformers checkpoint used as the default model of Embedder.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"




In [None]:


class Embedder:
    """Thin wrapper around a SentenceTransformer that returns float32 vectors."""

    def __init__(self, model_name: str = EMBED_MODEL_NAME):
        # The model is loaded once here and reused by every encode() call.
        self.model = SentenceTransformer(model_name)

    def encode(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the result as a float32 NumPy array.

        ``normalize_embeddings=True`` is passed to the model so that an
        inner-product search over the vectors acts as cosine similarity.
        """
        encode_options = {
            "convert_to_numpy": True,
            "normalize_embeddings": True,
        }
        vectors = self.model.encode(texts, **encode_options)
        return vectors.astype("float32")


In [None]:
!pip install bio

In [None]:

class PubMedClient:
    """Minimal PubMed client built on Bio.Entrez (ESearch followed by EFetch)."""

    def __init__(self, max_results: int = 5):
        # Upper bound on the number of PubMed IDs requested per query.
        self.max_results = max_results

    @staticmethod
    def _parse_articles(records) -> List[Dict[str, Any]]:
        """Turn a parsed EFetch PubMed XML result into plain document dicts.

        Parameters
        ----------
        records : the object returned by ``Entrez.read`` for an EFetch call
            with ``retmode="xml"``. Either a mapping with a ``"PubmedArticle"``
            list or a bare list of article records is accepted.

        Returns
        -------
        A list of ``{"id", "title", "abstract"}`` dicts, one per article that
        carries a PMID. A missing title or abstract becomes an empty string;
        multi-section abstracts are joined with single spaces.
        """
        if isinstance(records, dict):
            articles = records.get("PubmedArticle", [])
        else:
            articles = list(records or [])

        docs = []
        for entry in articles:
            citation = entry.get("MedlineCitation", {})
            pmid = str(citation.get("PMID", "")).strip()
            if not pmid:
                # Without a PMID the record cannot be referenced downstream.
                continue

            article = citation.get("Article", {})
            sections = article.get("Abstract", {}).get("AbstractText", [])
            if isinstance(sections, str):
                sections = [sections]
            abstract = " ".join(
                str(part).strip() for part in sections if str(part).strip()
            )

            docs.append(
                {
                    "id": pmid,
                    "title": str(article.get("ArticleTitle", "")).strip(),
                    "abstract": abstract,
                }
            )
        return docs

    def fetch_docs(self, query: str) -> List[Dict[str, Any]]:
        """Search PubMed for ``query`` and return the matching articles.

        Returns a list of ``{"id", "title", "abstract"}`` dicts in the order
        of the ESearch result (sorted by relevance), or ``[]`` when the search
        yields no IDs.
        """
        # 1) ESearch: get PubMed IDs
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=self.max_results,
            sort="relevance",
            retmode="xml",
        )
        try:
            search_record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()

        ids = [str(pmid) for pmid in search_record.get("IdList", [])]
        if not ids:
            return []

        # 2) EFetch as XML. The plain-text "abstract" rendering separates the
        # paragraphs *inside* one record with blank lines as well, so it
        # cannot be split reliably into one chunk per PMID.
        handle = Entrez.efetch(
            db="pubmed",
            id=",".join(ids),
            retmode="xml",
        )
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()

        # Re-order by the ESearch ranking so relevance order is kept.
        by_id = {doc["id"]: doc for doc in self._parse_articles(records)}
        return [by_id[pmid] for pmid in ids if pmid in by_id]


In [None]:
import os
import requests
from Bio import Entrez

class PDFDownloader:
    """Best-effort downloader for PubMed Central PDFs.

    Every failure is reported with ``print`` rather than raised, so a missing
    or blocked PDF never interrupts the calling pipeline.
    """

    def __init__(self, download_dir="/kaggle/working/downloaded_pdfs", timeout: float = 30.0):
        """Create the target directory if needed.

        download_dir : directory the PDFs are written to.
        timeout      : seconds to wait for the PDF request before giving up.
        """
        self.download_dir = download_dir
        self.timeout = timeout
        os.makedirs(self.download_dir, exist_ok=True)

    @staticmethod
    def _sanitize_title(title):
        """Keep only alphanumerics, spaces and hyphens; drop trailing whitespace."""
        return "".join(c for c in title if c.isalnum() or c in (' ', '-')).rstrip()

    def get_pmc_id(self, pmid):
        """Converts a PubMed ID (PMID) to a PubMed Central ID (PMCID) for Open Access download.

        Returns the first linked PMC id, or ``None`` when there is no link or
        the lookup fails (the error is printed, not raised).
        """
        try:
            handle = Entrez.elink(dbfrom="pubmed", db="pmc", linkname="pubmed_pmc", id=pmid)
            try:
                result = Entrez.read(handle)
            finally:
                # Release the handle even when parsing raises.
                handle.close()
            # Extract PMC ID if a link exists
            if result and result[0]['LinkSetDb']:
                return result[0]['LinkSetDb'][0]['Link'][0]['Id']
        except Exception as e:
            print(f"Could not find PMC ID for PMID {pmid}: {e}")
        return None

    def download_pdf(self, pmid, title):
        """Attempts to download the PDF if a PMC ID is found, skipping if file exists.

        pmid  : PubMed ID of the article.
        title : article title, used (sanitized and truncated to 50 characters)
                as the file-name prefix; ``None`` is treated as an empty title.

        Always returns ``None``; the outcome is reported through ``print``.
        """
        title = str(title or "")
        safe_title = self._sanitize_title(title)

        # 1. ATTEMPT TO GET PMC ID
        pmc_id = self.get_pmc_id(pmid)

        if not pmc_id:
            print(f"PDF download skipped for '{title[:30]}...' (No PMC ID found for PMID {pmid}). The article is likely **not archived in PubMed Central (PMC)**.")
            return

        filename = f"{self.download_dir}/{safe_title[:50]}_PMC{pmc_id}.pdf"

        # 2. CHECK FOR EXISTING FILE
        if os.path.exists(filename):
            print(f"[SKIP-2] PDF already in storage: {filename}")
            return

        # 3. ATTEMPT TO DOWNLOAD OPEN ACCESS PDF
        pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"

        try:
            headers = {'User-Agent': 'Mozilla/5.0 (ColabUser; mailto:khushpatel1080@gmail.com)'}
            # The timeout keeps a stalled server from hanging the notebook; the
            # context manager returns the connection to the pool.
            with requests.get(pdf_url, headers=headers, timeout=self.timeout) as response:
                status = response.status_code
                content = response.content if status == 200 else b""

            if status != 200:
                # Most common failure reason is a 403 Forbidden for paywalled content
                print(f"[FAIL-3] Could not retrieve Open Access PDF for PMC{pmc_id}. Status: {status}. **The article is likely paywalled.** URL: {pdf_url}")
                return

            # A 200 response can still be an HTML landing/challenge page; only
            # store payloads that carry the PDF signature near the start.
            if b"%PDF" not in content[:1024]:
                print(f"[FAIL-3] Response for PMC{pmc_id} is not a PDF document; nothing saved. URL: {pdf_url}")
                return

            # Write to a temporary name first so an interrupted write never
            # leaves a truncated file that step 2 would later treat as complete.
            partial = filename + ".part"
            with open(partial, 'wb') as f:
                f.write(content)
            os.replace(partial, filename)
            print(f"[SUCCESS] Downloaded: {filename}")
        except Exception as e:
            print(f"[ERROR] downloading PMC{pmc_id}: {e}")

In [None]:
from sentence_transformers import CrossEncoder


# Cross-encoder passed to select_top3_docs, which uses its predict() scores
# on (query, document text) pairs to re-rank the retrieved documents.
reward_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


In [None]:
def select_top3_docs(query: str, docs: list, reward_model) -> list:
    """
    Keep the three documents the reward model scores highest for ``query``.

    docs         : list of dicts; only the 'text' field is read here.
    reward_model : object whose ``predict`` takes a list of (query, text)
                   pairs and returns one score per pair.

    With three documents or fewer the input list is returned untouched (no
    scoring). Otherwise a new list of three documents is returned, ordered by
    descending score; ties keep their original relative order.
    """
    if len(docs) <= 3:
        return docs  # nothing to select

    scores = reward_model.predict([(query, candidate['text']) for candidate in docs])
    # sorted() is stable, so equal scores stay in input order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return [docs[position] for position, _ in ranked[:3]]


In [None]:

class DocumentStore:
    """In-memory document cache searched through FAISS inner-product indexes.

    Two indexes are kept side by side, both keyed by the same locally
    allocated integer id:

    * ``keyword_index`` holds the embedding of the query / keywords that
      fetched a document;
    * ``doc_index`` holds the embedding of the document text itself.

    ``meta`` maps the local id to the stored text, keyword text, external id,
    title and both embeddings.
    """

    def __init__(self, embedder: "Embedder", dim: int):
        """
        embedder : object whose ``encode(list_of_str)`` returns a 2-D array
                   with one row per input string.
        dim      : dimensionality of those rows.
        """
        self.embedder = embedder
        self.dim = dim

        self.keyword_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
        self.doc_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

        self.meta: Dict[int, Dict[str, Any]] = {}

        # Local ids start at 1 and only ever increase.
        self._next_local_id = 1

    def _alloc_id(self) -> int:
        """Return the next unused local document id."""
        doc_id = self._next_local_id
        self._next_local_id += 1
        return doc_id

    def add_document(
        self,
        text: str,
        keyword_text: str,
        external_id: str = None,
        title: str = None,
    ) -> int:
        """
        Embed and store one document; returns its local id.

        text         : full document text (e.g., title + abstract)
        keyword_text : the query / keywords used to fetch this doc
        external_id  : optional external ID (e.g., PubMed PMID)
        title        : optional document title, returned with search results
        """
        # 0th index = keyword embedding
        keyword_emb = self.embedder.encode([keyword_text])[0]
        # 1st index = full document embedding
        doc_emb = self.embedder.encode([text])[0]

        # Store as a small 2 x dim matrix: [0] keyword, [1] full doc
        embeddings = np.stack([keyword_emb, doc_emb], axis=0)  # shape (2, dim)

        doc_id = self._alloc_id()

        self.meta[doc_id] = {
            "text": text,
            "keyword_text": keyword_text,
            "external_id": external_id,
            "title": title or "",
            "embeddings": embeddings,   # here '0th index' is embeddings[0]
        }

        # Add to FAISS indexes (one vector per doc per index)
        id_array = np.array([doc_id], dtype="int64")
        self.keyword_index.add_with_ids(keyword_emb[None, :], id_array)
        self.doc_index.add_with_ids(doc_emb[None, :], id_array)

        return doc_id

    def search_local(
        self,
        query: str,
        top_k: int = 5,
        keyword_threshold: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """
        Search only in local FAISS indexes.

        The query embedding is compared with the stored keyword embeddings
        (``keyword_index``); ``doc_index`` is not consulted here. Hits whose
        inner-product score is below ``keyword_threshold`` are discarded.

        Each result has the keys ``doc_id``, ``similarity_keyword``, ``text``,
        ``keyword_text`` and ``external_id``. A ``title`` key is added only
        when a non-empty title was stored, so callers using
        ``doc.get('title', default)`` keep receiving their default otherwise.
        """
        q_emb = self.embedder.encode([query]).astype("float32")  # shape (1, dim)

        # Keyword-level FAISS search
        D_kw, I_kw = self.keyword_index.search(q_emb, top_k)

        sims = D_kw[0]
        ids = I_kw[0]
        mask = sims >= keyword_threshold

        results = []
        for doc_id, sim in zip(ids[mask].tolist(), sims[mask].tolist()):
            # An id of -1 marks an empty slot in the search output.
            if doc_id == -1:
                continue
            meta = self.meta.get(doc_id)
            if not meta:
                continue

            hit = {
                "doc_id": doc_id,
                "similarity_keyword": float(sim),
                "text": meta["text"],
                "keyword_text": meta["keyword_text"],
                "external_id": meta["external_id"],
            }
            # .get(): metadata restored from an older pickle has no title.
            stored_title = meta.get("title")
            if stored_title:
                hit["title"] = stored_title
            results.append(hit)

        return results

    def search_with_pubmed_backfill(
        self,
        query: str,
        pubmed_client: "PubMedClient",
        top_k: int = 5,
        keyword_threshold: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """
        Try local FAISS search; if nothing passes the threshold,
        call PubMed, store new docs, and then search again.

        ``pubmed_client.fetch_docs(query)`` must return dicts with the keys
        ``id``, ``title`` and ``abstract``. Every fetched document is stored
        with ``query`` as its keyword text.
        """
        print(f"Checking local vector store for query: '{query}'...")

        local_results = self.search_local(
            query, top_k=top_k, keyword_threshold=keyword_threshold
        )

        if local_results:
            print(f" -> Found {len(local_results)} documents in local storage (Similarity >= {keyword_threshold}).")
            print(" -> Skipping external PubMed search.")
            return local_results

        print(" -> No sufficient local matches found. Fetching from external PubMed API...")

        pubmed_docs = pubmed_client.fetch_docs(query)
        print(f" -> PubMed API retrieved {len(pubmed_docs)} documents.")

        initial_index_size = self.keyword_index.ntotal
        print(f" FAISS Index Size BEFORE adding: {initial_index_size}")

        for i, d in enumerate(pubmed_docs):
            title = d.get('title', '') or ''
            abstract = d.get('abstract', '') or ''
            # Skip the empty part so the text never starts with a bare newline.
            full_text = "\n".join(part for part in (title, abstract) if part)
            self.add_document(
                text=full_text,
                keyword_text=query,
                external_id=d.get("id"),
                title=title,
            )

            print(f" Successfully added document {i+1}/{len(pubmed_docs)}: PMID {d.get('id')}")

        final_index_size = self.keyword_index.ntotal
        print(f"  FAISS Index Size AFTER adding: {final_index_size}")

        return self.search_local(query, top_k=top_k, keyword_threshold=keyword_threshold)






In [None]:
def build_rag_prompt(user_query: str, retrieved_docs: List[Dict[str, Any]]) -> str:
    """Assemble the grounded-answer prompt sent to the language model.

    Each document becomes a numbered "Source" entry showing its title
    ('Unknown' when the dict has no 'title' key) and its 'text'. When
    ``retrieved_docs`` is empty, a short fallback prompt is returned that
    states no relevant documents were found.
    """
    # Guard clause: nothing retrieved, nothing to cite.
    if not retrieved_docs:
        return f"User question: {user_query}\nAnswer: I could not find any relevant documents to answer your question."

    source_entries = [
        f"Source {i}: [Title: {doc.get('title', 'Unknown')}]\nContent: {doc['text']}\n\n"
        for i, doc in enumerate(retrieved_docs, 1)
    ]
    context_str = "".join(source_entries)

    # strip() removes the template's own leading and trailing newline.
    return f"""
You are a research assistant. Answer the user question based ONLY on the provided Context sources.

Instructions:
1. You must base your answer strictly on the provided sources.
2. You MUST cite the specific Source Number or Title when stating facts (e.g., "According to Source 1...").
3. At the end of your answer, list the titles of the articles you used.

Context:
{context_str}

User question:
{user_query}

Answer:
""".strip()

In [None]:
class LLMClient:
    """Greedy text-generation client around a local causal language model."""

    def __init__(self, model_id: str = "/kaggle/working/Llama-2-7b-hf"):
        """Load tokenizer and model from ``model_id`` and build the pipeline.

        model_id : local directory (or model identifier) handed to
                   ``from_pretrained``.
        """
        from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

        self.model_id = model_id
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # The model object is already placed by device_map="auto" above, so
        # the pipeline receives it as-is without a second device_map.
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def generate(self, prompt: str, max_tokens: int = 256) -> str:
        """Return the model's completion for ``prompt`` without the prompt itself.

        prompt     : full prompt text.
        max_tokens : maximum number of newly generated tokens.

        Decoding is deterministic (``do_sample=False``). The result is
        stripped of surrounding whitespace.
        """
        outputs = self.pipe(
            prompt,
            max_new_tokens=max_tokens,
            do_sample=False,
            eos_token_id=self.tokenizer.eos_token_id,
            # Ask the pipeline for the completion only. Slicing len(prompt)
            # characters off the decoded text is unreliable, because decoding
            # does not always reproduce the prompt character for character.
            return_full_text=False,
        )
        # text-generation pipeline returns a list of dicts with 'generated_text'
        completion = outputs[0]["generated_text"]
        # Defensive: drop the prompt if it was echoed back regardless.
        if prompt and completion.startswith(prompt):
            completion = completion[len(prompt):]
        return completion.strip()


In [None]:
def answer_query(
    user_query: str,
    doc_store: DocumentStore,
    pubmed_client: PubMedClient,
    llm_client: LLMClient,
    top_k: int = 5,
    keyword_threshold: float = 0.4,
) -> str:
    """
    End-to-end question answering over the local store with PubMed backfill.

    Steps:
    - search the local FAISS store (cache check); when nothing passes
      ``keyword_threshold`` the store fetches from PubMed and searches again
    - re-rank the retrieved documents with the module-level ``reward_model``
      and keep at most three
    - try to download a PDF for every retrieved document that has an
      external (PubMed) id
    - build the RAG prompt from the re-ranked documents
    - generate and return the answer

    user_query        : search keywords / question text
    top_k             : number of candidates requested from the store (this
                        used to be ignored in favour of a hard-coded 10)
    keyword_threshold : minimum keyword similarity for a local hit
    """
    retrieved = doc_store.search_with_pubmed_backfill(
        query=user_query,
        pubmed_client=pubmed_client,
        top_k=top_k,
        keyword_threshold=keyword_threshold,
    )

    # Selecting best 3 using reward model
    top3_docs = select_top3_docs(user_query, retrieved, reward_model)

    print(f"\n[System] Found {len(retrieved)} relevant abstracts.")

    # Initialize the downloader class we created earlier
    downloader = PDFDownloader()

    # Loop through every article found to get its full PDF
    for doc in retrieved:
        pmid = doc.get('external_id')    # Get the PubMed ID
        title = doc.get('title', 'Untitled')

        if pmid:
            # This triggers the download (or skips if file exists)
            downloader.download_pdf(pmid, title)

    prompt = build_rag_prompt(user_query, top3_docs)
    answer = llm_client.generate(prompt)

    return answer



In [None]:
import torch
# Embedding model wrapper shared by the document store.
embedder = Embedder(EMBED_MODEL_NAME)
dim = embedder.encode(["test"]).shape[1]   # infer embedding dimension

# Fresh (empty) document store, a PubMed client capped at 3 results per query,
# and the LLM client loaded from its default model path.
doc_store = DocumentStore(embedder=embedder, dim=dim)
pubmed_client = PubMedClient(max_results=3)
llm_client = LLMClient()

print("Embedding dimension:", dim)


In [None]:
import pickle, os

# Persist the FAISS indexes and the metadata dict of the document store.
#
# Guard: when this cell runs on a store that is still empty (e.g. during
# "Restart & Run All", right after the store is created) while a cache from an
# earlier session is already on disk, writing would replace that cache with
# empty indexes. In that situation the files are left untouched. If no cache
# exists yet, the (possibly empty) store is written as before.
_cache_paths = (
    "/kaggle/working/keyword.index",
    "/kaggle/working/doc.index",
    "/kaggle/working/metadata.pkl",
)
_cache_on_disk = all(os.path.exists(_path) for _path in _cache_paths)

if doc_store.keyword_index.ntotal == 0 and _cache_on_disk:
    print("Document store is empty; existing on-disk cache left untouched.")
else:
    faiss.write_index(doc_store.keyword_index, "/kaggle/working/keyword.index")
    faiss.write_index(doc_store.doc_index, "/kaggle/working/doc.index")

    with open("/kaggle/working/metadata.pkl", "wb") as f:
        pickle.dump(doc_store.meta, f)




In [None]:
# Restore both FAISS indexes from the files written by the save cell.
doc_store.keyword_index = faiss.read_index("/kaggle/working/keyword.index")
doc_store.doc_index = faiss.read_index("/kaggle/working/doc.index")

import pickle
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a metadata file that this notebook wrote itself.
with open("/kaggle/working/metadata.pkl", "rb") as f:
    doc_store.meta = pickle.load(f)

# Resume local id allocation after the highest restored id (1 when the
# restored metadata is empty).
doc_store._next_local_id = max(doc_store.meta.keys(), default=0) + 1


In [None]:
# Use only the search keywords
# Demo query: the keywords below are used both for retrieval and as the
# question placed in the prompt.
search_term = "diabetes caused by sugar"

print("--- RUNNING QUERY (Clean Search Term) ---")
answer = answer_query(
    # Only send keywords to the search/retrieval system
    user_query=search_term,
    doc_store=doc_store,
    pubmed_client=pubmed_client,
    llm_client=llm_client,
    top_k=3,
    keyword_threshold=0.4,
)

# Show the query next to the generated answer.
print("\nUser search term:")
print(search_term)
print("\nSystem answer:")
print(answer)