In [1]:
import os, pickle, requests, pdfplumber
from dotenv import load_dotenv
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
import gradio as gr

# Pull variables from a local .env file into the process environment, then
# read the OpenAI key that the later cells pass to the chat model.
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")

# Explicit check rather than `assert`: assertions are removed when Python runs
# with -O, which would let a missing key surface later as an opaque API error.
if not API_KEY:
    raise RuntimeError("Please set OPENAI_API_KEY in your .env file")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2 (simplified): Download one EMA PDF & convert to text

import os, requests, pdfplumber

# Target folder for the downloaded PDF and its plain-text rendition.
os.makedirs("data", exist_ok=True)

# Single source document: the EMA Bioanalytical Method Validation guideline.
url = "https://www.ema.europa.eu/en/documents/scientific-guideline/guideline-bioanalytical-method-validation_en.pdf"
pdf_path = "data/ema_bioanalytical_guideline.pdf"
txt_path = pdf_path.replace(".pdf", ".txt")

# Fetch the PDF (30 s timeout); an HTTP error status aborts the cell here,
# before anything is written to disk.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(pdf_path, "wb") as pdf_file:
    pdf_file.write(response.content)

# Extract the text of every page; a page with no extractable text becomes "".
with pdfplumber.open(pdf_path) as pdf:
    pages = [page.extract_text() or "" for page in pdf.pages]

# Write all pages to one UTF-8 text file, separated by a blank line.
with open(txt_path, "w", encoding="utf-8") as txt_file:
    txt_file.write("\n\n".join(pages))

print("✅ EMA guideline downloaded and converted to text in ./data/")


✅ EMA guideline downloaded and converted to text in ./data/


In [3]:
from langchain.schema import Document
import os

def load_docs(folder="data"):
    """Load every ``.txt`` file directly inside ``folder`` as a ``Document``.

    Args:
        folder: Directory to scan (not recursive). Defaults to ``"data"``.

    Returns:
        A list of ``Document`` objects ordered by file name. Each one holds
        the file's full text as ``page_content`` and
        ``{"source": <file name>}`` as metadata.

    Raises:
        FileNotFoundError: If ``folder`` does not exist (from ``os.listdir``).
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and anything built from it) reproducible across runs.
    for fn in sorted(os.listdir(folder)):
        path = os.path.join(folder, fn)
        # Skip non-text entries, and directories whose name ends in ".txt"
        # (open() would fail on those).
        if not fn.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            txt = f.read()
        docs.append(Document(page_content=txt, metadata={"source": fn}))
    return docs

# Read the converted text file(s) from ./data and report how many were found.
all_docs = load_docs(folder="data")
n_docs = len(all_docs)
print(f"🔍 Loaded {n_docs} text docs for indexing.")


🔍 Loaded 1 text docs for indexing.


In [4]:
# Cell 4: Build & persist FAISS index with a local HF embedder
!pip install --quiet sentence-transformers

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pickle

# Imported here because chunking is only needed by this cell.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each document into overlapping passages before embedding. Indexing a
# whole guideline as one Document produces a single vector for the entire
# text, so the retriever cannot pick out relevant passages and the QA chain
# would be handed the full document on every question.
# NOTE(review): 800/100 characters is a starting point sized for a small
# sentence-embedding model — tune against retrieval quality.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
# Each chunk keeps its parent document's metadata, including "source".
doc_chunks = splitter.split_documents(all_docs)
if not doc_chunks:
    raise ValueError("No text to index: check that ./data contains non-empty .txt files")

# Local sentence-transformers model, so building the index makes no OpenAI calls.
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
faiss_index = FAISS.from_documents(doc_chunks, embedder)

# Persist the store for reuse after a kernel restart.
# NOTE: unpickling executes arbitrary code — only ever load faiss_index.pkl
# from a location you trust.
with open("faiss_index.pkl", "wb") as f:
    pickle.dump(faiss_index, f)

print("✅ FAISS index built locally (no OpenAI calls) and saved to faiss_index.pkl")


  embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


✅ FAISS index built locally (no OpenAI calls) and saved to faiss_index.pkl


In [5]:
# Cell 5: Wire up the RAG chain with GPT-3.5 using the v1.0+ API

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# (If you restarted, reload the FAISS store:)
# with open("faiss_index.pkl", "rb") as f:
#     faiss_index = pickle.load(f)

# Retrieval side: return up to 5 of the most similar indexed entries per query.
retriever = faiss_index.as_retriever(search_kwargs=dict(k=5))

# Generation side: GPT-3.5 chat model at temperature 0, authenticated with
# the key read from the environment in the first cell.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=API_KEY)

# Combine both into a question-answering chain. return_source_documents=True
# makes the chain's output include the retrieved documents next to the answer.
rag = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    return_source_documents=True,
)

print("✅ RAG chain is ready!")


  llm = ChatOpenAI(


✅ RAG chain is ready!


In [6]:
def answer_with_sources(question):
    """Answer ``question`` with the RAG chain and list the files it drew on.

    Args:
        question: The user's question as plain text.

    Returns:
        A Markdown string with an ``**Answer:**`` section followed by a
        ``**Sources:**`` bullet list containing each distinct source once.
        For a blank question, a short prompt to enter one is returned and
        the chain is not called.
    """
    # Don't spend an LLM call on an empty submission.
    if not question or not question.strip():
        return "Please enter a question."

    # invoke() replaces calling the chain object directly, which LangChain
    # reports as deprecated.
    res = rag.invoke({"query": question})
    answer = res["result"]

    # Several retrieved documents can share one source file; dict.fromkeys
    # de-duplicates while keeping retrieval order. A missing "source" entry
    # is reported as "unknown" instead of raising KeyError.
    unique_sources = dict.fromkeys(
        d.metadata.get("source", "unknown") for d in res["source_documents"]
    )
    sources = "\n".join(f"- {name}" for name in unique_sources)
    return f"**Answer:**\n{answer}\n\n**Sources:**\n{sources}"

# Two-line free-text box where the user types a question.
question_box = gr.Textbox(lines=2, placeholder="Ask medical questions…")

# Gradio UI: question text in, Markdown (answer plus sources) out.
iface = gr.Interface(
    title="🩺 Medical RAG Demo",
    description="GPT-3.5 + EMA Bioanalytical Guideline",
    fn=answer_with_sources,
    inputs=question_box,
    outputs="markdown",
)

# share=True requests a public gradio.live URL in addition to the local one.
# NOTE(review): anyone holding that link can submit questions that are billed
# to the configured OpenAI key — confirm public sharing is intended.
iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://c73119fb403c441137.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




  res = rag({"query": question})
