## 📥 Step 1: Extract Text from PDF

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Open a PDF and extract all text as a single string.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The result of ``page.get_text()`` for every page, in document
        order, joined with newline characters.
    """
    # Use the document as a context manager so it is closed (releasing the
    # underlying file) even when extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

## 🧩 Step 2: Chunk Text (Sliding Window)

In [None]:
from typing import List

def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks using a sliding window.

    "Tokens" here are whitespace-separated words (``text.split()``), and each
    chunk is its tokens re-joined with single spaces.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens`` so the window always moves forward
            and no tokens are skipped.

    Returns:
        The chunks in order. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = text.split()
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        # Once a window reaches the end of the text, any further window would
        # only repeat the tail of this one (pure overlap), so stop here.
        if start + max_tokens >= len(tokens):
            break
    return chunks

## 🔎 Step 3: Generate Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once at module level; embed_chunks and
# the /search endpoint both reuse this single instance.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks: List[str]) -> List[List[float]]:
    """
    Encode the given text chunks with the module-level ``embedding_model``.

    Args:
        chunks: The text chunks to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``chunks``; a
        progress bar is shown while encoding.
        NOTE(review): the annotation promises nested lists, but ``encode``
        presumably returns a NumPy array by default -- confirm.
    """
    vectors = embedding_model.encode(chunks, show_progress_bar=True)
    return vectors

## 🗃️ Step 4: Build FAISS Index and Search

In [None]:
import faiss
import numpy as np

def build_faiss_index(embeddings: List[List[float]]) -> faiss.IndexFlatL2:
    """
    Build a ``faiss.IndexFlatL2`` index containing the given embeddings.

    Args:
        embeddings: A non-empty 2-D collection of vectors (nested lists or
            an array), one row per vector, all rows of equal length.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension is the row length and to
        which every row has been added, in order.

    Raises:
        ValueError: If ``embeddings`` is empty or is not 2-D.
    """
    # Convert once, directly to the contiguous float32 layout passed to
    # index.add, instead of building an array and then casting a copy.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def search_index(query: str, model: SentenceTransformer, index: faiss.Index, chunks: List[str], top_k: int = 3):
    """
    Return the text chunks whose embeddings are nearest to the query.

    Args:
        query: The query text; it is encoded with ``model.encode``.
        model: Embedding model used to encode the query.
        index: FAISS index to search. Its row ``i`` is assumed to correspond
            to ``chunks[i]`` -- the caller must keep the two in sync.
        chunks: The chunk texts, looked up by the indices the search returns.
        top_k: Maximum number of chunks to return; must be positive.

    Returns:
        Up to ``top_k`` chunks, in the order the index returned them. Fewer
        are returned when the index holds fewer than ``top_k`` vectors.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        raise ValueError("top_k must be a positive integer")

    query_vector = model.encode([query])[0].astype("float32").reshape(1, -1)
    _, indices = index.search(query_vector, top_k)
    # FAISS pads the result with -1 when it finds fewer than top_k neighbours;
    # without this filter chunks[-1] would silently return the last chunk.
    return [chunks[i] for i in indices[0] if i >= 0]

## 🚀 Step 5: FastAPI Search Endpoint

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel

# FastAPI application instance; the route decorators below register their
# handlers on it.
app = FastAPI()

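# NOTE: the /search endpoint reads the module-level names `faiss_index` and
# `chunks`; define both (e.g. with build_faiss_index and chunk_text) before
# sending requests to it.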
# Request body schema for POST /search.
class QueryRequest(BaseModel):
    # The search query text; passed to search_index as the query.
    q: str

# Landing route: answers GET / with a static message naming the service.
@app.get("/")
def root():
    banner = {"message": "RAG PDF Search API"}
    return banner

@app.post("/search")
def search(request: QueryRequest):
    """
    Search the indexed chunks for the query in the request body.

    Reads the module-level ``faiss_index`` and ``chunks`` and delegates to
    ``search_index`` with the shared ``embedding_model``.

    Returns:
        A dict with the original ``query`` and the matching ``results``.

    Raises:
        HTTPException: 503 when ``faiss_index`` or ``chunks`` has not been
            defined yet, instead of an unhandled ``NameError`` (HTTP 500).
    """
    # Imported locally because the file-level import only brings in FastAPI.
    from fastapi import HTTPException

    try:
        index, corpus = faiss_index, chunks
    except NameError:
        raise HTTPException(
            status_code=503,
            detail="Search index is not initialized",
        ) from None

    query = request.q
    results = search_index(query, embedding_model, index, corpus)
    return {"query": query, "results": results}