In [26]:
import os
import requests
import fitz  # PyMuPDF
import faiss
import numpy as np
import feedparser
import pickle
from typing import List
from sentence_transformers import SentenceTransformer
from fastapi import FastAPI
from contextlib import asynccontextmanager

In [27]:
#############################################
# Step 1: Data Collection (arXiv API)
#############################################

def download_arxiv_papers(query="cs.CL", max_results=50, save_dir="./papers"):
    """
    Download PDFs from arXiv based on a query and save them locally.

    Args:
        query: Value sent as the arXiv API ``search_query`` parameter.
        max_results: Value sent as the arXiv API ``max_results`` parameter.
        save_dir: Directory the PDFs are written to (created if missing).

    Raises:
        Exception: If the arXiv API request does not return HTTP 200.

    A paper whose PDF already exists in ``save_dir`` is not downloaded again.
    A PDF request that does not return HTTP 200 is reported and skipped, so
    an error page is never stored under a ``.pdf`` name.
    """
    # Seconds to wait on each HTTP call; without it a stalled connection
    # would block the caller indefinitely.
    request_timeout = 30

    os.makedirs(save_dir, exist_ok=True)
    base_url = "http://export.arxiv.org/api/query"
    params = {"search_query": query, "start": 0, "max_results": max_results}
    response = requests.get(base_url, params=params, timeout=request_timeout)
    if response.status_code != 200:
        raise Exception("Failed to fetch arXiv API")

    feed = feedparser.parse(response.text)
    for entry in feed.entries:
        pdf_url = None
        for link in entry.get("links", []):
            # Read attributes with .get() so a link lacking ``rel`` or
            # ``title`` is simply skipped instead of raising.
            if link.get("rel") == "alternate":
                continue
            if link.get("title") == "pdf":
                pdf_url = link.get("href")
        if not pdf_url:
            continue

        pdf_name = entry.id.split("/")[-1] + ".pdf"
        pdf_path = os.path.join(save_dir, pdf_name)
        if os.path.exists(pdf_path):
            continue

        r = requests.get(pdf_url, timeout=request_timeout)
        if r.status_code != 200:
            print(f"Skipped {pdf_name}: HTTP {r.status_code}")
            continue

        # Write to a temporary name first and rename afterwards, so an
        # interrupted write never leaves a truncated file that the
        # "already exists" check above would treat as complete.
        partial_path = pdf_path + ".part"
        with open(partial_path, "wb") as f:
            f.write(r.content)
        os.replace(partial_path, pdf_path)
        print(f"Downloaded {pdf_name}")

In [28]:
#############################################
# Step 2: Text Extraction (PDF → Text)
#############################################

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The per-page text from ``page.get_text()`` joined by ``"\\n"``.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

In [29]:
#############################################
# Step 3: Chunking Logic
#############################################

def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks of whitespace-separated tokens.

    Args:
        text: Input text; tokens are produced by ``str.split()``.
        max_tokens: Maximum number of tokens per chunk (must be positive).
        overlap: Number of tokens shared by consecutive chunks (must be
            smaller than ``max_tokens``).

    Returns:
        The chunks in order, each re-joined with single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive or ``overlap`` is not
            smaller than ``max_tokens`` (the window would never advance).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if overlap >= max_tokens:
        raise ValueError("overlap must be smaller than max_tokens")

    tokens = text.split()
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        # Once a window reaches the last token, any later window would only
        # repeat a suffix of this chunk, so stop here.
        if start + max_tokens >= len(tokens):
            break
    return chunks

In [30]:
#############################################
# Step 4: Embedding Generation
#############################################

# Sentence-embedding model, created once at module level and shared by every
# function in this file that calls ``model.encode``.
model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_chunks(chunks: List[str]):
    """Encode the given text chunks with the shared ``model``.

    Args:
        chunks: Text chunks to embed.

    Returns:
        Whatever ``model.encode(chunks)`` returns for the list of chunks.
    """
    embeddings = model.encode(chunks)
    return embeddings

In [31]:
#############################################
# Step 5: FAISS Indexing
#############################################

def build_faiss_index(embeddings: np.ndarray):
    """
    Build a flat L2 FAISS index containing the given embedding vectors.

    Args:
        embeddings: Array indexed as ``(n_vectors, dim)``; the second axis
            is used as the index dimensionality.

    Returns:
        A ``faiss.IndexFlatL2`` with all vectors added.
    """
    # FAISS's Python bindings accept only float32, C-contiguous arrays;
    # normalise here so float64 or strided input does not fail in ``add``.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index

In [32]:
#############################################
# Step 6: Retrieval
#############################################

def retrieve(query: str, index, chunks: List[str], k=3):
    """
    Return the chunks closest to ``query`` according to ``index``.

    Args:
        query: Free-text query, embedded with the module-level ``model``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``, e.g. a FAISS index.
        chunks: Chunk texts, positionally aligned with the vectors in
            ``index``.
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` chunk strings in the order reported by ``index.search``.
        Returns an empty list when ``k`` is not positive or ``chunks`` is
        empty.
    """
    if k <= 0 or not chunks:
        return []

    query_vec = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_vec, k)
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # used as a list index, -1 would silently return the last chunk as a
    # bogus hit, so keep only positions that really exist in ``chunks``.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

In [33]:
#############################################
# Step 7: Retrieval Report
#############################################

def generate_retrieval_report(queries, index, chunks, report_path="retrieval_report.md", k=3):
    """
    Write a Markdown report with the retrieved chunks for each query.

    Args:
        queries: Iterable of query strings.
        index: Index object passed through to ``retrieve``.
        chunks: Chunk texts passed through to ``retrieve``.
        report_path: Path of the Markdown file to (over)write.
        k: Number of results requested per query.

    The file is written as UTF-8 and a confirmation line is printed once it
    has been saved.
    """
    # The chunks come from extracted PDF text and may contain characters
    # outside the platform's default encoding, so pin UTF-8 explicitly.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("# Retrieval Report\n\n")
        for q in queries:
            results = retrieve(q, index, chunks, k)
            f.write(f"## Query: {q}\n")
            for i, r in enumerate(results, 1):
                f.write(f"**Result {i}:**\n\n{r}\n\n")
            f.write("\n---\n")
    print(f"Retrieval report saved to {report_path}")

In [34]:
#############################################
# Step 8: FastAPI Service (with lifespan)
#############################################

# Module-level state: filled by ``lifespan`` at startup and read by the
# /search endpoint.
chunks = []
faiss_index = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    FastAPI lifespan hook that builds the search index before serving.

    Downloads the papers, extracts and chunks the text of every PDF in
    ``./papers``, embeds the chunks, builds the FAISS index, and stores the
    result in the module-level ``chunks`` / ``faiss_index``. The index and
    chunks are also saved to ``faiss_index.bin`` and ``chunks.pkl``.

    This must be an ``async def`` generator: ``asynccontextmanager`` only
    accepts async generator functions, and a plain ``def`` generator fails
    when the application enters the lifespan context at startup.

    NOTE(review): the download / embedding work is synchronous and runs on
    the event loop until it finishes; nothing is served before the ``yield``.
    """
    global chunks, faiss_index
    # Download papers if not already
    download_arxiv_papers(query="cs.CL", max_results=50, save_dir="./papers")
    all_chunks = []
    for fname in os.listdir("./papers"):
        if fname.endswith(".pdf"):
            text = extract_text_from_pdf(os.path.join("./papers", fname))
            all_chunks.extend(chunk_text(text))
    chunks = all_chunks
    if chunks:
        embeddings = embed_chunks(chunks)
        faiss_index = build_faiss_index(np.array(embeddings))
        # save index and chunks
        faiss.write_index(faiss_index, "faiss_index.bin")
        with open("chunks.pkl", "wb") as f:
            pickle.dump(chunks, f)
        print("FAISS index ready with", len(chunks), "chunks. Data saved.")
    yield

app = FastAPI(lifespan=lifespan)

@app.get("/search")
async def search(q: str, k: int = 3):
    """
    Return the indexed chunks closest to the query text.

    Args:
        q: Query text, embedded with the module-level ``model``.
        k: Maximum number of results to return (must be positive).

    Returns:
        ``{"query": q, "results": [...]}`` on success, or an ``{"error": ...}``
        payload when the index has not been built or ``k`` is not positive.
    """
    if faiss_index is None or not chunks:
        return {"error": "Index not built. Please add PDFs first."}
    if k <= 0:
        return {"error": "k must be a positive integer."}

    query_vector = np.asarray(model.encode([q]), dtype=np.float32)
    # Never request more neighbours than there are chunks.
    _, indices = faiss_index.search(query_vector, min(k, len(chunks)))
    # FAISS marks missing neighbours with -1, which as a list index would
    # silently return the last chunk; keep only valid positions.
    results = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    return {"query": q, "results": results}


if __name__ == "__main__":
    # Demo run: fetch a few papers, index every PDF in the papers folder,
    # persist the index and chunks, then write a retrieval report for a
    # fixed set of sample queries.
    papers_dir = "./papers"
    download_arxiv_papers(query="cs.CL", max_results=10, save_dir=papers_dir)

    pdf_names = [name for name in os.listdir(papers_dir) if name.endswith(".pdf")]
    all_chunks = []
    for pdf_name in pdf_names:
        pdf_text = extract_text_from_pdf(os.path.join(papers_dir, pdf_name))
        all_chunks.extend(chunk_text(pdf_text))

    if not all_chunks:
        print("No PDFs found in ./papers.")
    else:
        embeddings = embed_chunks(all_chunks)
        index = build_faiss_index(np.array(embeddings))

        # Persist the index and the chunk texts next to each other.
        faiss.write_index(index, "faiss_index.bin")
        with open("chunks.pkl", "wb") as chunk_file:
            pickle.dump(all_chunks, chunk_file)

        queries = [
            "What are recent advances in machine translation?",
            "How are language models evaluated?",
            "What datasets are used in NLP benchmarks?",
            "Explain attention mechanisms in transformers.",
            "What is the role of pretraining in NLP models?",
        ]
        generate_retrieval_report(queries, index, all_chunks)

MuPDF error: format error: No default Layer config

Retrieval report saved to retrieval_report.md
