In [None]:
%pip install --upgrade pip setuptools wheel
%pip install --upgrade --prefer-binary sentence-transformers faiss-cpu pymupdf fastapi uvicorn pydantic numpy tiktoken

# Sanity check: make sure the kernel sees faiss
import sys, platform, pkgutil
import importlib.util

print("Python:", sys.version)
print("Kernel executable:", sys.executable)
# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
# since Python 3.12 and scheduled for removal in 3.14.
print("faiss importable? ->", importlib.util.find_spec("faiss") is not None)


In [None]:
!pip install sentence-transformers faiss-cpu pymupdf fastapi uvicorn pydantic numpy tiktoken hf_xet

import os, json, re
from typing import List, Dict, Tuple
import numpy as np
import fitz  # PyMuPDF
import faiss

# Corpus PDFs are read from DATA_DIR; the index and chunk files are written to INDEX_DIR.
DATA_DIR, INDEX_DIR = "data/arxiv", "artifacts"

# Create both folders up front (a no-op when they already exist).
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)


## Load PDFs and extract text

In [6]:
def extract_text_from_pdf(path: str) -> str:
    """Return the plain text of every page of the PDF at ``path``.

    Pages are read in document order with PyMuPDF's "text" extraction mode
    and joined with a single newline between pages.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page texts.
    """
    # The context manager closes the document even if a page fails to extract,
    # so file handles are not leaked while looping over a whole corpus.
    with fitz.open(path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def load_corpus(pdf_dir: str = DATA_DIR) -> Dict[str, str]:
    """Extract the text of every PDF located directly inside ``pdf_dir``.

    Args:
        pdf_dir: Directory to scan (not recursive). Entries are selected by a
            case-insensitive ".pdf" suffix.

    Returns:
        Dict mapping each PDF file name to its extracted text, inserted in
        sorted file-name order.
    """
    corpus = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order (and the chunk ids derived from it) the same from run to run.
    for name in sorted(os.listdir(pdf_dir)):
        if name.lower().endswith(".pdf"):
            corpus[name] = extract_text_from_pdf(os.path.join(pdf_dir, name))
    return corpus

corpus = load_corpus()
# Report the corpus size and preview at most the first five file names.
_names = list(corpus.keys())
print(f"Loaded {len(_names)} PDFs: {_names[:5]}{' ...' if len(_names)>5 else ''}")


Loaded 50 PDFs: ['2508.10507v1 - Multi-Sample Anti-Aliasing and Constrained Optimization for 3D Gaussian Splatting.pdf', '2508.10528v1 - Med-GLIP Advancing Medical Language-Image Pre-training with Large-scale Grounded Dataset.pdf', '2508.10530v1 - Diversity First Quality Later A Two-Stage Assumption for Language Model Alignment.pdf', '2508.10539v1 - Improving Value-based Process Verifier via Low-Cost Variance Reduction.pdf', '2508.10548v1 - Stabilizing Long-term Multi-turn Reinforcement Learning with Gated Rewards.pdf'] ...


## Chunking (token-based, ~512 tokens with overlap)


In [7]:
# Token-based chunking (~512 tokens) with fallback to character-based if tiktoken isn't available
from typing import List

# Start without a tokenizer; it is only set when tiktoken imports and the
# encoding loads successfully.
_enc = None
try:
    import tiktoken
    _enc = tiktoken.get_encoding("cl100k_base")
except Exception:
    # Deliberate best-effort: with _enc left as None, chunk_by_tokens uses its
    # character-based fallback instead of failing here.
    pass

def _window_bounds(length: int, size: int, overlap: int):
    """Yield (start, end) slice bounds of sliding windows of ``size`` items that share ``overlap`` items."""
    start = 0
    while start < length:
        end = min(start + size, length)
        yield start, end
        if end == length:
            return
        start = end - overlap


def chunk_by_tokens(text: str, max_tokens: int = 512, overlap_tokens: int = 64) -> List[str]:
    """
    Token-based sliding window chunker targeting ~512 tokens per chunk with ~64-token overlap.
    Falls back to character-based (~4 chars/token heuristic) if tiktoken isn't available.

    Args:
        text: Text to split. In the character fallback, runs of whitespace are
            collapsed to single spaces before slicing.
        max_tokens: Maximum chunk size in tokens (x4 characters in the fallback).
        overlap_tokens: Tokens shared between consecutive chunks (x4 characters
            in the fallback). Must be smaller than ``max_tokens``.

    Returns:
        The chunks in document order; an empty list for empty input.

    Raises:
        ValueError: If ``max_tokens`` is below 1 or ``overlap_tokens`` is not
            smaller than ``max_tokens``.
    """
    # Without these checks the window start never moves forward and the loop
    # would spin forever on any text longer than one chunk.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if overlap_tokens >= max_tokens:
        raise ValueError(
            f"overlap_tokens ({overlap_tokens}) must be smaller than max_tokens ({max_tokens})"
        )

    if _enc is None:
        # Character fallback: ~4 characters per token.
        clean = " ".join(text.split())
        bounds = _window_bounds(len(clean), max_tokens * 4, overlap_tokens * 4)
        return [clean[start:end] for start, end in bounds]

    tokens = _enc.encode(text)
    bounds = _window_bounds(len(tokens), max_tokens, overlap_tokens)
    return [_enc.decode(tokens[start:end]) for start, end in bounds]

# Build two parallel lists with token-based chunking: docs[i] is the chunk
# text and metadatas[i] records which PDF it came from.
docs: List[str] = []
metadatas: List[Dict] = []

for fname, text in corpus.items():
    pieces = chunk_by_tokens(text, max_tokens=512, overlap_tokens=64)
    docs.extend(pieces)
    # One separate dict per chunk, as before (no shared object between entries).
    metadatas.extend({"source": fname} for _ in pieces)

print(f"Total chunks: {len(docs)} from {len(corpus)} PDFs")


Total chunks: 2055 from 50 PDFs


## Embeddings


In [8]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by embed_texts for both corpus chunks and queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def embed_texts(texts: List[str]) -> np.ndarray:
    """Encode ``texts`` with the module-level ``model`` and return float32 vectors.

    The embeddings are requested normalized (``normalize_embeddings=True``),
    which pairs well with FAISS inner-product search. A progress bar is shown
    while encoding.
    """
    vectors = model.encode(texts, show_progress_bar=True, normalize_embeddings=True)
    return np.asarray(vectors, dtype=np.float32)

# Embed every chunk once; the bare expression below displays the array shape
# (number of chunks, embedding dimension) as the cell output.
embs = embed_texts(docs)
embs.shape


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 65/65 [00:46<00:00,  1.40it/s]


(2055, 384)

## Build FAISS index & persist artifacts

In [9]:
# Nothing to index: stop with an actionable message instead of failing below.
if not docs:
    raise RuntimeError("No chunks found. Add PDFs to data/arxiv/ and rerun from the top.")

# Flat inner-product index; on normalized vectors inner product ≈ cosine similarity.
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embs)

# Both artifacts are written side by side in INDEX_DIR so they can be reloaded later.
faiss_path = os.path.join(INDEX_DIR, "arxiv.index")
chunks_path = os.path.join(INDEX_DIR, "chunks.jsonl")

# Persist index
faiss.write_index(index, faiss_path)

# Persist chunks as JSONL: one {"text", "source"} object per line, in index order.
with open(chunks_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps({"text": t, "source": m["source"]}, ensure_ascii=False) + "\n"
        for t, m in zip(docs, metadatas)
    )

print(f"Saved index to {faiss_path} and chunks to {chunks_path}")


Saved index to artifacts\arxiv.index and chunks to artifacts\chunks.jsonl


In [10]:
def load_index_and_chunks(index_dir: str = INDEX_DIR) -> Tuple[faiss.Index, List[str], List[Dict]]:
    """Reload the persisted FAISS index and chunk records from ``index_dir``.

    Reads "arxiv.index" and "chunks.jsonl" (one JSON object per line with
    "text" and "source" keys).

    Returns:
        (index, chunk texts, metadata dicts of the form {"source": ...}); the
        two lists follow the line order of the JSONL file.
    """
    idx = faiss.read_index(os.path.join(index_dir, "arxiv.index"))
    with open(os.path.join(index_dir, "chunks.jsonl"), "r", encoding="utf-8") as fh:
        records = [json.loads(row) for row in fh]
    texts = [rec["text"] for rec in records]
    metas = [{"source": rec["source"]} for rec in records]
    return idx, texts, metas

# Example usage (uncomment to test reloading):
# index, docs, metadatas = load_index_and_chunks()
# print(len(docs), "chunks reloaded")


## Search helper

In [11]:
def search(query: str, k: int = 5):
    """Return up to ``k`` chunks most similar to ``query``.

    Args:
        query: Free-text query; embedded with ``embed_texts``, i.e. with the
            same normalization as the corpus embeddings.
        k: Maximum number of hits. Values below 1 yield an empty list and
            values above the index size are capped to it.

    Returns:
        List of dicts with keys "score" (float), "text", "source" and
        "chunk_id" (int position in ``docs``), in the order returned by the
        index.
    """
    # Never ask for more neighbours than the index holds, and treat a
    # non-positive k as "no results" rather than passing it to FAISS.
    k = min(k, index.ntotal)
    if k < 1:
        return []

    q = embed_texts([query])  # uses the same normalization as corpus embeddings
    scores, ids = index.search(q, k)
    results = []
    for score, idx in zip(scores[0], ids[0]):
        idx = int(idx)
        # FAISS pads missing neighbours with id -1; indexing docs with -1
        # would silently return the last chunk, so skip those slots.
        if idx < 0:
            continue
        results.append({
            "score": float(score),
            "text": docs[idx],
            "source": metadatas[idx]["source"],
            "chunk_id": idx
        })
    return results

# Smoke test: run one query end to end; the returned hits are displayed as the cell output
search("What problem does the paper address?", k=3)


Batches: 100%|██████████| 1/1 [00:00<00:00, 116.97it/s]


[{'score': 0.30652010440826416,
  'text': " located? Answer the \nquestion using a single phrase.\n1. Unidentified residential street. [Failed]\n2. residential street in an unnamed district.\n[Failed]\n1. I'm unable to determine the specific street \nand district based on the image alone. [Failed]\n2. I cannot determine the specific … [Failed]\nFreyburg Street, South Side Flats\n1. Hill District of Pittsburgh, Pennsylvania.\n2. The image is located in a residential \ndistrict, but the specific street … [Failed]\n1. Santa fe, downtown [Hallucination]\n2. Rocky hill [Invalid]\n3. 1st street [Hallucination]\nGPT-4o\nQwen2-VL\nAddressVLM\nSonnet\nLLaVA\nFig. 10 Qualitative comparison of address question-answering capabilities with general LVLMs.\n17\n\n5 Conclusion\nIn this work, we propose AddressVLM for city-wide address localization, which can\nperform flexible address question-answering for street-view images. The core idea is\nto leverage cross-view alignment tuning between satellite-

## Retrieval Report
Generate a markdown report with ≥5 queries and their top-3 retrieved passages.


In [12]:
import pathlib, datetime

def run_retrieval_report(queries, k=3, out_path=os.path.join(INDEX_DIR, "retrieval_report.md")):
    """
    Write a markdown retrieval report (intended for >=5 queries) to ``out_path``.

    For each query the top-``k`` hits from ``search`` are listed with score,
    source file and an excerpt of at most 800 characters (newlines flattened
    to spaces). The saved path is printed when done.
    """
    generated_at = datetime.datetime.now().isoformat(timespec="seconds")
    report = [f"# Retrieval Report\n\nGenerated: {generated_at}\n\n"]

    for q_num, query in enumerate(queries, start=1):
        report.append(f"## Q{q_num}. {query}\n")
        for rank, hit in enumerate(search(query, k=k), start=1):
            snippet = hit["text"][:800].replace("\n", " ")
            report.append(f"**Top {rank}** (score {hit['score']:.3f}) — *{hit['source']}*\n\n> {snippet} …\n")
        report.append("\n---\n")

    markdown = "\n".join(report)
    pathlib.Path(out_path).write_text(markdown, encoding="utf-8")
    print(f"Saved report to {out_path}")

# Example starter queries (customize for your paper set).
# Five entries, matching the ">=5 queries" target of run_retrieval_report.
example_queries = [
    "What problem do these papers address?",
    "Summarize the main methodology used.",
    "What datasets are commonly used?",
    "What are the key contributions mentioned?",
    "What future work or limitations are discussed?"
]

# Uncomment to run:
# run_retrieval_report(example_queries, k=3)


## LLM call + sample prompts


In [None]:
import os, textwrap
from typing import List, Dict
from openai import OpenAI


def build_prompt(question: str, passages):
    """Assemble the grounded-answer prompt sent to the LLM.

    The prompt holds fixed instructions, the question, and one numbered
    context block per passage ("[n] (source)" followed by the passage text,
    stripped and cut to 1800 characters). Numbering starts at 1 so the model
    can cite passages as [1], [2], ...

    Args:
        question: The user question.
        passages: Iterable of dicts with "text" and "source" keys.

    Returns:
        The prompt as a single newline-joined string.
    """
    header = [
        "You are a careful assistant. Answer ONLY using the context below.",
        "Cite sources like [1], [2]. If at least two sources are provided, use at least two distinct citations.",
        "If the answer is not covered, say you don't have enough information.",
        "",
        f"Question: {question}",
        "",
        "Context:",
    ]

    blocks = []
    for number, passage in enumerate(passages, start=1):
        body = passage["text"].strip()[:1800]
        blocks.append(f"[{number}] ({passage['source']})\n{body}\n")

    return "\n".join([*header, *blocks])

def ask_llm(question: str, k: int = 3, model: str = "gpt-4o-mini") -> Dict:
    """Answer ``question`` from retrieved passages and print the result.

    Retrieves the top-``k`` chunks with ``search``, builds a prompt with
    ``build_prompt``, sends it as a single user message to the OpenAI chat
    completions API, then prints the question, the answer and the numbered
    sources with their scores.

    Args:
        question: The user question.
        k: Number of passages to retrieve and pass as context.
        model: OpenAI chat model name.

    Returns:
        Dict with "question", "answer" and "hits". When nothing is retrieved,
        "answer" is "(no passages retrieved)", "hits" is empty and no API
        call is made.
    """
    hits = search(question, k=k)

    if not hits:
        return {"question": question, "answer": "(no passages retrieved)", "hits": []}

    # Prompt
    prompt = build_prompt(question, hits)

    # Call OpenAI (credentials are resolved by the client, not passed here)
    client = OpenAI()
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    # message.content can be None (the API types it as nullable); fall back to
    # an empty answer instead of raising AttributeError on .strip().
    answer = (resp.choices[0].message.content or "").strip()

    # Pretty print
    print("\n" + "="*80)
    print("Q:", question)
    print("-"*80)
    print(answer)
    print("-"*80)
    print("Sources:")
    for i, h in enumerate(hits, 1):
        print(f"[{i}] {h['source']}  (score={h['score']:.3f})")
    print("="*80 + "\n")

    return {"question": question, "answer": answer, "hits": hits}

# ---- Ask 5 relevant questions about your corpus ----
five_questions = [
    "What problem do these papers most commonly address?",
    "Summarize the typical methodology or architecture described across the papers.",
    "Which datasets or benchmarks are most frequently mentioned?",
    "What key limitations or open challenges are discussed?",
    "What future directions or proposed improvements recur across the papers?"
]

# One ask_llm call per question (top-3 passages each); every call prints its
# answer and sources, and the returned dicts are collected in `results`.
results = [ask_llm(q, k=3) for q in five_questions]



Batches: 100%|██████████| 1/1 [00:00<00:00, 148.88it/s]



Q: What problem do these papers most commonly address?
--------------------------------------------------------------------------------
The papers most commonly address the challenges related to visual question answering in the context of gastrointestinal imaging and the evaluation of misinformation on online platforms. Specifically, the first paper focuses on creating a dataset for GI endoscopic images paired with complex question-answer pairs to enhance reasoning capabilities in medical imaging [1]. The second paper discusses the implications of online searches for misinformation and how they can affect the perceived veracity of such content [2].
--------------------------------------------------------------------------------
Sources:
[1] 2508.10869v1 - Medico 2025 Visual Question Answering for Gastrointestinal Imaging.pdf  (score=0.288)
[2] 2508.10769v1 - Modeling Human Responses to Multimodal AI Content.pdf  (score=0.272)
[3] 2508.10666v1 - Deep Learning in Classical and Quantum P

Batches: 100%|██████████| 1/1 [00:00<00:00, 131.12it/s]



Q: Summarize the typical methodology or architecture described across the papers.
--------------------------------------------------------------------------------
The typical methodology described in the papers involves a structured approach to design evaluation and planning. 

In the context of the Agentic Design Review System, the methodology includes a Structured Design Description (SDD) module that generates design descriptions to guide the review process. This system employs a meta agent to coordinate both static agents, which focus on fixed design attributes (like typography), and dynamic agents, which assess attributes contextualized to specific designs. The meta agent consolidates insights from these agents to provide a final rating and actionable feedback, fostering a nuanced critique process that balances subjective creativity with objective design principles [1].

Additionally, in the realm of planning, the Planning Domain Definition Language (PDDL) is utilized to express p

Batches: 100%|██████████| 1/1 [00:00<00:00, 164.20it/s]



Q: Which datasets or benchmarks are most frequently mentioned?
--------------------------------------------------------------------------------
The most frequently mentioned datasets or benchmarks in the provided context are:

1. **CUFED5** - A testing dataset consisting of 126 image pairs, each with 5 reference images of varying similarity levels [1].
2. **WR-SR** - A dataset consisting of 80 images, each paired with one reference image sourced from Google Image [1].
3. **Landmark-4K** - A proposed dataset containing 185 high-quality landmark images across 49 categories, each with a corresponding high-quality reference image [1].
4. **T-Finance** - A large-scale dataset used for graph anomaly detection [3].
5. **Elliptic** - Another large-scale dataset mentioned in the context of graph anomaly detection [3].
6. **BlogCatalog** and **Flickr** - Datasets used to evaluate the performance of FreeGAD, particularly in terms of efficiency and memory usage [3].

These datasets are utilized f

Batches: 100%|██████████| 1/1 [00:00<00:00, 141.62it/s]



Q: What key limitations or open challenges are discussed?
--------------------------------------------------------------------------------
The context provided does not specify any key limitations or open challenges. Therefore, I don't have enough information to answer the question.
--------------------------------------------------------------------------------
Sources:
[1] 2508.10751v1 - Passk Training for Adaptively Balancing Exploration and Exploitation of Large Reasoning Models.pdf  (score=0.465)
[2] 2508.10701v1 - REFN A Reinforcement-Learning-From-Network Framework against 1-dayn-day Exploitations.pdf  (score=0.294)
[3] 2508.10806v1 - Who Benefits from AI Explanations Towards Accessible and Interpretable Systems.pdf  (score=0.295)



Batches: 100%|██████████| 1/1 [00:00<00:00, 111.09it/s]



Q: What future directions or proposed improvements recur across the papers?
--------------------------------------------------------------------------------
The future directions and proposed improvements across the papers include:

1. **Adaptive Unmasking/Remasking Strategies**: The use of adaptive unmasking and remasking strategies in diffusion language models is highlighted as a means to enhance efficiency and quality. This includes techniques like the remasking sampler proposed by ReMDM, which allows for further refinement of already decoded tokens, thus improving the compute-quality trade-off [1].

2. **Guidance Techniques**: The implementation of guidance techniques, such as classifier-free guidance, is emphasized as a pivotal method for steering generative models towards desired outputs. This approach enhances the quality of generated samples by balancing fidelity to conditions against sample diversity [1].

3. **Regulatory and Ethical Frameworks**: There is a pressing need for

## FastAPI endpoints (default top-3)
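Run with: `uvicorn app:app --reload --port 8000` — `app:app` means *module:variable*, so this assumes the code of this notebook (including the index, `docs`, and `search`) has first been exported to a module named `app.py`.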


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel

# FastAPI application object; the /query route below is registered on it.
app = FastAPI()

# Request body for POST /query.
class Query(BaseModel):
    question: str  # free-text question passed to search()
    k: int = 3  # number of passages to return (defaults to top-3)

@app.post("/query")
def do_query(q: Query):
    # Retrieval only: returns the search hits for the posted question.
    # You can add an LLM call here and return {"answer": "...", "sources": hits}
    return {"results": search(q.question, k=q.k)}
