In [None]:
# Step 1 — Install dependencies

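# Uncomment to install the dependencies (%pip targets the running kernel's environment, unlike !pip):
# %pip install --upgrade pip
# %pip install sentence-transformers faiss-cpu PyMuPDF tqdm fastapi uvicorn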

In [12]:
# Step 2 — Imports and config

import os
from pathlib import Path
from typing import List
import numpy as np
import faiss
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import pickle
import json

# Config
MODEL_NAME = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer
OUTPUT_DIR = Path("rag_project/output")
# parents=True also creates "rag_project" itself; without it mkdir raises
# FileNotFoundError on a fresh checkout where the parent folder is missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CHUNKS_JSON = OUTPUT_DIR / "chunks.json"  # plain list of chunk texts
INDEX_FILE = OUTPUT_DIR / "faiss.index"   # serialized FAISS index
META_FILE = OUTPUT_DIR / "meta.pkl"       # pickled per-chunk metadata

DIM = 384  # embedding dimension


In [19]:
# Step 3 — PDF text extraction

def extract_text_from_pdf(pdf_path: str) -> List[str]:
    """Open the PDF at `pdf_path` and return the plain text of every page.

    The returned list has one string per page, in the order the document
    yields its pages. The document is closed when the function returns.
    """
    with fitz.open(pdf_path) as document:
        return [pdf_page.get_text("text") for pdf_page in document]

# Example usage: place your PDFs in the folder "rag_project/pdfs".
# Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
# keeps the page order (and therefore chunk ids and index rows) reproducible.
pdf_files = sorted(Path("rag_project/pdfs").glob("*.pdf"))
all_pages = []
for pdf in pdf_files:
    all_pages.extend(extract_text_from_pdf(pdf))

print(f"Extracted {len(all_pages)} pages from {len(pdf_files)} PDFs")


Extracted 122 pages from 5 PDFs


In [20]:
# Step 4 — Chunking

def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """Split `text` into overlapping windows of whitespace-separated words.

    "Tokens" here are the pieces produced by `str.split()`, not model tokens.

    Args:
        text: Text to split; whitespace runs are collapsed to single spaces.
        max_tokens: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            0 <= overlap < max_tokens.

    Returns:
        The chunks in reading order; an empty list when `text` has no words.

    Raises:
        ValueError: If `max_tokens` or `overlap` is out of range. An overlap
            that is not smaller than `max_tokens` would never advance the
            window.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}"
        )

    tokens = text.split()
    step = max_tokens - overlap
    chunks = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        chunks.append(" ".join(tokens[start:end]))
        # Once a window reaches the last word, stop: stepping again would emit
        # a tail chunk that is entirely contained in the one just added.
        if end >= len(tokens):
            break
        start += step
    return chunks

# Apply chunking
all_chunks = []
metadata = []

# NOTE: "page" is the 1-based position within `all_pages`, so it keeps counting
# across PDF boundaries instead of restarting for each file.
for page_num, page_text in enumerate(all_pages, start=1):
    for chunk in chunk_text(page_text):
        # chunk_id is the chunk's position in `all_chunks`; the index is built
        # in that same order, so it doubles as the FAISS row number.
        metadata.append({"page": page_num, "chunk_id": len(all_chunks), "text": chunk})
        all_chunks.append(chunk)

print(f"Created {len(all_chunks)} chunks")


Created 189 chunks


In [21]:
# Step 5 — Save chunks (optional)

# Persist only the chunk texts, as a JSON array in chunk order.
chunk_texts = [entry["text"] for entry in metadata]
with CHUNKS_JSON.open("w", encoding="utf-8") as chunks_file:
    json.dump(chunk_texts, chunks_file, ensure_ascii=False, indent=2)


In [22]:
# Step 6 — Load embedding model and encode chunks

model = SentenceTransformer(MODEL_NAME)

batch_size = 64

# Fail early with a readable message instead of an opaque array-shape error
# further down when no text could be extracted.
if not all_chunks:
    raise ValueError("No chunks to embed; check that the PDFs contain extractable text")

# NOTE(review): all-MiniLM-L6-v2 is documented to truncate inputs beyond 256
# word pieces, so chunks built with the default 512 words may be only partly
# embedded. Confirm the chunk size is intended.

# encode() batches internally and shows its own progress bar, so no manual
# slicing or stacking is needed.
embeddings = model.encode(
    all_chunks,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
# FAISS expects a C-contiguous float32 matrix.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")
print(f"Embeddings shape: {embeddings.shape}")


Embedding batches:   0%|          | 0/3 [00:00<?, ?it/s]

Embeddings shape: (189, 384)


In [23]:
# Step 7 — Build FAISS index

# Exact (brute-force) L2 index sized to the embedding width.
faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
faiss_index.add(embeddings)
print(f"FAISS index contains {faiss_index.ntotal} vectors")

# Persist the index together with the chunk metadata that describes its rows.
faiss.write_index(faiss_index, os.fspath(INDEX_FILE))
with META_FILE.open("wb") as meta_file:
    pickle.dump(metadata, meta_file)
print(f"Saved index to {INDEX_FILE} and metadata to {META_FILE}")


FAISS index contains 189 vectors
Saved index to rag_project\output\faiss.index and metadata to rag_project\output\meta.pkl


In [24]:
# Step 8 — Interactive retrieval

def retrieve(query: str, model: "SentenceTransformer", index, metadata, k: int = 3):
    """Return metadata for the chunks nearest to `query`.

    Args:
        query: Free-text query to embed and search for.
        model: Embedding model; its `encode` is called with a one-element list.
        index: Index object exposing `search(vectors, k)` that returns
            `(distances, indices)` arrays with one row per query.
        metadata: Sequence of dicts where position i describes index row i.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most `k` shallow copies of metadata entries, in the order
        the index returned them, each with an added "distance" key (float).
    """
    qvec = np.asarray(model.encode([query]), dtype="float32")
    distances, indices = index.search(qvec, k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads with -1 when fewer than k vectors are available; indexing
        # metadata[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        meta = metadata[int(idx)].copy()
        meta["distance"] = float(dist)
        results.append(meta)
    return results

# Example query: show the three nearest chunks with their page and distance.
query = "How do transformers work?"
results = retrieve(query, model, faiss_index, metadata, k=3)

separator = "-" * 80
for hit in results:
    print(f"[Page {hit['page']}] distance: {hit['distance']:.4f}")
    print(hit['text'])
    print(separator)


[Page 17] distance: 1.5896
thereby supporting subsequent reasoning processes. In each instance, the model is required to accurately rotate target objects within a fixed 2D plane while preserving the overall scene structure and structural consistency, followed by performing reasoning tasks like grounding and OCR. The evaluation focuses on both the accuracy of the rotation in terms of angle and direction, and the precision of the resulting reasoning tasks. 17
--------------------------------------------------------------------------------
[Page 19] distance: 1.6424
Definition of Good / Moderate / Bad. Model outputs are categorized into three quality levels: ✓Good: The rotation is accurate, complete, and strictly confined to the 2D plane, with no extraneous scene motion. The following reasoning tasks are completed correctly. Target objects remain precisely grounded after rotation. ~ Moderate: The rotation is largely correct but may be incomplete or slightly off-angle, though still confine

In [25]:
# Sample questions used to build the retrieval report.
example_queries = [
    "How do transformers work?",
    "Applications of natural language processing",
    "What is self-supervised learning?",
    "Data preprocessing techniques for machine learning",
    "How to fine-tune BERT for classification?",
]


In [26]:
# Map every example query to its three nearest chunks.
report = {
    question: retrieve(question, model, faiss_index, metadata, k=3)
    for question in example_queries
}

In [30]:
# Loop names are distinct from `query` / `results` so that re-running this cell
# does not overwrite the variables set by the single-query example.
for report_query, hits in report.items():
    print(f"Query: {report_query}")
    for rank, hit in enumerate(hits, 1):
        print(f"  Top-{rank} | distance: {hit['distance']:.4f} | chunk_id: {hit['chunk_id']}")
        text = hit['text']
        # Show the first 200 chars; add the ellipsis only when text was cut.
        preview = text[:200] + ("..." if len(text) > 200 else "")
        print(f"    {preview}")
    print("="*80)

Query: How do transformers work?
  Top-1 | distance: 1.5896 | chunk_id: 23
    thereby supporting subsequent reasoning processes. In each instance, the model is required to accurately rotate target objects within a fixed 2D plane while preserving the overall scene structure and ...
  Top-2 | distance: 1.6424 | chunk_id: 25
    Definition of Good / Moderate / Bad. Model outputs are categorized into three quality levels: ✓Good: The rotation is accurate, complete, and strictly confined to the 2D plane, with no extraneous scene...
  Top-3 | distance: 1.6969 | chunk_id: 164
    Produce a ~60-second, 2D flat-design explainer educating viewers on trimming, pruning, stump removal, and tree health. Use bold typography, a natural palette, icon-driven graphics, subtle character an...
Query: Applications of natural language processing
  Top-1 | distance: 1.3874 | chunk_id: 143
    Neural Information Processing Systems, 37:52040–52094, 2024. 14...
  Top-2 | distance: 1.4088 | chunk_id: 75
    Gisti

In [None]:
# Step 9 — Optional: FastAPI integration
# Run this separately in a terminal (not in the notebook); the command expects a main.py module that exposes a FastAPI `app`:
# uvicorn main:app --reload --host 0.0.0.0 --port 8000