# arXiv RAG Pipeline - Complete Exploration

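This notebook walks through the retrieval side of the RAG pipeline end to end: downloading arXiv papers, extracting and chunking their text, building a FAISS index, and testing similarity search against that index.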

In [None]:
# 1. Import dependencies
import sys
import os
import json
import numpy as np
# Make the project's helper scripts importable. The path is relative, so it is
# resolved against the current working directory at import time.
# NOTE(review): presumably the notebook is launched from a folder that sits
# next to scripts/ — confirm, otherwise the imports below will fail.
sys.path.append('../scripts')

# Project-local helpers; expected to be modules inside ../scripts (must come
# after the sys.path.append above).
from download_arxiv import download_pdfs, query_arxiv
from extract_and_chunk import process_pdfs
from build_index import build_faiss_index

print("Dependencies loaded successfully")

In [None]:
# 2. Query arXiv and download the matching PDFs.
# NOTE(review): the query and the download run every time this cell is
# executed; whether files that already exist are skipped depends on
# download_pdfs — confirm before re-running.
ARXIV_SEARCH_QUERY = "cat:cs.CL"
MAX_PAPERS = 50

print("Downloading arXiv papers...")
xml_text = query_arxiv(ARXIV_SEARCH_QUERY, max_results=MAX_PAPERS)
download_pdfs(xml_text)
print("Download completed")

In [None]:
# 3. Extract text from the downloaded PDFs and split it into chunks
RAW_PDF_DIR = "../data/raw_pdfs/"

print("Processing PDFs and chunking text...")
chunks = process_pdfs(RAW_PDF_DIR)
print(f"Created {len(chunks)} chunks")

# Preview the first 200 characters of (up to) the first three chunks
print("\nSample chunks:")
for position, chunk in enumerate(chunks[:3], start=1):
    preview = chunk['text'][:200]
    print(f"Chunk {position}: {preview}...")

In [None]:
# 4. Build FAISS index
print("Building FAISS index...")
# %run is an IPython magic, not plain Python: it executes
# ../scripts/build_index.py as a script from inside this notebook session, so
# this cell only works under IPython/Jupyter.
# NOTE(review): the `chunks` produced in step 3 are not passed to the script,
# and the imported build_faiss_index is not called here — presumably the
# script reads its own input and writes the files under ../embeddings/ that
# step 5 loads; confirm.
%run ../scripts/build_index.py
print("Index built successfully!")

In [None]:
# 5. Test retrieval
# Load everything search_query needs: the embedding model, the FAISS index
# and the id -> text mapping.
import pickle

from sentence_transformers import SentenceTransformer
import faiss

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
FAISS_INDEX_PATH = "../embeddings/faiss.index"
ID2TEXT_PATH = "../embeddings/id2text.pkl"

# Sentence encoder used to embed queries, and the index they are searched in
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
index = faiss.read_index(FAISS_INDEX_PATH)

# Mapping from index position to chunk text (looked up with the ids that
# index.search returns)
with open(ID2TEXT_PATH, "rb") as mapping_file:
    id2text = pickle.load(mapping_file)

def search_query(query: str, k: int = 3):
    """Search the index for the chunks most similar to ``query`` and print them.

    The query is embedded with the notebook-level ``model``, the notebook-level
    FAISS ``index`` is asked for its ``k`` nearest entries, and the text of each
    hit is looked up in ``id2text`` and printed (truncated to 500 characters
    followed by ``...`` when longer).

    Args:
        query: Free-text search string.
        k: Number of neighbours to request from the index.

    Returns:
        The ``(distances, indices)`` pair exactly as returned by
        ``index.search``, including any ``-1`` padding entries.
    """
    query_embedding = model.encode([query]).astype('float32')
    distances, indices = index.search(query_embedding, k)

    print(f"Query: '{query}'")
    print(f"Top {k} results:")

    # Row 0 holds the results for the single query that was encoded above.
    for rank, (dist, idx) in enumerate(zip(distances[0], indices[0]), start=1):
        # FAISS fills unused result slots with label -1 when it finds fewer
        # than k neighbours. Looking that up would print the last chunk as a
        # bogus hit (list) or raise KeyError (dict), so skip those slots.
        if idx < 0:
            continue

        text = id2text[idx]
        print(f"\n--- Result {rank} (distance: {dist:.4f}) ---")
        print(text[:500] + "..." if len(text) > 500 else text)

    return distances, indices

# Sample queries used to exercise the retrieval function
test_queries = [
    "transformer models",
    "natural language processing",
    "machine learning",
    "neural networks",
    "attention mechanism",
]

# Visual divider printed after each query's results
separator = "\n" + "=" * 50 + "\n"

for query_text in test_queries:
    search_query(query_text)
    print(separator)