# Custom RAG System (Gemini Flash)

## Phase 1: Environment & Project Setup

This phase ensures:
- Clean project structure
- Isolated Python environment
- Secure secrets management
- Reproducible execution

In [None]:
# Report which interpreter is executing this notebook.
import sys

for label, detail in (
    ("Python executable:", sys.executable),
    ("Python version:", sys.version),
):
    print(label, detail)

In [None]:
# Report the operating system name (the `system` field of platform.uname()).
import platform

print("OS:", platform.uname().system)

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file, then report whether each expected key is
# set. Only a True/False flag is printed, never the secret value itself.
load_dotenv()

print(
    *(
        f"{key_name} loaded: {bool(os.getenv(key_name))}"
        for key_name in ("GOOGLE_API_KEY", "PINECONE_API_KEY", "PINECONE_ENV")
    ),
    sep="\n",
)

In [None]:
# Progress marker only: prints a fixed message and performs no checks of its own.
print("Phase 1 setup complete and verified.")

## Phase 2: Document Ingestion & Text Extraction

This phase:
- Loads PDFs from disk
- Extracts page-level text
- Preserves metadata (source + page)
- Performs validation and sanity checks

In [None]:
from pathlib import Path
from langchain.document_loaders import PyPDFLoader

In [None]:
# Locate the input PDFs and fail fast, with a real exception, when there is
# nothing to ingest. Explicit raises are used instead of `assert` because
# assertions are removed when Python runs with -O.
PDF_DIR = Path("data/pdfs")
if not PDF_DIR.is_dir():
    # is_dir() also rejects a regular file that happens to be named data/pdfs.
    raise FileNotFoundError("data/pdfs folder does not exist")

# Sorted so the files are processed in a stable, name-based order.
pdf_files = sorted(PDF_DIR.glob("*.pdf"))

print(f"Found {len(pdf_files)} PDF files:")
for f in pdf_files:
    print("-", f.name)

if not pdf_files:
    raise FileNotFoundError("No PDFs found. Add files to data/pdfs/")

In [None]:
# Load every PDF and pool the objects returned by each loader into one flat list.
documents = []

for pdf_path in pdf_files:
    print(f"\nLoading: {pdf_path.name}")
    pages = PyPDFLoader(str(pdf_path)).load()
    print(f"  Pages extracted: {len(pages)}")
    documents += pages

print(f"\nTotal pages loaded from all PDFs: {len(documents)}")

In [None]:
# Peek at the first loaded page: its metadata, then up to 1000 characters of text.
sample = documents[0]
print("Metadata:", sample.metadata)
print(
    "\n--- Content Preview (first 1000 chars) ---\n",
    sample.page_content[:1000],
    sep="\n",
)

In [None]:
# Collect the positions of pages whose stripped text is shorter than 50 characters.
empty_pages = [
    page_no
    for page_no, page in enumerate(documents)
    if len(page.page_content.strip()) < 50
]

print("Total pages:", len(documents))
print("Empty or near-empty pages:", len(empty_pages))

# At most the first ten positions are listed.
if empty_pages:
    print("Indices of empty pages:", empty_pages[:10])

In [None]:
# Character totals over all loaded pages; the printed average is truncated to an int.
total_chars = sum(map(len, (page.page_content for page in documents)))
avg_chars = total_chars / len(documents)

print("Total characters:", total_chars)
print("Average characters per page:", int(avg_chars))

In [None]:
# Progress marker only: prints a fixed message and performs no checks of its own.
print("Phase 2 complete: Documents ingested and validated.")

## Phase 3: Text Normalization & Chunking

This phase:
- Normalizes raw extracted text
- Removes formatting noise
- Prepares semantically meaningful chunks
- Preserves source metadata for traceability

In [None]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def normalize_text(text: str, *, preserve_newlines: bool = False) -> str:
    """Collapse whitespace noise in extracted page text.

    Args:
        text: Raw text to clean.
        preserve_newlines: When False (the default, and the behaviour this
            function has always had), every run of whitespace, line breaks
            included, is replaced by a single space, so the result is one
            flat line. When True, a run of whitespace that contains a line
            break becomes a single newline and every other run becomes a
            single space, which keeps line boundaries available to
            newline-based splitters.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    if preserve_newlines:
        # Horizontal whitespace (anything \s matches except "\n") -> one space.
        text = re.sub(r"[^\S\n]+", " ", text)
        # A line break plus any blank lines or padding around it -> one "\n".
        text = re.sub(r" ?\n[ \n]*", "\n", text)
    else:
        # One pass is enough: \s already matches "\n", so collapsing newlines
        # in a separate step beforehand has no effect on the result.
        text = re.sub(r"\s+", " ", text)

    return text.strip()

In [None]:
# Overwrite each page's text in place with its normalized form.
for page in documents:
    cleaned = normalize_text(page.page_content)
    page.page_content = cleaned

print("Text normalization complete.")

In [None]:
# Show up to the first 1000 characters of the first page after normalization.
print(
    "=== NORMALIZED TEXT SAMPLE ===\n",
    documents[0].page_content[:1000],
    sep="\n",
)

In [None]:
# Splitter configured with chunk_size=1000 and chunk_overlap=150.
# NOTE(review): normalize_text(), as it is called on every page in this phase,
# replaces each whitespace run (newlines included) with a single space, so the
# "\n\n" and "\n" separators cannot match normalized pages -- confirm whether
# that is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=150,
    chunk_size=1000,
)

In [None]:
# Split the normalized pages into chunks and report both counts.
chunks = text_splitter.split_documents(documents)

for label, collection in (
    ("Total pages:", documents),
    ("Total chunks created:", chunks),
):
    print(label, len(collection))

In [None]:
# Peek at the first chunk: its metadata, then up to 1000 characters of text.
sample_chunk = chunks[0]

print("Chunk metadata:", sample_chunk.metadata)
print(
    "\n--- Chunk preview ---\n",
    sample_chunk.page_content[:1000],
    sep="\n",
)

In [None]:
# Character length of every chunk, then min / max / floor-average over them.
sizes = list(map(len, (piece.page_content for piece in chunks)))

print("Min chunk size:", min(sizes))
print("Max chunk size:", max(sizes))
print("Avg chunk size:", sum(sizes) // len(sizes))

In [None]:
# Progress marker only: prints a fixed message and performs no checks of its own.
print("Phase 3 complete: Text normalized and chunked successfully.")

## Phase 4: Embedding Generation (Sentence Transformers)

This phase:
- Initializes a Sentence Transformers embedding model (`all-MiniLM-L6-v2`)
- Converts text chunks into vector embeddings
- Verifies embedding shape and consistency

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
# Model identifier handed to SentenceTransformer below. The consistency check
# later in this phase expects this model to produce 384-wide vectors.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

print("Loading embedding model:", EMBEDDING_MODEL_NAME)
# `embedder` is the object whose encode() method produces the embeddings in this phase.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

print("Model loaded.")

In [None]:
# Plain strings to embed, in the same order as `chunks`.
texts = [piece.page_content for piece in chunks]
print("Total chunks to embed:", len(texts))

In [None]:
# Encode all chunk texts in batches of 32, asking for a NumPy result with
# normalized vectors and a progress bar while encoding runs.
embeddings = embedder.encode(
    texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=32,
)

print("Embeddings generated.")

In [None]:
# shape[0] is compared against len(chunks) in the checks further down;
# shape[1] is the per-vector length that is later used as the index dimension.
print("Embeddings array shape:", embeddings.shape)
print("Single embedding dimension:", embeddings.shape[1])

In [None]:
# Eyeball the first vector and its Euclidean norm.
print("Sample embedding (first 10 values):")
print(embeddings[0][:10])
# Expected to be close to 1.0 because encode() was called with normalize_embeddings=True.
print("Vector norm:", np.linalg.norm(embeddings[0]))

In [None]:
# Sanity checks: one embedding row per chunk, and the expected vector width.
assert embeddings.shape[0] == len(chunks), "Mismatch between chunks and embeddings!"
# 384 is hard-coded, so this check fails if EMBEDDING_MODEL_NAME is changed to
# a model that produces vectors of a different width.
assert embeddings.shape[1] == 384, "Unexpected embedding dimension!"

print("Embedding consistency checks passed.")

In [None]:
# Progress marker only: prints a fixed message and performs no checks of its own.
print("Phase 4 complete: Embeddings generated and validated.")

## Phase 5: Vector Database (Pinecone Serverless)

This phase:
- Initializes Pinecone Serverless client
- Creates a new serverless index (if needed)
- Batches and upserts vectors with metadata

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Same call as in Phase 1; presumably repeated so that this phase can be run
# on its own -- confirm.
load_dotenv()

In [None]:
# Read the Pinecone key from the environment and build the client.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Explicit raise instead of `assert`: assertions are removed under -O, and the
# truthiness test also rejects a key that is present but empty.
if not PINECONE_API_KEY:
    raise RuntimeError("Missing PINECONE_API_KEY")

pc = Pinecone(api_key=PINECONE_API_KEY)
print("Pinecone client initialized.")

In [None]:
# Create the serverless index on first use; otherwise reuse the one that exists.
INDEX_NAME = "notebooklm-rag-antigravity"

existing_indexes = [entry["name"] for entry in pc.list_indexes()]

if INDEX_NAME in existing_indexes:
    # NOTE(review): an existing index is reused as-is; its dimension is not
    # compared with embeddings.shape[1] here.
    print("Using existing index:", INDEX_NAME)
else:
    print("Creating new serverless index:", INDEX_NAME)
    pc.create_index(
        name=INDEX_NAME,
        # Index dimension is taken from the embeddings produced in Phase 4.
        dimension=embeddings.shape[1],
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Handle to the index named above; the upsert and stats cells below go through it.
index = pc.Index(INDEX_NAME)

In [None]:
# One upsert record per chunk: a positional id ("chunk-0", "chunk-1", ...),
# the embedding as a plain list, and the chunk's source, page and text as metadata.
vectors = [
    {
        "id": f"chunk-{position}",
        "values": embedding.tolist(),
        "metadata": {
            "source": piece.metadata.get("source", ""),
            "page": piece.metadata.get("page", ""),
            "text": piece.page_content[:1000],  # metadata size safety
        },
    }
    for position, (piece, embedding) in enumerate(zip(chunks, embeddings))
]

print("Prepared vectors:", len(vectors))

In [None]:
# Upload the records in slices of BATCH_SIZE, reporting progress after each slice.
BATCH_SIZE = 100

for start in range(0, len(vectors), BATCH_SIZE):
    stop = start + BATCH_SIZE
    index.upsert(vectors=vectors[start:stop])
    # The final slice may be short, so the running total is capped at len(vectors).
    print(f"Uploaded {min(stop, len(vectors))} / {len(vectors)} vectors")

In [None]:
# Ask the index for its current statistics and print them as returned.
stats = index.describe_index_stats()
print("Index stats:", stats)

In [None]:
# Progress marker only: prints a fixed message and performs no checks of its own.
print("Phase 5 complete: Vectors successfully stored in Pinecone Serverless.")