# FAISS Chunk Inspector Notebook
This notebook allows you to load your FAISS vector store and inspect the individual text chunks,
their internal IDs, and associated metadata (like page numbers and source files).

In [1]:
#Code Block 1

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from dotenv import load_dotenv # Import load_dotenv
from sklearn.manifold import TSNE

# Load variables from a .env file into the process environment.
# NOTE(review): this runs before the project import further down, presumably so
# that the src modules can read their configuration — confirm.
load_dotenv()

# Make the parent of the current working directory importable so the
# `src.*` package below can be resolved.
# NOTE(review): assumes the kernel's working directory is one level below the
# project root (e.g. a notebooks/ folder) — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Use the documented `sys.path` rather than `os.sys.path`, which only works
# because the os module happens to import sys internally.
# The membership check keeps the cell idempotent when it is re-run.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.vector_stores.faiss_store import get_vector_store

## Setup

First, ensure you have all the necessary dependencies installed. If you haven't already, run the
following command in your project's root directory:

```bash
pip install -r requirements.txt
```

Also, make sure you have built your FAISS index by running `python ingest.py` at least once.

In [None]:
#Code Block 2

# Request the "school" version of the store from the project helper.
vector_store = get_vector_store(version="school")

# A None result is treated as "no usable index"; anything else counts as loaded.
print(
    "FAISS vector store loaded successfully."
    if vector_store is not None
    else "FAISS index not found or empty. Please run 'python ingest.py' first and ensure documents are present."
)

## Inspect Text Chunks and Metadata

The next cell iterates through the chunks stored in your FAISS vector store's docstore and, for the first 20 chunks, displays the internal ID, source file, page number, and the first 500 characters of the content.

In [None]:
#Code Block 3

# Display limits: raise these to inspect more chunks or longer previews.
MAX_CHUNKS_TO_SHOW = 20
PREVIEW_CHARS = 500

# Explicit None check, consistent with how the loading cell detects a missing
# store (truthiness could misreport a store object that defines __len__/__bool__).
if vector_store is not None:
    print("Inspecting document chunks and metadata:")

    # Collect every document ID held by the docstore.
    # NOTE: `_dict` is a private attribute of the docstore; this may break if
    # the docstore implementation changes.
    all_doc_ids = list(vector_store.docstore._dict.keys())

    if not all_doc_ids:
        print("No documents found in the vector store's docstore.")
    else:
        # Only the first MAX_CHUNKS_TO_SHOW IDs are looked up, so the output
        # stays readable even for large indexes.
        for i, doc_id in enumerate(all_doc_ids[:MAX_CHUNKS_TO_SHOW], start=1):
            doc_obj = vector_store.docstore.search(doc_id)

            # Skip empty results. A string result is also skipped: assuming a
            # LangChain in-memory docstore, `search` returns an error string
            # (not a Document) for an unknown ID, which has no `.metadata`.
            if not doc_obj or isinstance(doc_obj, str):
                continue

            print(f"\n--- Chunk {i} (ID: {doc_id}) ---")
            print(f"Source: {doc_obj.metadata.get('source', 'N/A')}")
            print(f"Page: {doc_obj.metadata.get('page', 'N/A')}")
            print(f"Content (first {PREVIEW_CHARS} chars):")

            # Append the ellipsis only when the content was actually cut off.
            content = doc_obj.page_content
            preview = content[:PREVIEW_CHARS]
            if len(content) > PREVIEW_CHARS:
                preview += "..."
            print(preview)
            print("-" * 40)

        # Mention the limit only when some chunks were really left out.
        if len(all_doc_ids) > MAX_CHUNKS_TO_SHOW:
            print(f"\n... (Displaying details for the first {MAX_CHUNKS_TO_SHOW} chunks. Modify this cell to see more.)")
else:
    print("Vector store not loaded. Cannot inspect chunks.")