# Toyota Specifications: PDF Ingestion with Text Splitting

Load the Toyota specification PDFs, split their text into overlapping chunks, embed the chunks, and store them in a ChromaDB collection.

In [None]:
# Setup
import os
from dotenv import load_dotenv
from pathlib import Path

# load_dotenv() returns False when the file is missing or defines no
# variables, so report that instead of unconditionally claiming success.
if load_dotenv("../../.env"):
    print("✅ Environment loaded")
else:
    print("⚠️ No variables loaded from ../../.env (file missing or empty?)")

In [None]:
# Configuration: folder holding the source PDFs, directory where ChromaDB
# persists its data, and the name of the collection that receives the chunks.
DATA_DIR = "../../data/toyota-specs"
PERSIST_DIR = "../../chroma_db"
COLLECTION_NAME = "toyota_specs"

# Echo the active settings, one per line.
print(
    f"DATA_DIR: {DATA_DIR}",
    f"PERSIST_DIR: {PERSIST_DIR}",
    f"COLLECTION: {COLLECTION_NAME}",
    sep="\n",
)

In [None]:
# Load PDFs
from langchain_community.document_loaders import PyPDFLoader

# Sorted by path so the files are always loaded in the same order.
pdf_files = sorted(Path(DATA_DIR).glob("*.pdf"))
if not pdf_files:
    # Nothing matched (wrong DATA_DIR or no PDFs in it). Stop here rather than
    # letting the later cells drop the existing collection and ingest nothing.
    raise FileNotFoundError(f"No PDF files found in {Path(DATA_DIR).resolve()}")
print(f"Found {len(pdf_files)} PDFs:")

documents = []
for pdf_path in pdf_files:
    print(f"  Loading {pdf_path.name}...")
    loader = PyPDFLoader(str(pdf_path))
    docs = loader.load()

    # Overwrite the "source" metadata with the bare file name, which is what
    # the search cell prints for each hit.
    for doc in docs:
        doc.metadata["source"] = pdf_path.name

    documents.extend(docs)

print(f"\n✅ Loaded {len(documents)} pages from {len(pdf_files)} PDFs")

In [None]:
# Split into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Single source of truth for the splitter settings, so the summary printed
# below always matches what the splitter was actually given.
CHUNK_SIZE = 1000  # measured with len(), i.e. in characters
CHUNK_OVERLAP = 200  # characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    # Coarsest to finest: blank line, newline, space, then single characters.
    separators=["\n\n", "\n", " ", ""],
)

chunks = text_splitter.split_documents(documents)

print(f"✅ Split into {len(chunks)} chunks")
print(f"   Chunk size: {CHUNK_SIZE} characters")
print(f"   Chunk overlap: {CHUNK_OVERLAP} characters")

In [None]:
# Initialize Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Named once so the log line below cannot drift from the model actually used.
EMBEDDING_MODEL = "gemini-embedding-001"

embeddings = GoogleGenerativeAIEmbeddings(
    model=EMBEDDING_MODEL,
    output_dimensionality=768,
)
print(f"✅ Using: {EMBEDDING_MODEL}")

In [None]:
from langchain_community.vectorstores import Chroma

In [None]:
# Drop any previous copy of the collection before it is rebuilt.
# Best-effort: a failure is reported but does not stop the notebook.
try:
    vectorstore = Chroma(
        persist_directory=PERSIST_DIR,
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
    )
    vectorstore.delete_collection()
except Exception as err:
    print(f"❌ Error deleting collection: {err}")
else:
    print("✅ Deleted existing collection")

In [None]:
# Build the collection from the chunks using the configured embeddings,
# stored under COLLECTION_NAME and persisted in PERSIST_DIR.
vectorstore = Chroma.from_documents(
    persist_directory=PERSIST_DIR,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
    documents=chunks,
)

# Summary of what was just written.
print(
    f"✅ Created collection: {COLLECTION_NAME}",
    f"   Total chunks: {len(chunks)}",
    f"   Persist directory: {PERSIST_DIR}",
    sep="\n",
)

In [None]:
# Verify collection: report how many records it holds and flag a mismatch
# with the number of chunks that were just ingested.
# NOTE(review): _collection is a private attribute of the vector store
# wrapper and may change between library versions.
count = vectorstore._collection.count()
print(f"Collection count: {count}")
if count != len(chunks):
    print(f"⚠️ Expected {len(chunks)} chunks but the collection holds {count}")

In [None]:
# Smoke-test retrieval: fetch the three closest chunks for a sample query and
# print each hit's source file with the first 150 characters of its text.
query = "fuel efficient sedan"
results = vectorstore.similarity_search(query, k=3)

print(f"Test query: '{query}'\n")
for rank, hit in enumerate(results, start=1):
    source_name = hit.metadata.get('source', 'Unknown')
    preview = hit.page_content[:150]
    print(f"[{rank}] {source_name}")
    print(f"    {preview}...\n")