# Imports and project path setup

In [None]:
import sys
import os
import pandas as pd

In [2]:
# Resolve the directory one level above the current working directory and
# register it on the import path (once) so the `src` package can be imported.
parent_of_cwd = os.path.join(os.getcwd(), "../")
project_root = os.path.abspath(parent_of_cwd)
if project_root not in sys.path:
    sys.path.append(project_root)
from src.embedding.chunking import TextChunker
from src.embedding.embedder import Embedder
from src.embedding.vector_store import VectorStoreManager

# Load cleaned complaints

In [3]:
# Read the processed complaints CSV into a DataFrame. The path is relative,
# so it is resolved against the current working directory.
complaints_csv = "../data/processed/filtered_complaints.csv"
df = pd.read_csv(complaints_csv)

# Add Complaint ID for metadata traceability

In [4]:
# Expose each row's index label as an explicit ComplaintID column so chunk
# metadata can point back to the row it was derived from.
df = df.assign(ComplaintID=df.index)

# Initialize the text chunker and embedder

In [5]:
# Chunking parameters handed to TextChunker.
# NOTE(review): units (characters vs. tokens) are defined by TextChunker — confirm.
CHUNK_SIZE = 256
CHUNK_OVERLAP = 32

chunker = TextChunker(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Embedder is constructed with its default configuration.
embedder = Embedder()

# Collect all chunks and aligned metadata

In [6]:
# Build two parallel lists: all_chunks[i] is the text to embed and metadata[i]
# is the record describing which complaint that chunk came from.
all_chunks = []
metadata = []

# Walk only the three needed columns in lockstep instead of df.iterrows(),
# which constructs a full Series object for every row.
for complaint_id, product, narrative in zip(
    df["ComplaintID"], df["Product"], df["Cleaned Narrative"]
):
    # read_csv loads empty fields as NaN (a float). Such a row has no text to
    # chunk, so skip it rather than hand a non-string to the chunker.
    if not isinstance(narrative, str):
        continue

    # Chunk the cleaned narrative
    for chunk in chunker.chunk_texts([narrative]):
        all_chunks.append(chunk)
        metadata.append({
            "complaint_id": complaint_id,
            "product": product,
            "text": chunk,
        })

# Embed all chunks

In [7]:
# Embed every collected chunk in a single call.
# NOTE(review): the result is presumably row-aligned with all_chunks (and
# therefore with metadata) — confirm against Embedder.embed_texts.
embeddings = embedder.embed_texts(all_chunks)

Batches:   0%|          | 0/13340 [00:00<?, ?it/s]

# Save to vector store

In [8]:
# The store's dimensionality is taken from the second axis of the embeddings.
embedding_dim = embeddings.shape[1]
store = VectorStoreManager(dim=embedding_dim)

# Hand the vectors and their metadata records to the store, then save it.
store.add_embeddings(embeddings, metadata)
store.save()

print(" Vector index and metadata saved.")

 Vector index and metadata saved.
