In [8]:
"""02 - Embedding & Vector Store
This notebook computes sentence embeddings from the cleaned transactions data 
and saves the results as a FAISS index (vector store) along with metadata. 
This is Step 2 of the RAG pipeline.
"""

'02 - Embedding & Vector Store\nThis notebook computes sentence embeddings from the cleaned transactions data \nand saves the results as a FAISS index (vector store) along with metadata. \nThis is Step 2 of the RAG pipeline.\n'

In [11]:
import sys
from pathlib import Path

# Make the project root importable so that `src` resolves as a package.
# Note that this adds the directory expected to *contain* src/ (the parent of
# the current working directory), not src/ itself, so it assumes the kernel
# was started from the notebook's own folder.
PROJECT_ROOT = Path().resolve().parent
if str(PROJECT_ROOT) not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(str(PROJECT_ROOT))

from src.vector_store import build_and_save_vector_store, load_existing_vector_store, simple_query

print("Successfully imported vector_store module.")

Successfully imported vector_store module.


In [15]:
# Input and output locations for this step of the pipeline.
# All three are relative paths, so they depend on the kernel's working directory.
PROCESSED_CSV = Path("../data/processed/embedding_ready_transactions.csv")
VECTOR_STORE_FILE = Path("../data/processed/vector_store.faiss")  # FAISS index file
METADATA_FILE = Path("../data/processed/metadata.pkl")

# Echo the absolute locations so a wrong working directory is easy to spot.
print(
    f"Processed CSV:     {PROCESSED_CSV.resolve()}",
    f"Vector store file: {VECTOR_STORE_FILE.resolve()}",
    f"Metadata file:     {METADATA_FILE.resolve()}",
    sep="\n",
)

Processed CSV:     C:\Users\dhoward\howard-financial\data\processed\embedding_ready_transactions.csv
Vector store file: C:\Users\dhoward\howard-financial\data\processed\vector_store.faiss
Metadata file:     C:\Users\dhoward\howard-financial\data\processed\metadata.pkl


In [16]:
# Build the vector store from the processed CSV and write both artifacts
# (the FAISS index and the metadata file) to the paths configured earlier.
build_and_save_vector_store(
    model_name="all-MiniLM-L6-v2",
    text_col="text",  # same as the default; spelled out for clarity
    processed_csv=PROCESSED_CSV,
    vector_store_file=VECTOR_STORE_FILE,
    metadata_file=METADATA_FILE,
)

🔄 Loading processed data from ..\data\processed\embedding_ready_transactions.csv...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


🔄 Embedding 102 records using model 'all-MiniLM-L6-v2'...


Batches: 100%|██████████| 4/4 [00:00<00:00, 15.93it/s]

✅ Saved FAISS index to C:\Users\dhoward\howard-financial\data\processed\vector_store.faiss
✅ Saved metadata to C:\Users\dhoward\howard-financial\data\processed\metadata.pkl
✅ Vector store built and saved successfully.





In [19]:
# Reload the saved artifacts to confirm that both files can be read back,
# then preview the metadata.
# NOTE(review): the metadata file is a .pkl; if the loader unpickles it, only
# open files produced by this pipeline, because unpickling can execute arbitrary code.
index, metadata = load_existing_vector_store(
    vector_store_file=VECTOR_STORE_FILE,
    metadata_file=METADATA_FILE
)

# Fail fast on a stale or mismatched pair of files. Query hits are presumably
# mapped back to metadata rows by position (TODO confirm in src/vector_store),
# in which case a size mismatch would silently return the wrong rows.
assert index.ntotal == len(metadata), (
    f"FAISS index holds {index.ntotal} vectors but metadata has {len(metadata)} rows"
)

print(f"✅ FAISS index loaded with {index.ntotal} vectors.")
print("Metadata sample:")
display(metadata.head())

✅ Loaded FAISS index and metadata successfully.
✅ FAISS index loaded with 102 vectors.
Metadata sample:


Unnamed: 0,text,Dollars,Date,Account,Type,Category,Vendor,Tags,Amount,weekday,is_weekend,is_fixed
0,"On 2025-04-01, you spent $775.00 at 6640 pasad...",($775.00),2025-04-01,usaa s,expense,rent,6640 pasado,,775.0,Tuesday,False,True
1,"On 2025-04-01, you spent $11.00 at coral tree ...",($11.00),2025-04-01,usaa c,expense,food,coral tree,eating out,11.0,Tuesday,False,False
2,"On 2025-04-02, you spent $5.99 at spotify (Cat...",($5.99),2025-04-02,usaa c,expense,subscription,spotify,,5.99,Wednesday,False,True
3,"On 2025-04-03, you spent $5.30 at ucsb (Catego...",($5.30),2025-04-03,usaa c,expense,parking,ucsb,,5.3,Thursday,False,False
4,"On 2025-04-07, a transaction of $83.99 occurre...",$83.99,2025-04-07,usaa c,reimburstment,lodging,slo hotel,,-83.99,Monday,False,False


In [22]:
# Smoke test: send one natural-language question through the loaded index
# and inspect the five best matches.
query_text = "how much did I spend on food at blenders?"

# The model name is the same one passed to build_and_save_vector_store earlier;
# keep the two in sync when changing models.
results = simple_query(
    top_k=5,
    model_name="all-MiniLM-L6-v2",
    metadata=metadata,
    index=index,
    query_text=query_text,
)

print(f"🔎 Top results for query: '{query_text}'")
display(results)

🔎 Top results for query: 'how much did I spend on food at blenders?'


Unnamed: 0,text,Dollars,Date,Account,Type,Category,Vendor,Tags,Amount,weekday,is_weekend,is_fixed,similarity_score
0,"On 2025-04-08, you spent $9.00 at blenders (Ca...",($9.00),2025-04-08,usaa c,expense,food,blenders,eating out,9.0,Tuesday,False,False,0.573938
1,"On 2025-04-14, you spent $17.23 at pizza (Cate...",($17.23),2025-04-14,usaa c,expense,food,pizza,"yosemite, eating out, fast food",17.23,Monday,False,False,0.490238
2,"On 2025-04-09, you spent $7.59 at costco (Cate...",($7.59),2025-04-09,usaa c,expense,food,costco,"eating out, fast food",7.59,Wednesday,False,False,0.472417
3,"On 2025-04-14, you spent $8.37 at convenience ...",($8.37),2025-04-14,usaa c,expense,food,convenience store,"yosemite, beverage",8.37,Monday,False,False,0.469631
4,"On 2025-04-23, you spent $27.06 at kyles kitch...",($27.06),2025-04-23,usaa c,expense,food,kyles kitchen,"eating out, fast food",27.06,Wednesday,False,False,0.466078
