# Build FAISS Vector Store for Product and Category Data using Hugging Face Embeddings

This notebook loads category and product data exported from PostgreSQL and builds a FAISS vector store for RAG queries. The vector store enables semantic search over product information using lightweight Hugging Face embedding models that run well on a Mac M1.

## Import Required Libraries

In [None]:
# Import necessary libraries
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Dict, List, Any

# LangChain components
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Install required packages if needed (%pip targets the running kernel's environment)
# %pip install sentence-transformers huggingface-hub faiss-cpu langchain psutil tqdm

## Initialize Hugging Face Embeddings

We'll use a lightweight model that works well on Mac M1.

In [None]:
# Embedding model choice. Candidates considered:
# - 'all-MiniLM-L6-v2': Small (80MB) and fast, 384 dimensions
# - 'paraphrase-multilingual-MiniLM-L12-v2': Good for multilingual (Vietnamese), 384 dimensions
# - 'all-mpnet-base-v2': Better quality but larger (420MB), 768 dimensions
#
# The multilingual MiniLM model is used as the best balance of size and
# quality for Vietnamese content.
model_name = "paraphrase-multilingual-MiniLM-L12-v2"

# Prefer Apple's Metal backend (MPS) when it is usable and fall back to the CPU
# otherwise, so the notebook does not fail on machines without MPS support.
try:
    import torch  # local import: only needed for device detection

    device = "mps" if torch.backends.mps.is_available() else "cpu"
except (ImportError, AttributeError):
    # torch is missing, or too old to expose torch.backends.mps
    device = "cpu"

# Initialize the Hugging Face embeddings
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}  # Normalize for better similarity search
)

print(f"Initialized embeddings using model: {model_name}")
print(f"Embedding device: {device}")

Initialized embeddings using model: paraphrase-multilingual-MiniLM-L12-v2


## Load Categories and Products Data

In [None]:
# File paths of the JSON exports
categories_file = "categories_202504171616.json"
products_file = "products_202504171616.json"


def load_records(file_path: str, key: str) -> list:
    """Load the list stored under ``key`` in a JSON export file.

    Loading is best-effort: a missing or unreadable file, invalid JSON, or an
    unexpected structure is reported with a printed message and yields an
    empty list instead of raising.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        key: Top-level key that holds the records. It is also used as the
            label in the printed status messages.

    Returns:
        The list found under ``key``, or ``[]`` when it cannot be loaded or
        the key is missing/null.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading {key} file: {e}")
        return []

    if not isinstance(payload, dict):
        print(
            f"Error loading {key} file: expected a JSON object at the top level, "
            f"got {type(payload).__name__}"
        )
        return []

    # A missing key and an explicit null are both treated as "no records".
    records = payload.get(key) or []
    if not isinstance(records, list):
        print(
            f"Error loading {key} file: expected a list under '{key}', "
            f"got {type(records).__name__}"
        )
        return []

    print(f"Successfully loaded {len(records)} {key} from {file_path}")
    return records


# Load categories data
categories = load_records(categories_file, "categories")

# Load products data
products = load_records(products_file, "products")

Successfully loaded 182 categories from categories_202504171616.json
Successfully loaded 3669 products from products_202504171616.json
Successfully loaded 3669 products from products_202504171616.json


## Prepare Data for Vector Store

We'll create Document objects from products and categories for embedding generation.

In [None]:
def _format_specifications(specs: Any) -> List[str]:
    """Return the text lines describing ``specs`` (an empty list if there are none).

    ``specs`` may be a JSON string or an already-decoded dict. A string that is
    not valid JSON, or that decodes to something other than an object, is
    emitted verbatim on a single line.
    """
    if not specs:
        return []

    if isinstance(specs, str):
        try:
            decoded = json.loads(specs)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return [f"Specifications: {specs}"]
        if not isinstance(decoded, dict):
            # e.g. a JSON list or bare string: there are no key/value pairs to list
            return [f"Specifications: {specs}"]
        specs = decoded

    if not isinstance(specs, dict):
        return []

    return ["Specifications:"] + [f"- {key}: {value}" for key, value in specs.items()]


def _first_image(images: Any) -> str:
    """Return the first image reference, or ``""`` when there is none.

    Accepts a comma-separated string, or a list/tuple whose first item is a
    string. Any other value (including ``None``) yields ``""``.
    """
    if isinstance(images, str):
        return images.split(",")[0]
    if isinstance(images, (list, tuple)) and images and isinstance(images[0], str):
        return images[0]
    return ""


def create_product_document(product: Dict) -> Document:
    """Convert a product record to a Document for embedding.

    The page content is a multi-line text block: name, description, then
    price, specifications and tags when they are present. Selected fields are
    copied into the metadata unchanged.

    Args:
        product: Product record as a dict. Missing or null fields are
            tolerated.

    Returns:
        A Document whose ``metadata["source"]`` is ``"product"`` and whose
        ``metadata["featured_image"]`` is the first entry of ``images``
        (``""`` when there is none).
    """
    # `or` (rather than a .get default) also covers keys that are present but
    # null, which would otherwise be rendered as the literal text "None".
    lines = [
        f"Product Name: {product.get('name') or 'Unknown'}",
        f"Description: {product.get('description') or 'No description'}",
    ]

    # Add price information
    price = product.get("price")
    if price:
        lines.append(f"Price: {price} {product.get('currency') or 'VND'}")

    # Add specifications if available
    lines.extend(_format_specifications(product.get("specifications")))

    # Add tags if available
    tags = product.get("tags")
    if tags:
        lines.append(f"Tags: {tags}")

    # Create metadata for the document
    metadata = {
        "id": product.get("id"),
        "name": product.get("name"),
        "slug": product.get("slug"),
        "price": product.get("price"),
        "currency": product.get("currency"),
        "category_id": product.get("category_id"),
        "source": "product",
        "rating": product.get("rating"),
        "is_active": product.get("is_active"),
        "stock_quantity": product.get("stock_quantity"),
        # A null "images" value used to raise AttributeError on .split()
        "featured_image": _first_image(product.get("images")),
    }

    # Every line is newline-terminated, including the last one.
    return Document(page_content="\n".join(lines) + "\n", metadata=metadata)


def create_category_document(category: Dict) -> Document:
    """Build the embedding Document for a single category record.

    The text lists the category name, URL path and product count, using
    placeholder values for keys that are absent. Identifying fields are copied
    into the metadata and ``source`` is set to ``"category"``.
    """
    # (label, value) pairs rendered one per line, each newline-terminated
    labelled_fields = (
        ("Category Name", category.get('name', 'Unknown')),
        ("Path", category.get('path_url', 'No path')),
        ("Products Count", category.get('products_count', 0)),
    )
    text = "".join(f"{label}: {value}\n" for label, value in labelled_fields)

    # Fields copied verbatim from the record; "source" marks the document type
    copied_fields = ("id", "name", "path_url", "parent_id", "is_leaf")
    meta = {field: category.get(field) for field in copied_fields}
    meta["source"] = "category"

    return Document(page_content=text, metadata=meta)

In [None]:
# Build one Document per product record (tqdm shows progress)
print("Converting products to documents...")
product_documents = list(map(create_product_document, tqdm(products)))
print(f"Created {len(product_documents)} product documents")

# Build one Document per category record
print("\nConverting categories to documents...")
category_documents = list(map(create_category_document, tqdm(categories)))
print(f"Created {len(category_documents)} category documents")

# Single list for indexing: products first, then categories
all_documents = [*product_documents, *category_documents]
print(f"\nTotal documents: {len(all_documents)}")

Converting products to documents...


100%|██████████| 3669/3669 [00:00<00:00, 6271.33it/s]


Created 3669 product documents

Converting categories to documents...


100%|██████████| 182/182 [00:00<00:00, 58276.46it/s]


Created 182 category documents

Total documents: 3851


## Create and Save the FAISS Vector Store

Documents are embedded and added to the index in batches to keep peak memory usage low on a Mac M1.

In [24]:
# Directory to save the FAISS index
faiss_index_path = "./faiss_index"

# Number of documents embedded per batch; adjust based on available memory
batch_size = 50

try:
    # Fail with a clear message instead of an AttributeError on None further down.
    if not all_documents:
        raise ValueError("no documents to index; check that the data files were loaded")

    print(f"Creating FAISS index with {len(all_documents)} documents in batches of {batch_size}...")

    # Process documents in batches to be more memory-efficient
    vectorstore = None
    for start in tqdm(range(0, len(all_documents), batch_size)):
        batch = all_documents[start:start + batch_size]

        if vectorstore is None:
            # The first batch creates the store
            vectorstore = FAISS.from_documents(batch, embeddings)
        else:
            # Embed the batch and append it to the existing store directly,
            # rather than building a throwaway store per batch and merging it.
            vectorstore.add_documents(batch)

    # Save the vector store to disk
    print(f"Saving FAISS index to {faiss_index_path}...")
    vectorstore.save_local(faiss_index_path)

    print("FAISS index created and saved successfully!")
except Exception as e:
    # Deliberately broad: the cell reports the failure and lets the notebook continue.
    print(f"Error creating FAISS index: {e}")

Creating FAISS index with 3851 documents in batches of 50...


100%|██████████| 78/78 [00:42<00:00,  1.85it/s]


Saving FAISS index to ./faiss_index...
FAISS index created and saved successfully!


## Save Refined Product Details

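Save the product details for future use in the agent.

**TODO:** This section has no code cell yet — the product details are not actually saved anywhere in this notebook.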

## Test the Vector Store with Sample Queries

In [20]:
# Helper that re-opens the index saved on disk
def load_vectorstore(index_path: str = faiss_index_path):
    """Load a saved FAISS vector store using the notebook's embeddings object.

    Args:
        index_path: Directory containing the saved index. The default is the
            value ``faiss_index_path`` had when this function was defined.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    # NOTE(review): the flag name indicates unsafe (presumably pickle-based)
    # deserialization — only load index directories from a trusted source.
    load_options = {"allow_dangerous_deserialization": True}
    return FAISS.load_local(index_path, embeddings, **load_options)


# Reload the saved index and run one sample query against it
try:
    vectorstore = load_vectorstore()
    print("FAISS vector store loaded successfully!")

    # Sample query (Vietnamese)
    query = "sản phẩm thủ công mỹ nghệ bằng gỗ"
    results = vectorstore.similarity_search_with_score(query, k=3)

    print(f"\nSearch results for query: '{query}'\n")
    separator = "-" * 80
    # NOTE(review): in the saved output the scores increase with rank, so the
    # score looks like a distance (lower = closer) — confirm.
    for rank, (doc, score) in enumerate(results, start=1):
        meta = doc.metadata
        print(f"Result {rank}: Score = {score}")
        print(f"Source: {meta['source']}")
        print(f"Name: {meta['name']}")
        print(f"Content: {doc.page_content[:200]}...")  # first 200 characters only
        print(separator)
except Exception as e:
    print(f"Error testing vector store: {e}")

FAISS vector store loaded successfully!

Search results for query: 'sản phẩm thủ công mỹ nghệ bằng gỗ'

Result 1: Score = 0.6046595573425293
Source: product
Name: Hộp đựng đũa bằng gỗ MNV-SMTR-HD07
Content: Product Name: Hộp đựng đũa bằng gỗ MNV-SMTR-HD07
Description: Hộp đựng đũa bằng gỗ có độ bền cao theo thời gian, giúp bảo quản đũa ăn một cách vệ sinh và an toàn. Sản phẩm này ngăn chặn hiệu quả sự xâ...
--------------------------------------------------------------------------------
Result 2: Score = 0.6073907017707825
Source: product
Name: Đũa Đầu Nhựa 1p Mun MNV-MNTD08-1
Content: Product Name: Đũa Đầu Nhựa 1p Mun MNV-MNTD08-1
Description: Đũa gỗ sơn mài với màu sắc tinh tế và họa tiết trang nhã theo phong cách truyền thống Việt Nam. Sản phẩm được chế tác công phu bởi những ngư...
--------------------------------------------------------------------------------
Result 3: Score = 0.6092035174369812
Source: product
Name: Lịch gỗ note book MNV-QTN25-2
Content: Product Name: Lịch gỗ no

## Performance Comparison

Let's measure the process memory usage and the time needed to embed a few sample texts.

In [21]:
import time
import psutil

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    # bytes -> KB -> MB
    return rss_bytes / 1024 / 1024

# Sample texts (Vietnamese) used to time a single embedding call
sample_texts = [
    "Sản phẩm thủ công mỹ nghệ bằng gỗ",
    "Trang trí nội thất cao cấp",
    "Đồ lưu niệm truyền thống Việt Nam"
]

print(f"Memory usage before embeddings: {get_memory_usage():.2f} MB")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
embeddings_results = embeddings.embed_documents(sample_texts)
end_time = time.perf_counter()

print(f"Memory usage after embeddings: {get_memory_usage():.2f} MB")
print(f"Time to embed {len(sample_texts)} texts: {end_time - start_time:.4f} seconds")
print(f"Embedding dimension: {len(embeddings_results[0])}")

Memory usage before embeddings: 39.27 MB
Memory usage after embeddings: 127.64 MB
Time to embed 3 texts: 2.5931 seconds
Embedding dimension: 384
