## Indexing Tables

In [1]:
# %pip install chromadb sentence-transformers scikit-learn

In [2]:
import chromadb
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

# Sentence-embedding model shared by the indexing and query steps below
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
def get_price_range(price):
    """
    Map a product price onto the coarse label stored as filter metadata.

    Args:
        price (float): The product price

    Returns:
        str: One of "budget", "affordable", "mid-range", "premium" or "luxury"
    """
    # Exclusive upper bounds, checked in ascending order
    tiers = (
        (10, "budget"),
        (50, "affordable"),
        (200, "mid-range"),
        (1000, "premium"),
    )
    for upper_bound, label in tiers:
        if price < upper_bound:
            return label

    # Nothing matched: the price is at or above the highest bound
    return "luxury"

# %%
def separate_column_types(dataframe):
    """
    Split the product table into its text, numeric and boolean column groups.

    Args:
        dataframe: DataFrame holding the 'name', 'description', 'category',
            'price', 'rating', 'review_count' and 'in_stock' columns

    Returns:
        tuple: (text_data, numeric_data, boolean_data), each a DataFrame
            restricted to the columns of its group
    """
    # Order matters: callers unpack text, numeric, boolean in that sequence
    column_groups = (
        ['name', 'description', 'category'],
        ['price', 'rating', 'review_count'],
        ['in_stock'],
    )
    return tuple(dataframe[group] for group in column_groups)

In [4]:
def encode_text_embeddings(model, text_data):
    """
    Embed each product's text fields as a single descriptive sentence.

    Args:
        model: Object exposing encode(list_of_strings)
        text_data: DataFrame with 'name', 'description' and 'category' columns

    Returns:
        Whatever model.encode returns for the list of per-row strings
    """
    # One combined string per row, in row order
    sentences = [
        f"Product: {record['name']}. Description: {record['description']}. Category: {record['category']}"
        for _, record in text_data.iterrows()
    ]
    return model.encode(sentences)

def encode_numeric_data(numeric_data):
    """
    Scale the numeric columns with a freshly fitted StandardScaler.

    Args:
        numeric_data: Table of numeric feature columns

    Returns:
        The output of StandardScaler.fit_transform on numeric_data
    """
    # The fitted scaler is not kept; only the transformed values are returned
    return StandardScaler().fit_transform(numeric_data)

def encode_boolean_data(boolean_data):
    """
    Turn boolean columns into an array of 0/1 integers.

    Args:
        boolean_data: DataFrame of boolean columns

    Returns:
        The underlying array of the integer-cast data
    """
    as_integers = boolean_data.astype(int)
    return as_integers.values

In [5]:
def create_hybrid_embeddings(text_embeddings, numeric_data, boolean_data):
    """
    Build one combined vector per row by concatenating its three feature groups.

    Args:
        text_embeddings: Sequence of per-row text vectors
        numeric_data: Per-row numeric feature vectors, indexable by row position
        boolean_data: Per-row boolean feature vectors, indexable by row position

    Returns:
        np.ndarray: One row per entry of text_embeddings, laid out as
            [text vector | numeric features | boolean features]
    """
    # The number of rows is driven by text_embeddings; the other two inputs
    # are looked up by the same position.
    combined_rows = [
        np.concatenate([text_vector, numeric_data[position], boolean_data[position]])
        for position, text_vector in enumerate(text_embeddings)
    ]
    return np.array(combined_rows)

# Calculate standardized median values of the numeric features
def calculate_median_values(dataframe):
    """
    Return the median of each numeric feature, expressed in standardized units.

    A new StandardScaler is fitted on the numeric columns of `dataframe` and
    then applied to the column medians. The result matches the scaling used
    for the indexed rows only when `dataframe` is the same table that was
    scaled at indexing time.

    Args:
        dataframe: DataFrame with 'price', 'rating' and 'review_count' columns

    Returns:
        np.ndarray: The three scaled medians, in the column order above
    """
    numeric_columns = ['price', 'rating', 'review_count']
    numeric_data = dataframe[numeric_columns]

    scaler = StandardScaler()
    scaler.fit(numeric_data)

    # Keep the medians in a one-row DataFrame carrying the same column names
    # the scaler was fitted with; passing a bare list here makes scikit-learn
    # warn about missing feature names on every call.
    median_frame = numeric_data.median().to_frame().T
    normalized_median = scaler.transform(median_frame)[0]

    return normalized_median

In [6]:
def store_hybrid_embeddings(client, dataframe, hybrid_embeddings):
    """
    Add the hybrid embeddings to the "product-catalog" collection in batches.

    Each vector is stored under the stringified 'product_id' of the row at the
    same position, together with filterable metadata (price_range, category,
    rating, in_stock).

    Args:
        client: ChromaDB client exposing get_or_create_collection
        dataframe: DataFrame with 'product_id', 'price', 'category', 'rating'
            and 'in_stock' columns; rows are matched to embeddings by position
        hybrid_embeddings: Array of vectors, one per DataFrame row

    Returns:
        None
    """
    # Create or get the collection
    collection = client.get_or_create_collection(
        name="product-catalog",
        metadata={"hnsw:space": "cosine"}
    )

    batch_size = 100
    total = len(hybrid_embeddings)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        # Slice rows by position so IDs, metadata and vectors stay aligned
        # even when the DataFrame index is not 0..n-1 (e.g. after filtering).
        batch = dataframe.iloc[start:end]
        ids = [str(product_id) for product_id in batch['product_id'].tolist()]
        vectors = hybrid_embeddings[start:end].tolist()

        # Include metadata for filtering; cast to plain Python types
        metadata = [
            {
                'price_range': get_price_range(price),
                'category': category,
                'rating': float(rating),
                'in_stock': bool(in_stock)
            }
            for price, category, rating, in_stock in zip(
                batch['price'], batch['category'], batch['rating'], batch['in_stock']
            )
        ]

        # Add to ChromaDB
        collection.add(
            ids=ids,
            embeddings=vectors,
            metadatas=metadata
        )

In [7]:
def process_query(model, client, df, query, filters=None, n_results=10):
    """
    Run a text query against the "product-catalog" collection.

    The query vector is the text embedding of `query`, followed by the
    standardized medians of the numeric columns of `df`, followed by a single
    1 for the in-stock flag.

    Args:
        model: Object exposing encode(list_of_strings)
        client: ChromaDB client exposing get_collection
        df: DataFrame passed to calculate_median_values for the numeric part
        query (str): Free-text search query
        filters (dict, optional): Metadata conditions as {field: condition};
            several entries are combined with "$and"
        n_results (int): Maximum number of matches to request (default 10)

    Returns:
        The result object returned by collection.query
    """
    # Generate text embedding for the query
    text_embedding = model.encode([query])[0]
    median_numeric = calculate_median_values(df)
    boolean_embedding = np.array([1])  # Assuming you are looking for in stock products

    # Create hybrid embedding for the query (same layout as the stored vectors)
    query_embedding = np.concatenate([
        text_embedding,
        median_numeric,
        boolean_embedding
    ])

    # Get the collection
    collection = client.get_collection("product-catalog")

    # Convert filters to ChromaDB format: a single condition is passed as-is,
    # multiple conditions are wrapped in an explicit $and.
    where = None
    if filters:
        where_conditions = [{key: value} for key, value in filters.items()]
        if len(where_conditions) > 1:
            where = {"$and": where_conditions}
        else:
            where = where_conditions[0]

    # Query the vector database with metadata filtering
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=n_results,
        where=where
    )

    return results

def print_results(results, df):
    """
    Print the results of a query in a readable format.

    Args:
        results: The results from ChromaDB. Read as a mapping: results['ids'][0]
            lists the matched IDs and, when available, results['distances'][0]
            lists the matching distances.
        df: The original DataFrame with product information; must contain
            'product_id', 'name', 'category', 'price', 'rating',
            'review_count' and 'in_stock' columns
    """
    if not results or not results['ids'] or not results['ids'][0]:
        print("No results found.")
        return

    # Stored IDs are strings, so compare against the stringified column
    # instead of forcing every ID through int(), which fails for
    # non-numeric product IDs.
    catalog_ids = df['product_id'].astype(str)

    # The key can be present but empty/None when distances were not returned
    distances = results['distances'] if 'distances' in results else None

    for i, product_id in enumerate(results['ids'][0]):
        matches = df[catalog_ids == str(product_id)]
        if matches.empty:
            # A stored ID with no row in df should not abort the listing
            print(f"Product ID {product_id} not found in DataFrame.")
            print("-" * 50)
            continue

        product = matches.iloc[0]
        distance = distances[0][i] if distances else "N/A"

        print(f"Product: {product['name']}")
        print(f"Category: {product['category']}")
        print(f"Price: ${product['price']:.2f}")
        print(f"Rating: {product['rating']} ({product['review_count']} reviews)")
        print(f"In Stock: {'Yes' if product['in_stock'] else 'No'}")
        print(f"Distance: {distance}")
        print("-" * 50)

In [8]:
# Sample catalog: five products, each carrying the text, numeric and boolean
# fields that the helper functions above read by column name.
products = [
    {
        'product_id': 1,
        'name': 'Gaming Laptop Pro',
        'description': 'High-performance gaming laptop with RTX 3080, 32GB RAM, and 1TB SSD',
        'category': 'Electronics',
        'price': 2499.99,
        'rating': 4.8,
        'review_count': 156,
        'in_stock': True
    },
    {
        'product_id': 2,
        'name': 'Wireless Noise-Cancelling Headphones',
        'description': 'Premium wireless headphones with active noise cancellation and 30-hour battery life',
        'category': 'Electronics',
        'price': 349.99,
        'rating': 4.6,
        'review_count': 243,
        'in_stock': True
    },
    {
        'product_id': 3,
        'name': 'Smart Home Security Camera',
        'description': '1080p HD security camera with night vision and motion detection',
        'category': 'Smart Home',
        'price': 79.99,
        'rating': 4.3,
        'review_count': 89,
        'in_stock': False
    },
    {
        'product_id': 4,
        'name': 'Organic Coffee Beans',
        'description': 'Fair-trade organic coffee beans, medium roast, 1kg bag',
        'category': 'Food & Beverages',
        'price': 24.99,
        'rating': 4.7,
        'review_count': 312,
        'in_stock': True
    },
    {
        'product_id': 5,
        'name': 'Yoga Mat',
        'description': 'Non-slip yoga mat with carrying strap, eco-friendly materials',
        'category': 'Sports & Fitness',
        'price': 39.99,
        'rating': 4.5,
        'review_count': 178,
        'in_stock': True
    }
]

# Convert to DataFrame (one row per product, default 0..n-1 index)
df = pd.DataFrame(products)

# Client handle reused by the storage and query helpers in this notebook
client = chromadb.Client()

# Process the data: split columns by type, then encode each group separately
text_data, numeric_data, boolean_data = separate_column_types(df)
text_embeddings = encode_text_embeddings(model, text_data)
normalized_numeric = encode_numeric_data(numeric_data)
binary_encoded = encode_boolean_data(boolean_data)

# Create hybrid embeddings (text vector + scaled numerics + in-stock flag per row)
hybrid_embeddings = create_hybrid_embeddings(text_embeddings, normalized_numeric, binary_encoded)

# Store in ChromaDB under the "product-catalog" collection
store_hybrid_embeddings(client, df, hybrid_embeddings)

# Example queries
# Both filters are combined with $and inside process_query
print("Query 1: Top Notebook for Gaming and Work")
results = process_query(model, client, df, "Top Notebook for Gaming and Work", filters={"price_range": "luxury", "in_stock": True})
print_results(results, df)

Query 1: Top Notebook for Gaming and Work
Product: Gaming Laptop Pro
Category: Electronics
Price: $2499.99
Rating: 4.8 (156 reviews)
In Stock: Yes
Distance: 0.8110852241516113
--------------------------------------------------




In [9]:
print("\nQuery 2: In-stock items under $50")
# "Under $50" spans two get_price_range labels: "budget" (< 10) and
# "affordable" (10 to < 50). Matching "affordable" alone would silently drop
# every product priced below $10, so accept either label.
results = process_query(
    model,
    client,
    df,
    "affordable products",
    filters={"price_range": {"$in": ["budget", "affordable"]}, "in_stock": True},
)
print_results(results, df)


Query 2: In-stock items under $50
Product: Yoga Mat
Category: Sports & Fitness
Price: $39.99
Rating: 4.5 (178 reviews)
In Stock: Yes
Distance: 0.3542284369468689
--------------------------------------------------
Product: Organic Coffee Beans
Category: Food & Beverages
Price: $24.99
Rating: 4.7 (312 reviews)
In Stock: Yes
Distance: 0.6221795678138733
--------------------------------------------------


