In [1]:
!pip install chromadb



In [2]:
import os
import pandas as pd
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

In [3]:
# Colab only: mount Google Drive at /content/drive. PERSIST_DIR in the config
# cell points below this mount, so the mount must succeed before main() runs.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# ============================================================
# Config
# ============================================================

# Products table read by main() via pd.read_parquet.
PARQUET_PATH = "/content/products.parquet"
# Directory handed to chromadb.PersistentClient; it sits under the Drive mount.
# NOTE: main() calls client.reset() on this store, so existing data in it is erased.
PERSIST_DIR = "/content/drive/MyDrive/GenAI2 Final Project/Data/vector_store"            # Chroma directory
# Name of the collection that main() creates inside the store.
COLLECTION_NAME = "products_toys"

# Model name passed to SentenceTransformer to embed the 'features' text.
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"

In [5]:
def build_features_column(df: pd.DataFrame) -> pd.Series:
    """
    Build the 'features' text field used for embeddings.

    If 'features' already exists in ``df`` it is reused, with missing values
    replaced by empty strings. Otherwise the text is composed, in this order,
    from title, brand, category, ingredients, rating, description, about and
    specification, joined with single spaces. Columns that are absent from
    ``df`` and missing cell values both contribute an empty string.

    Args:
        df: Product table. None of the source columns is required.

    Returns:
        A Series aligned with ``df.index`` holding one text string per row.
    """
    if "features" in df.columns:
        # Reuse existing features column
        return df["features"].fillna("")

    def text_column(name: str) -> pd.Series:
        # df.get(name, "") would return a plain str for an absent column, and
        # str has no .fillna(); fall back to an empty-string Series instead.
        if name in df.columns:
            column = df[name]
        else:
            column = pd.Series("", index=df.index, dtype=object)
        # Blank out missing values BEFORE converting to str: converting first
        # turns NaN into the literal text "nan", which would pollute the text.
        return column.astype(object).where(column.notna(), "").astype(str)

    source_columns = [
        "title",
        "brand",
        "category",
        "ingredients",
        "rating",
        # Some datasets may have 'description' or similar
        "description",
        "about",
        "specification",
    ]

    parts = [text_column(name) for name in source_columns]
    features = parts[0]
    for part in parts[1:]:
        features = features + " " + part

    return features

In [6]:
def _none_if_missing(value):
    """Return None for pandas missing markers (None/NaN/NaT/pd.NA), else ``value`` unchanged."""
    if value is None:
        return None
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _build_metadatas(df: pd.DataFrame) -> list:
    """Build one metadata dict per row of ``df`` (in row order) for the Chroma collection."""
    metadatas = []
    for record in df.to_dict("records"):
        price = _none_if_missing(record.get("price"))
        rating = _none_if_missing(record.get("rating"))
        metadatas.append({
            "id": str(record["id"]),
            # Text fields get the same missing -> None treatment as price/rating,
            # so a NaN never ends up stored under a text key.
            "title": _none_if_missing(record.get("title")),
            "brand": _none_if_missing(record.get("brand")),
            "category": _none_if_missing(record.get("category")),
            "price": float(price) if price is not None else None,
            "rating": float(rating) if rating is not None else None,
            # 'ingredients' is optional in the input, hence .get().
            "ingredients": _none_if_missing(record.get("ingredients")),
        })
    return metadatas


def main():
    """
    Rebuild the Chroma vector store from the products parquet.

    Steps: read PARQUET_PATH, validate and clean the table, build the
    'features' text, prepare ids/documents/metadata, then reset the store at
    PERSIST_DIR, create COLLECTION_NAME and add embeddings in batches.

    All data preparation happens before ``client.reset()`` so that a failure
    while preparing data does not leave the store wiped.

    Raises:
        ValueError: if a required column is missing from the parquet.
    """
    # ========================================================
    # 1. Load products parquet
    # ========================================================

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Using Device: {device}")

    if device == 'cpu':
        print("⚠️ Not GPU")
    else:
        print("✅ GPU")

    df = pd.read_parquet(PARQUET_PATH)
    print("Loaded products parquet:", df.shape)
    print("Columns:", df.columns.tolist())
    print(df.head(3))

    # Basic sanity check for required columns
    required_cols = ["id", "title", "brand", "category", "price"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in dataframe.")

    # Ensure numeric types for price/rating
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    if "rating" not in df.columns:
        df["rating"] = None
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

    # (Optional) filter out rows without valid price
    df = df[df["price"].notna()]
    print("After dropping rows with missing price:", df.shape)

    # Ids must be unique within the collection; resolve duplicates here rather
    # than hitting them halfway through indexing, after the store was reset.
    duplicate_ids = df["id"].astype(str).duplicated(keep="first")
    if duplicate_ids.any():
        print(f"Dropping {int(duplicate_ids.sum())} rows with duplicate ids (keeping first occurrence).")
        df = df[~duplicate_ids]

    # Work on an independent copy so the column assignment below does not
    # write into a filtered view (SettingWithCopyWarning).
    df = df.copy()

    # ========================================================
    # 2. Build features column for embeddings
    # ========================================================
    df["features"] = build_features_column(df)
    print("Sample features text:")
    print(df["features"].head(3).tolist())

    # ========================================================
    # 3. Prepare data for indexing
    # ========================================================
    texts = df["features"].tolist()
    ids = df["id"].astype(str).tolist()
    metadatas = _build_metadatas(df)

    print(f"Number of items to index: {len(ids)}")

    # ========================================================
    # 4. Initialize embedding model & Chroma
    # ========================================================
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

    os.makedirs(PERSIST_DIR, exist_ok=True)

    client = chromadb.PersistentClient(
        path=PERSIST_DIR,
        settings=Settings(allow_reset=True)
    )

    # Reset any existing DB (full rebuild). This is destructive for the whole
    # store at PERSIST_DIR, not only for COLLECTION_NAME.
    client.reset()

    collection = client.create_collection(
        name=COLLECTION_NAME,
        metadata={"description": "Toys/products catalog with rating & ingredients"}
    )

    # ========================================================
    # 5. Index in batches
    # ========================================================
    BATCH_SIZE = 256

    for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Indexing"):
        end = start + BATCH_SIZE
        batch_texts = texts[start:end]
        batch_ids = ids[start:end]
        batch_metas = metadatas[start:end]

        batch_embeds = model.encode(batch_texts, show_progress_bar=False).tolist()

        collection.add(
            ids=batch_ids,
            embeddings=batch_embeds,
            metadatas=batch_metas,
            documents=batch_texts
        )

    print("✅ Indexing complete.")
    print(f"Chroma persisted at: {PERSIST_DIR}")
    print(f"Collection name: {COLLECTION_NAME}")


# Script-style entry point: kicks off the full vector-store rebuild.
if __name__ == "__main__":
    main()

🚀 Using Device: cuda
✅ GPU
Loaded products parquet: (8202, 8)
Columns: ['id', 'title', 'brand', 'category', 'price', 'rating', 'features', 'ingredients']
                                 id  \
0  66d49bbed043f5be260fa9f7fbff5957   
1  2c55cae269aebf53838484b0d7dd931a   
2  18018b6bc416dab347b1b7db79994afa   

                                               title     brand  \
0  Electronic Snap Circuits Mini Kits Classpack, ...    Elenco   
1  3Doodler Create Flexy 3D Printing Filament Ref...  3Doodler   
2  Guillow Airplane Design Studio with Travel Cas...   Guillow   

                                            category  price  rating  \
0  Toys & Games | Learning & Education | Science ...  99.95     4.9   
1          Toys & Games | Arts & Crafts | Craft Kits  34.99     5.0   
2  Toys & Games | Hobbies | Models & Model Kits |...  28.91     5.0   

                                            features  \
0  Snap circuits mini kits classpack provides bas...   
1  Smooth 3D drawing e

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Number of items to index: 7864


Indexing: 100%|██████████| 31/31 [05:11<00:00, 10.06s/it]

✅ Indexing complete.
Chroma persisted at: /content/drive/MyDrive/GenAI2 Final Project/Data/vector_store
Collection name: products_toys



