# Astrabolt - Load Products

This notebook uses **ragstack-ai** and **google-cloud-aiplatform** to connect to Astra DB, create the required collections, and insert the product catalog together with vector embeddings generated by Google's multimodal embedding model (`multimodalembedding@001`).

## Install Dependencies, Authenticate, and Create Collections

In [None]:
!pip install google-cloud-aiplatform ragstack-ai --upgrade

Collecting ragstack-ai
  Downloading ragstack_ai-0.4.0-py3-none-any.whl (4.2 kB)
Collecting astrapy<0.7.0,>=0.6.2 (from ragstack-ai)
  Downloading astrapy-0.6.2-py3-none-any.whl (21 kB)
Collecting cassio<0.2.0,>=0.1.3 (from ragstack-ai)
  Downloading cassio-0.1.3-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community==0.0.3 (from ragstack-ai)
  Downloading langchain_community-0.0.3-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core==0.1.1 (from ragstack-ai)
  Downloading langchain_core-0.1.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain[openai]==0.0.350 (from ragstack-ai)
  Downloading langchain-0.0.350-py3-non

In [None]:
import getpass, os, requests

def _prompt_if_unset(var_name, prompt):
  """Ask for ``var_name`` with a hidden prompt unless it is already set.

  The value is stored in ``os.environ`` so later cells can read it with
  ``os.getenv``. An existing non-empty value is kept, which makes the cell
  safe to re-run. To re-enter a value, delete it from ``os.environ`` first.
  """
  if not os.environ.get(var_name):
    os.environ[var_name] = getpass.getpass(prompt)

_prompt_if_unset("GCP_PROJECT_ID", "Provide your GCP Project ID")
_prompt_if_unset("ASTRA_DB_ENDPOINT", "Provide your Astra DB Endpoint")
_prompt_if_unset("ASTRA_DB_TOKEN", "Provide your Astra DB Token")

Provide your GCP Project ID··········
Provide your Astra DB Endpoint··········
Provide your Astra DB Token··········


In [None]:
from google.colab import auth
from google.cloud import aiplatform

# Point the gcloud CLI at the project stored in GCP_PROJECT_ID
# (IPython substitutes the {...} expression before running the shell command).
!gcloud config set project {os.getenv("GCP_PROJECT_ID")}

# Colab helper that authenticates the current user; presumably this is what lets the
# Vertex AI calls in later cells run with the user's credentials — confirm.
auth.authenticate_user()


Updated property [core/project].


In [None]:
from astrapy.db import AstraDB
# Connect to the Astra DB vector store using the credentials collected earlier.
astra_db = AstraDB(
    token=os.getenv("ASTRA_DB_TOKEN"),
    api_endpoint=os.getenv("ASTRA_DB_ENDPOINT"),
)

# One collection per embedding type (text descriptions and images); both are created
# with the same vector dimension, 1408.
collection_descriptions, collection_images = (
    astra_db.create_collection(collection_name=name, dimension=1408)
    for name in ("product_catalog_descriptions", "product_catalog_images")
)

## Download Product Catalog

In [None]:
# Location of the products.json file from the BestBuyAPIs open-data-set repository;
# the next cell downloads it with wget.
PRODUCT_CATALOG_URI="https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/products.json"

In [None]:
!wget $PRODUCT_CATALOG_URI

--2023-12-28 03:21:20--  https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/products.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39685207 (38M) [text/plain]
Saving to: ‘products.json’


2023-12-28 03:21:21 (167 MB/s) - ‘products.json’ saved [39685207/39685207]



In [None]:
import json
# Number of catalog entries to embed and load. Feel free to modify the size of the
# dataset as needed.
PRODUCT_SLICE_SIZE = 1000

# Read the catalog explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open('products.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
products_slice = data[:PRODUCT_SLICE_SIZE]

## Create and Load Vector Embeddings
We will first create some helper functions for tasks such as downloading images and inserting records into Astra DB. Finally, we will load the data from `products_slice` in batches.

In [None]:
import requests
from PIL import Image
def download_image(image_url, timeout=30): #Downloads images from public bestbuy's URI to local file
  """Download ``image_url`` into the local ``product_images`` folder.

  Args:
    image_url: URL of the image. The text after the last "/" becomes the file name.
    timeout: Seconds to wait on the server before giving up (passed to
      ``requests.get``), so one unresponsive host cannot stall the whole load.

  Returns:
    The relative path of the saved file (``product_images/<file name>``), or
    ``None`` when the image could not be fetched. A message is printed in that case.
  """
  folder_path = "product_images"
  try:
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for error status codes
  except requests.exceptions.HTTPError as err:
    status_code = getattr(err.response, "status_code", None)
    if status_code == 404:
      print("File not found at the specified URL.")
    else:
      print("An error occurred:", err)
    return None
  except requests.exceptions.RequestException as err:
    # Connection failures and timeouts are reported like other download errors
    # instead of aborting the caller's loop.
    print("An error occurred:", err)
    return None

  filename = image_url.rsplit("/", 1)[-1]
  # Create the folder if it doesn't exist
  os.makedirs(folder_path, exist_ok=True)
  file_path = f"{folder_path}/{filename}"
  with open(file_path, "wb") as f:
    f.write(response.content)
  return file_path

In [None]:
def insert_document(collection,document, verbose=0): #loads the document into the specified collection
  """Insert ``document`` into ``collection``, tolerating duplicates.

  Args:
    collection: Object exposing ``insert_one(document)``.
    document: The record to store.
    verbose: When greater than 0, print a message if the document is skipped
      because it already exists.

  Raises:
    Re-raises the original exception from ``insert_one`` when its message is not
    a JSON list of error objects (for example a network failure), so the real
    cause is not hidden behind a JSON parsing error.
  """
  try:
    # add to the AstraDB Vector Database
    collection.insert_one(document)
  except Exception as error:
    try:
      error_info = json.loads(str(error))
      error_code = error_info[0]['errorCode']
    except (ValueError, LookupError, TypeError):
      # The message is not the expected error payload: surface the original error.
      raise error from None
    if error_code == "DOCUMENT_ALREADY_EXISTS":
      # if you've already added this record, skip the error message
      if verbose>0:
        print("Document already exists in the database.  Skipping.")
    else:
      # Keep loading the remaining documents, but do not drop the failure silently.
      print("Failed to insert document:", error)

In [None]:
from langchain.chat_models import ChatVertexAI

# Chat model handle for "gemini-pro-vision" on Vertex AI. `llm` is not referenced by
# any other cell visible in this notebook.
# NOTE(review): "uswest-1" does not follow the usual GCP region naming (e.g. "us-west1"),
# and the keyword may need to be `location` rather than `region` — confirm against the
# ChatVertexAI documentation that this argument takes effect.
llm = ChatVertexAI(project=os.getenv("GCP_PROJECT_ID"), model_name="gemini-pro-vision", region="uswest-1")


In [None]:
import tqdm, time
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")
# Assumed per-minute request quota for the embedding model; adjust to your project's quota.
multimodalembedding_requests_per_minute=120
batch_size=multimodalembedding_requests_per_minute
seconds_per_request = 60.0 / multimodalembedding_requests_per_minute

# Process the product catalog in batches of `batch_size` products
for batch_start in tqdm.tqdm(range(0, len(products_slice), batch_size), desc="Processing product catalog"):
    batch_products = products_slice[batch_start:batch_start + batch_size]

    batch_started_at = time.perf_counter()
    requests_made = 0  # embedding requests actually sent for this batch

    for product in batch_products:
        image_name = product['image'].rsplit("/", 1)[-1]
        # A product whose image is already on disk was handled by an earlier run: skip it.
        if os.path.exists(f"product_images/{image_name}"):
            continue

        # Work on a copy so the catalog entry stays untouched; re-running the cell
        # must not prefix the name onto the description a second time.
        document = dict(product)
        document["_id"] = product["sku"]
        document["description"] = f'{product["name"]}. {product["description"]}'
        document["main_category"] = product["category"][0]

        image_path = download_image(product['image'])
        if image_path is not None:
            img = Image.load_from_file(image_path)
            embeddings = model.get_embeddings(image=img, contextual_text=document['description'])
        else:
            # No image available: embed the text only.
            embeddings = model.get_embeddings(contextual_text=document['description'])
        requests_made += 1

        # The same document is stored once per collection, each time with the
        # matching vector in "$vector".
        document["$vector"] = embeddings.text_embedding
        insert_document(collection_descriptions, document)
        if image_path is not None:
            document["$vector"] = embeddings.image_embedding
            insert_document(collection_images, document)

    # Stay under the requests-per-minute quota: wait only as long as the requests
    # actually sent in this batch require.
    elapsed = time.perf_counter() - batch_started_at
    minimum_elapsed = requests_made * seconds_per_request
    if elapsed < minimum_elapsed:
        time.sleep(minimum_elapsed - elapsed)


Processing product catalog:   0%|          | 0/84 [00:00<?, ?it/s]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:   1%|          | 1/84 [00:01<02:12,  1.59s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:   2%|▏         | 2/84 [00:02<01:44,  1.27s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:   5%|▍         | 4/84 [00:03<01:03,  1.25it/s]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:   7%|▋         | 6/84 [00:04<00:53,  1.46it/s]

File not found at the specified URL.


Processing product catalog:   8%|▊         | 7/84 [00:05<00:50,  1.53it/s]

File not found at the specified URL.


Processing product catalog:  10%|▉         | 8/84 [00:05<00:47,  1.59it/s]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  11%|█         | 9/84 [00:08<01:26,  1.15s/it]

File not found at the specified URL.


Processing product catalog:  12%|█▏        | 10/84 [00:08<01:11,  1.04it/s]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  13%|█▎        | 11/84 [00:10<01:21,  1.11s/it]

File not found at the specified URL.


Processing product catalog:  14%|█▍        | 12/84 [00:10<01:07,  1.06it/s]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  15%|█▌        | 13/84 [00:12<01:17,  1.10s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  17%|█▋        | 14/84 [00:13<01:23,  1.20s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  18%|█▊        | 15/84 [00:15<01:32,  1.34s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  19%|█▉        | 16/84 [00:17<01:36,  1.42s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  20%|██        | 17/84 [00:19<01:49,  1.64s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  21%|██▏       | 18/84 [00:20<01:36,  1.46s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  23%|██▎       | 19/84 [00:23<02:03,  1.90s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  26%|██▌       | 22/84 [00:36<03:22,  3.26s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  27%|██▋       | 23/84 [00:40<03:34,  3.52s/it]

An error occurred: 504 Server Error: Gateway Time-out for url: http://img.bbystatic.com/BestBuy_US/images/pac/products/1308/1308476184/1308476184_sa.jpg
File not found at the specified URL.


Processing product catalog:  30%|██▉       | 25/84 [04:19<52:07, 53.01s/it]

File not found at the specified URL.


Processing product catalog:  32%|███▏      | 27/84 [09:48<1:39:58, 105.23s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  33%|███▎      | 28/84 [12:26<1:52:04, 120.09s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  36%|███▌      | 30/84 [19:16<2:28:31, 165.03s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  37%|███▋      | 31/84 [21:24<2:16:13, 154.22s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  38%|███▊      | 32/84 [24:18<2:18:29, 159.80s/it]

File not found at the specified URL.


Processing product catalog:  39%|███▉      | 33/84 [27:06<2:17:57, 162.30s/it]

File not found at the specified URL.


Processing product catalog:  40%|████      | 34/84 [29:37<2:12:33, 159.07s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  42%|████▏     | 35/84 [32:24<2:11:50, 161.43s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  43%|████▎     | 36/84 [35:09<2:09:54, 162.39s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  44%|████▍     | 37/84 [37:42<2:04:55, 159.47s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  45%|████▌     | 38/84 [39:42<1:53:19, 147.82s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  46%|████▋     | 39/84 [43:44<2:12:04, 176.11s/it]

File not found at the specified URL.


Processing product catalog:  48%|████▊     | 40/84 [46:57<2:12:48, 181.10s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  49%|████▉     | 41/84 [49:32<2:04:06, 173.18s/it]

File not found at the specified URL.


Processing product catalog:  50%|█████     | 42/84 [53:22<2:13:15, 190.36s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  51%|█████     | 43/84 [55:34<1:58:05, 172.82s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  52%|█████▏    | 44/84 [57:55<1:48:55, 163.39s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  54%|█████▎    | 45/84 [1:00:45<1:47:18, 165.09s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  55%|█████▍    | 46/84 [1:02:52<1:37:26, 153.84s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  56%|█████▌    | 47/84 [1:05:46<1:38:38, 159.96s/it]

File not found at the specified URL.


Processing product catalog:  57%|█████▋    | 48/84 [1:09:28<1:47:07, 178.55s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  58%|█████▊    | 49/84 [1:12:42<1:46:50, 183.16s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  61%|██████    | 51/84 [1:18:26<1:40:44, 183.15s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  62%|██████▏   | 52/84 [1:21:44<1:40:04, 187.64s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  63%|██████▎   | 53/84 [1:23:52<1:27:37, 169.60s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  64%|██████▍   | 54/84 [1:26:25<1:22:26, 164.89s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  65%|██████▌   | 55/84 [1:30:05<1:27:36, 181.26s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  70%|███████   | 59/84 [1:41:05<1:10:46, 169.87s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  71%|███████▏  | 60/84 [1:44:57<1:15:23, 188.49s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  73%|███████▎  | 61/84 [1:47:10<1:05:53, 171.90s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  74%|███████▍  | 62/84 [1:49:32<59:45, 163.00s/it]  

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  75%|███████▌  | 63/84 [1:51:56<55:04, 157.38s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  76%|███████▌  | 64/84 [1:54:49<53:58, 161.94s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  77%|███████▋  | 65/84 [1:57:19<50:11, 158.48s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  79%|███████▊  | 66/84 [1:59:35<45:31, 151.74s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  80%|███████▉  | 67/84 [2:02:39<45:42, 161.34s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  81%|████████  | 68/84 [2:05:34<44:04, 165.28s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  82%|████████▏ | 69/84 [2:08:59<44:21, 177.46s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  86%|████████▌ | 72/84 [2:16:55<33:03, 165.33s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  87%|████████▋ | 73/84 [2:19:58<31:16, 170.56s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  95%|█████████▌| 80/84 [2:40:21<11:29, 172.34s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  96%|█████████▋| 81/84 [2:41:36<07:09, 143.19s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  98%|█████████▊| 82/84 [2:44:50<05:16, 158.19s/it]

File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.
File not found at the specified URL.


Processing product catalog:  99%|█████████▉| 83/84 [2:47:40<02:41, 161.86s/it]

File not found at the specified URL.
File not found at the specified URL.


Processing product catalog: 100%|██████████| 84/84 [2:49:05<00:00, 120.78s/it]


## Experiment Querying the Database
We'll run a couple of queries, using text and an image as input.

In [None]:
import json
from vertexai.preview.vision_models import MultiModalEmbeddingModel, Image
from langchain.schema.messages import HumanMessage

# Load the multimodal embedding model used to embed the query.
model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

# Embed the query image (with empty contextual text).
img = Image.load_from_file('2877554_sa.jpg')
embeddings = model.get_embeddings(image=img, contextual_text="")

# Look up the three closest entries in the image collection by image embedding.
documents = collection_images.vector_find(embeddings.image_embedding, limit=3)

# Render the matches as comma-separated lines under a header row.
csv_rows = ["name, image, price, url\n"]
for doc in documents:
  csv_rows.append(f"{doc['name']}, {doc['image']}, {doc['price']}, {doc['url']},\n")
related_products_csv = "".join(csv_rows)
print(related_products_csv)

name, image, price, url
Griffin Technology - GuitarConnect Cable - Black, http://img.bbystatic.com/BestBuy_US/images/products/1114/1114103_sa.jpg, 29.99, http://www.bestbuy.com/site/griffin-technology-guitarconnect-cable-black/1114103.p?id=1219470055093&skuId=1114103&cmp=RMXCC,
Blue Microphones - Mo-Fi Over-the-Ear Headphones - Gray, http://img.bbystatic.com/BestBuy_US/images/products/1095/1095039_sa.jpg, 349.98, http://www.bestbuy.com/site/blue-microphones-mo-fi-over-the-ear-headphones-gray/1095039.p?id=1219469043846&skuId=1095039&cmp=RMXCC,
Bell'O - TV Stand for Flat-Panel TVs Up to 52" - Cherry, http://img.bbystatic.com/BestBuy_US/images/products/1111/1111949_sa.jpg, 349.99, http://www.bestbuy.com/site/bello-tv-stand-for-flat-panel-tvs-up-to-52-cherry/1111949.p?id=1218220770595&skuId=1111949&cmp=RMXCC,



In [None]:
# Embed a free-text product name as the query.
embeddings = model.get_embeddings(contextual_text="AudioQuest - Niagara 1200 Low-Z Power Conditioner")

# Look up the three closest entries in the descriptions collection by text embedding.
documents = collection_descriptions.vector_find(embeddings.text_embedding, limit=3)

# Render the matches as comma-separated lines under a header row.
csv_rows = ["name, image, price, url\n"]
for doc in documents:
  csv_rows.append(f"{doc['name']}, {doc['image']}, {doc['price']}, {doc['url']},\n")
related_products_csv = "".join(csv_rows)
print(related_products_csv)

name, image, price, url
Blueair - SmokeStop Filter for Blueair 200 and 300 Series Air Purifiers - Black, http://img.bbystatic.com/BestBuy_US/images/products/1197/1197082_sa.jpg, 99.99, http://www.bestbuy.com/site/blueair-smokestop-filter-for-blueair-200-and-300-series-air-purifiers-black/1197082.p?id=1218231508373&skuId=1197082CC,
Pro Tools Tier 2 Audio Plug-In for PC and Mac Activation Card - Windows|Mac, http://img.bbystatic.com/BestBuy_US/images/products/1003/1003278_sa.jpg, 299, http://www.bestbuy.com/site/pro-tools-tier-2-audio-plug-in-for-pc-and-mac-activation-card-windowsmac/1003278.p?id=1219460772921&skuId=1003278CC,
Acoustic Rock TrackPak - Mac, http://img.bbystatic.com/BestBuy_US/images/products/1003/1003373_sa.jpg, 29.99, http://www.bestbuy.com/site/acoustic-rock-trackpak-mac/1003373.p?id=1219460754207&skuId=1003373CC,

