# Import Data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Gen_AI/marketing_sample_for_amazon_com-ecommerce__20200101_20200131__10k_data.csv')
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-...,,,,,,,Y,,
1,66d49bbed043f5be260fa9f7fbff5957,"Electronic Snap Circuits Mini Kits Classpack, ...",,,Toys & Games | Learning & Education | Science ...,,,$99.95,,55324.0,...,https://www.amazon.com/Electronic-Circuits-Cla...,,,,,,,Y,,
2,2c55cae269aebf53838484b0d7dd931a,3Doodler Create Flexy 3D Printing Filament Ref...,,,Toys & Games | Arts & Crafts | Craft Kits,,,$34.99,,,...,https://www.amazon.com/3Doodler-Plastic-Innova...,,,,,,,Y,,
3,18018b6bc416dab347b1b7db79994afa,Guillow Airplane Design Studio with Travel Cas...,,,Toys & Games | Hobbies | Models & Model Kits |...,,,$28.91,,142.0,...,https://www.amazon.com/Guillow-Airplane-Design...,,,,,,,Y,,
4,e04b990e95bf73bbe6a3fa09785d7cd0,Woodstock- Collage 500 pc Puzzle,,,Toys & Games | Puzzles | Jigsaw Puzzles,,,$17.49,,62151.0,...,https://www.amazon.com/Woodstock-Collage-500-p...,,,,,,,Y,,


In [None]:
# missing value overview
missing_percent = df.isnull().mean() * 100

for col, pct in missing_percent.items():
    print(f"{col}: {pct:.2f}% missing")


Uniq Id: 0.00% missing
Product Name: 0.00% missing
Brand Name: 100.00% missing
Asin: 100.00% missing
Category: 8.30% missing
Upc Ean Code: 99.66% missing
List Price: 100.00% missing
Selling Price: 1.07% missing
Quantity: 100.00% missing
Model Number: 17.72% missing
About Product: 2.73% missing
Product Specification: 16.32% missing
Technical Details: 7.90% missing
Shipping Weight: 11.38% missing
Product Dimensions: 95.21% missing
Image: 0.00% missing
Variants: 75.22% missing
Sku: 100.00% missing
Product Url: 0.00% missing
Stock: 100.00% missing
Product Details: 100.00% missing
Dimensions: 100.00% missing
Color: 100.00% missing
Ingredients: 100.00% missing
Direction To Use: 100.00% missing
Is Amazon Seller: 0.00% missing
Size Quantity Variant: 100.00% missing
Product Description: 100.00% missing


# Data Cleaning

## Numerical Columns

### Shipping Weights

In [None]:
# shipping weight
## define valid pattern
import re

valid_pattern = r'^\s*\d+(\.\d+)?\s*(pound|pounds|ounce|ounces)\s*$'

invalid_rows = df[~df['Shipping Weight'].astype(str).str.fullmatch(valid_pattern, case=False, na=False)]

print("Non-numeric or weird 'Shipping Weight' values:")
print(invalid_rows['Shipping Weight'].unique())

Non-numeric or weird 'Shipping Weight' values:
[nan '. pounds' '1,070 pounds']


In [None]:
## unify values and units
def unify_weight_units(weight_str):
    if not isinstance(weight_str, str):
        return None, None

    weight_str = weight_str.strip().lower().replace(',', '')  # normalize

    # match pattern like: 10.5 pounds or 12 ounces
    match = re.match(r'(\d+(\.\d+)?)\s*(pounds?|ounces?)', weight_str)
    if match:
        value = float(match.group(1))
        unit = match.group(3)
        return value, unit

    return None, None

def convert_shipping_weight(weight_str):
    try:
        value, unit = unify_weight_units(weight_str)
        if value is None or unit is None:
            return None

        if 'pound' in unit:
            return round(value, 4)
        elif 'ounce' in unit:
            return round(value / 16, 4)

        return None
    except:
        return None
df['Shipping_Weight_Lbs'] = df['Shipping Weight'].apply(convert_shipping_weight)

### Selling Prices

In [None]:
# Selling Prices
## define valid pattern
valid_price_pattern = r'^\d+(\.\d{2})$' # e.g. 12.00, 12.89
invalid_price_rows = df[~df['Selling Price'].astype(str).str.fullmatch(valid_price_pattern, na=False)]

for val in invalid_price_rows['Selling Price'].unique():
    print(repr(val))

'$237.68'
'$99.95'
'$34.99'
'$28.91'
'$17.49'
'$18.66'
'$29.12'
'$97.68'
'$12.99'
'$38.49'
'$18.16'
'$84.61'
'$33.92'
'$14.99'
'$4.99'
'$34.39'
'$12.88'
'$117.26'
'$9.39'
'$17.85'
'$27.50'
'$10.99'
'$159.99'
'$12.63'
'$11.88'
'$74.99 - $249.99'
'$26.99'
'$34.27'
'$18.70'
'$14.95'
'$6.91'
'$5.99'
'$36.37'
'$60.87'
'$21.07'
'$73.58'
'$8.44'
'$6.45'
'$3.12'
'$6.94 $ 6 . 94'
'$15.45'
'$16.01'
'$19.94'
'$22.38'
'$35.00'
'$37.98'
'$9.93'
'$11.99'
'$11.12'
'$7.99'
'$19.50'
'$10.36'
'$3.06'
'$6.99'
'$6.11'
'$19.98'
'$24.99'
'$38.33'
'$16.43'
'$24.53'
'$19.99'
'$11.42'
'$17.75'
'$14.73'
'$28.74'
'$114.57'
'$12.29'
'$9.27'
'$22.99'
'$14.41'
'$4.30'
'$34.49'
'$30.36'
'$11.00'
'$46.99'
'$14.98'
'$18.93'
'$29.99'
'$14.01'
'$186.06'
'$129.95'
'$10.49'
'$39.90'
'$23.74'
'$5.58'
'$25.96'
'$15.39'
'$8.53'
'$17.57'
'$3.64'
'$38.55'
'$6.47'
'$9.97'
'$9.22'
'$107.86'
'$22.00'
'$15.99'
'$30.82'
'$13.71'
'$22.28'
'$12.50'
'$105.81'
'$74.96'
'$17.40'
'$25.95'
'$99.99'
'$36.91'
'$11.24'
'$18.59'
'$15.98'
'$18

In [None]:
## clean selling price
import re
import numpy as np

def clean_selling_price(price_str):
    try:
        if not isinstance(price_str, str):
            return None

        # Remove unwanted characters and normalize
        cleaned = price_str.replace(',', '').replace('$', '')

        # Find all price-like numbers (e.g., 12.99, 499.00)
        price_matches = re.findall(r'\d+\.\d{2}', cleaned)

        # Convert to float and take the minimum if multiple found
        prices = [float(p) for p in price_matches]
        if prices:
            return round(min(prices), 2)
        else:
            return None
    except:
        return None

df['Selling_Price'] = df['Selling Price'].apply(clean_selling_price)
invalid_price_rows = df[df['Selling_Price'].isnull() & df['Selling Price'].notnull()]
for val in invalid_price_rows['Selling Price'].unique():
    print(repr(val))

'Total price:'
'$ 12 63'
'& FREE Shipping. Details'
'Currently unavailable.'
'from 2 sellers'
'from 1 seller'
'$ 59 99'
'from 4 sellers'
'$ 14 94'
'from 7 sellers'
'from 8 sellers'


In [None]:
## further cleaning selling price
def fix_remaining_prices(price_str):
    try:
        if not isinstance(price_str, str):
            return None

        # skip irrelevant text
        if any(x in price_str.lower() for x in [
            'currently unavailable', 'from', 'total price', 'free shipping'
        ]):
            return None

        # fix things like "$ 14 94" → "14.94"
        fixed_format = re.sub(r'\$?\s*(\d+)\s+(\d{2})', r'\1.\2', price_str)
        fixed_format = fixed_format.replace('$', '').replace(',', '')

        # extract valid float values
        price_matches = re.findall(r'\d+\.\d{2}', fixed_format)
        prices = [float(p) for p in price_matches]
        if prices:
            return round(min(prices), 2)
        return None
    except:
        return None


df['Selling_Price'] = df['Selling Price'].apply(clean_selling_price)

mask_fix = df['Selling_Price'].isnull() & df['Selling Price'].notnull()
df.loc[mask_fix, 'Selling_Price'] = df.loc[mask_fix, 'Selling Price'].apply(fix_remaining_prices)

remaining_invalids = df[df['Selling_Price'].isnull() & df['Selling Price'].notnull()]
print("Remaining problematic entries:")
print(remaining_invalids['Selling Price'].unique())

Remaining problematic entries:
['Total price:' '& FREE Shipping. Details' 'Currently unavailable.'
 'from 2 sellers' 'from 1 seller' 'from 4 sellers' 'from 7 sellers'
 'from 8 sellers']


In [None]:
remaining_invalids = df[~df['Selling_Price'].apply(lambda x: isinstance(x, float) or x == 'NA')]

print(f"Remaining unexpected entries: {len(remaining_invalids)}")
print(remaining_invalids[['Selling Price', 'Selling_Price']].head())

Remaining unexpected entries: 0
Empty DataFrame
Columns: [Selling Price, Selling_Price]
Index: []


In [None]:
df['Selling_Price'] = df['Selling_Price'].fillna('NA')
print("Count of 'NA' strings:", (df['Selling_Price'] == 'NA').sum())

Count of 'NA' strings: 162


In [None]:
df.drop('Selling Price', axis=1, inplace=True)

### is_amazon_seller

In [None]:
# clean is_amazon_seller
## examine is_amazon_seller
print("Unique values (raw):")
print(df['Is Amazon Seller'].unique())

Unique values (raw):
['Y' 'N']


In [None]:
## convert to 0 or 1
df['Is_Amazon_Seller_Flag'] = df['Is Amazon Seller'].apply(
    lambda x: 1 if str(x).strip().upper() == 'Y' else 0
)
print("Value counts after conversion:")
print(df['Is_Amazon_Seller_Flag'].value_counts())

Value counts after conversion:
Is_Amazon_Seller_Flag
1    9723
0     279
Name: count, dtype: int64


In [None]:
df.drop('Is Amazon Seller', axis=1, inplace=True)

## Non-numerical Columns

In [None]:
import re
import unicodedata

# general cleaner applied to all fields
def base_clean(text):
    if not isinstance(text, str):
        return ""

    # normalize unicode
    text = unicodedata.normalize('NFKC', text)

    # replace newline/tab with space
    text = text.replace('\n', ' ').replace('\t', ' ')

    # remove non-printable characters
    text = ''.join(c for c in text if c.isprintable())

    # collapse multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [None]:
def clean_product_name(text):
    text = base_clean(text)
    # replace accented characters (NFKD)
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c))
    text = text.replace('’', "'").replace('–', '-')
    text = re.sub(r'\s*\|\s*', '. ', text)
    return text.strip()


def clean_about(text):
    text = base_clean(text)
    text = re.sub(r'Make sure this fits by entering your model number\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove emojis
    text = re.sub(r'\s*\|\s*', '. ', text)
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)
    text = re.sub(r'\.{2,}', '.', text)
    if text and text[-1] not in '.!?':
        text += '.'
    return text.strip()


def clean_spec(text):
    text = base_clean(text)
    text = re.sub(r'\(.*?View shipping rates and policies.*?\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'ASIN: \w+', '', text)
    text = re.sub(r'Item model number: \w+', '', text)
    text = re.sub(r'#\d+ in .*?(?=\.|$)', '', text)
    text = re.sub(r'\s*\|\s*', '. ', text)
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)
    if text and text[-1] not in '.!?':
        text += '.'
    return text.strip()


def clean_technical(text):
    text = base_clean(text)
    text = re.sub(r'Go to your orders.*?Ship it!', '', text, flags=re.IGNORECASE)
    text = re.sub(r'From the Manufacturer.*?\.', '', text, flags=re.IGNORECASE)
    text = re.sub(r'show up to \d+ reviews by default', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)
    if text and text[-1] not in '.!?':
        text += '.'
    return text.strip()


def format_category(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    levels = [level.strip() for level in text.split('|') if level.strip()]
    if len(levels) == 1:
        return f"It is part of the {levels[0]} category."
    else:
        category_path = ", ".join(levels)
        return f"This product appears in the following categories: {category_path}."


In [None]:
df['Product_Name'] = df['Product Name'].apply(clean_product_name)
df['Product_Category'] = df['Category'].apply(format_category)
df['About_Product'] = df['About Product'].apply(clean_about)
df['Product_Specification'] = df['Product Specification'].apply(clean_spec)
df['Technical_Details'] = df['Technical Details'].apply(clean_technical)

In [None]:
## scan each one for non-ASCII or non-printable characters
def has_weird_characters(text):
    try:
        return bool(re.search(r'[^\x20-\x7E]', text))
    except:
        return False
columns_to_check = [
    'Product_Name',
    'Product_Category',
    'About_Product',
    'Product_Specification',
    'Technical_Details'
]

for col in columns_to_check:
    weird_rows = df[df[col].apply(has_weird_characters)]
    print(f"\nColumn: {col}")
    print(f"Weird rows: {len(weird_rows)}")
    if not weird_rows.empty:
        print("Examples:")
        print(weird_rows[col].dropna().unique()[:5])  # show first 5 unique weird entries


Column: Product_Name
Weird rows: 85
Examples:
['SpongeBob SquarePants, Masterpiece Memes, 8” Collectible Vinyl Figure, Handsome Squidward (Closed Eyes)'
 'Melissa & Doug Kids Furniture, Wooden Table & 4 Chairs (White Table, Pastel Pink, Yellow, Green, Blue Chairs, 20.5” H x 23.5” W x 20” L, Great Girls and Boys - Best for 3, 4, 5, 6, 7 and 8 Year Olds)'
 'Melissa & Doug Mine to Love Jordan 12” Baby Doll (Includes Romper, Cap, Pacifier, Great Gift for Girls and Boys - Best for Babies, 18 Month Olds, 24 Month Olds, 1 and 2 Year Olds )'
 'Click N\' Play Jumbo 10.5”" Animal Figurine Playset, Assorted 5Piece Realistically Designed Wild Zoo, Safari, Jungle Plastic Animals for Kids & Toddlers'
 'Beistle 50354 Party Supplies, 8" x 51⁄2" x 51⁄2", Multicolored']

Column: Product_Category
Weird rows: 378
Examples:
['This product appears in the following categories: Home & Kitchen, Home Décor, Window Treatments, Window Stickers & Films, Window Films.'
 'This product appears in the following categ

## Drop unnecessary features

In [None]:
# drop original columns
df.drop('Product Name', axis=1, inplace=True)
df.drop('Category', axis=1, inplace=True)
df.drop('About Product', axis=1, inplace=True)
df.drop('Product Specification', axis=1, inplace=True)
df.drop('Technical Details', axis=1, inplace=True)

In [None]:
# drop columns with 100% missing values
df = df.dropna(axis=1, how='all')
df.shape

(10002, 16)

In [None]:
df.columns

Index(['Uniq Id', 'Upc Ean Code', 'Model Number', 'Shipping Weight',
       'Product Dimensions', 'Image', 'Variants', 'Product Url',
       'Shipping_Weight_Lbs', 'Selling_Price', 'Is_Amazon_Seller_Flag',
       'Product_Name', 'Product_Category', 'About_Product',
       'Product_Specification', 'Technical_Details'],
      dtype='object')

In [None]:
df.drop('Uniq Id', axis=1, inplace=True)
df.drop('Upc Ean Code', axis=1, inplace=True)
df.drop('Shipping Weight', axis=1, inplace=True)
df.drop('Product Dimensions', axis=1, inplace=True)

df.drop('Model Number', axis=1, inplace=True)
df.drop('Variants', axis=1, inplace=True)
df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Uniq Id', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Upc Ean Code', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Shipping Weight', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('Product Dimensions', ax

(10002, 10)

# Text Embeddings

## Generate a single unified description


- Since CLIP only accepts text and image inputs, numerical features like price or weight need to be embedded into the text.

- Industry experience and research (such as Alibaba's UniVL and Amazon's SCANN) suggest that natural language formatting is more effective.  
  For example, instead of writing:
  
  `"Product: iPhone | Price: $699"`  
  it's better to say:  
  `"This is the iPhone. It costs $699 and belongs to the Phone category."`

- Natural sentences like this are easier for both CLIP and language models to understand.

- Models trained on large web corpora are more aligned with descriptive, human-style phrasing than rigid, structured formats.

- This method is also widely used in RAG pipelines built with frameworks like LangChain and Hugging Face.


In [None]:
# create unified CLIP text input
def build_natural_clip_text(row):
  """
  This function builds a natural language product description by combining:
  Product name (always included)
  Category → “his product falls under the category.”
  Price → “It is priced at $X.”
  Weight → “It weighs around X pounds.”
  About Product → added directly
  Specs & Technical Details → added as full sentences if present
  """
  name = row['Product_Name']
  category = f"{row['Product_Category']}." if row['Product_Category'] else ""
  price = f"It is priced at ${row['Selling_Price']}." if isinstance(row['Selling_Price'], float) else ""
  weight = f"It weighs around {row['Shipping_Weight_Lbs']} pounds." if pd.notnull(row['Shipping_Weight_Lbs']) else ""
  about = row['About_Product'] or ""
  spec = f"Specifications include: {row['Product_Specification']}." if row['Product_Specification'] else ""
  detail = f"Additional details: {row['Technical_Details']}." if row['Technical_Details'] else ""

  return f"{name}. {category} {price} {weight} {about} {spec} {detail}".strip()

df['CLIP_Text'] = df.apply(build_natural_clip_text, axis=1)

In [None]:
df['CLIP_Text'][0]

'DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete. This product appears in the following categories: Sports & Outdoors, Outdoor Recreation, Skates, Skateboards & Scooters, Skateboarding, Standard Skateboards & Longboards, Longboards.. It is priced at $237.68. It weighs around 10.7 pounds. . RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern thats second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you wont believe. These boards combine fiberglass, epoxy, HD plastic and bamboo to create a perfect blend of performance and strength. INSPIRED BY THE NORTHWEST: Our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills, waves, beaches & mountains all around our headquarters in t

## Text Embedding from CLIP

In [None]:
turn_text_embeddings_on = False

In [None]:
if turn_text_embeddings_on:
  import torch
  import open_clip
  from PIL import Image
  from torchvision import transforms
  from tqdm import tqdm
  # initialize CLIP
  model_name = "ViT-B-32"
  pretrained = "openai"

  model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
  tokenizer = open_clip.get_tokenizer(model_name)

  # inference mode
  model.eval()
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model = model.to(device)

  # text-embedding function
  @torch.no_grad()
  def embed_texts(text_list, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size)):
        # batch
        batch_text = text_list[i:i+batch_size]
        # tokenize
        tokens = tokenizer(batch_text).to(device)
        # embed, return 512-dimensional vector
        embeddings = model.encode_text(tokens).cpu()
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings)

In [None]:
if turn_text_embeddings_on:
  # tokenize and embed text
  @torch.no_grad()
  def embed_texts(text_list, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size)):
        # batch
        batch_text = text_list[i:i+batch_size]
        # tokenize
        tokens = tokenizer(batch_text).to(device)
        # embed, return 512-dimensional vector
        embeddings = model.encode_text(tokens).cpu()
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings)

  text_embeddings = embed_texts(df['CLIP_Text'].tolist())

100%|██████████| 313/313 [00:16<00:00, 18.96it/s]


In [None]:
# save embeddings
if turn_text_embeddings_on:
  save_path = "/content/drive/MyDrive/Gen_AI/text_embeddings.npy"
  np.save(save_path, text_embeddings.numpy())

# Image Embeddings

## Examine Invalud URL

In [None]:
turn_image_embeddings_on = False

In [None]:
if turn_image_embeddings_on:
  import requests
  from concurrent.futures import ThreadPoolExecutor, as_completed

  def is_url_valid(index_url_pair, timeout=5):
    i, url = index_url_pair
    main_url = url.split('|')[0].strip()
    try:
        response = requests.get(main_url, timeout=timeout)
        if response.status_code != 200:
            return i
        if "image" not in response.headers.get("Content-Type", "").lower():
            return i
    except requests.exceptions.RequestException:
        return i
    return None

  def check_urls_multithreaded(image_urls, timeout=5, max_workers=50):
    invalid_indices = []
    index_url_pairs = list(enumerate(image_urls))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(is_url_valid, pair, timeout) for pair in index_url_pairs]

        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                invalid_indices.append(result)

    return len(invalid_indices), invalid_indices

In [None]:
if turn_image_embeddings_on:
  image_urls = df['Image'].tolist()
  num_invalid, invalid_indices = check_urls_multithreaded(image_urls)

In [None]:
if turn_image_embeddings_on:
  num_invalid

27

## Generate image embeddings

In [None]:
# get dimensions
if turn_image_embeddings_on:
  with torch.no_grad():
    dummy = torch.zeros(1, 3, 224, 224).to(device)
    embedding_dim = model.encode_image(dummy).shape[1]
  embedding_dim

512

In [None]:
# embedding functions
if turn_image_embeddings_on:
  def load_image_from_url(url):
    """
    for extracting+preprocess images, output a tensor for each image_url
    """
    try:
      # send request
      response = requests.get(url, timeout=5)
      response.raise_for_status()
      # open image
      img = Image.open(BytesIO(response.content)).convert("RGB")

      # apply CLIP preprocessing tools, output a tensor for each image [3, 224, 224]
      preprocessed_img = preprocess(img)
      return preprocessed_img
    except:
      return None


  @torch.no_grad()
  def embed_images(image_urls, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(image_urls), batch_size)):
      # batch selection
      batch_urls = image_urls[i:i+batch_size]
      batch_imgs = [] # storing individual tensors
      success_flags = []

      # loop through each url
      for url in batch_urls:
        img_tensor = load_image_from_url(url)
        if img_tensor is not None:
          batch_imgs.append(img_tensor)
          success_flags.append(True)
        else:
          batch_imgs.append(None)
          success_flags.append(False)

      # embed valid images
      valid_imgs = [img for img in batch_imgs if img is not None]

      if valid_imgs:
        # stack to a single batch
        stack_batch = torch.stack(valid_imgs).to(device)
        # embed valid images
        img_embeddings = model.encode_image(stack_batch).cpu()
      else:
        img_embeddings = []

      idx=0
      for flag in success_flags:
        if flag:
          all_embeddings.append(img_embeddings[idx].cpu())
          idx+=1
        else:
          all_embeddings.append(torch.zeros(embedding_dim).cpu())

    return torch.stack(all_embeddings) # (10002, embedding_dim)

In [None]:
# embedding process
if turn_image_embeddings_on:
  image_urls = df['Image'].tolist()
  image_embeddings = embed_images(image_urls)

100%|██████████| 313/313 [16:17<00:00,  3.12s/it]


In [None]:
# save embeddings
if turn_image_embeddings_on:
  save_path = "/content/drive/MyDrive/Gen_AI/image_embeddings.npy"
  np.save(save_path, image_embeddings.numpy())

# Store Embeddings in FAISS

| Query Type                 | Input Required                                                                              | Used Embedding Model                                        | Search Index     | Use Case Example                                          |
| -------------------------- | ------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------- | --------------------------------------------------------- |
| **Text → Product**         | User enters a product-related question or description (e.g., "lightweight kids skateboard") | `CLIP.encode_text()`                                        | `text_index`     | Search products using natural language                    |
| **Image → Product**        | User uploads a product image                                                                | `CLIP.encode_image()`                                       | `image_index`    | Find similar-looking products from catalog                |
| **Text + Image → Product** | User enters text **and** provides an image                                                  | `CLIP.encode_text()` + `CLIP.encode_image()` (concatenated) | `combined_index` | Retrieve products that match both visual and textual cues |


In [None]:
faiss_on = False

In [None]:
if faiss_on:
  import faiss
  # load embeddings
  if not turn_image_embeddings_on and not turn_text_embeddings_on:
    text_embeddings = torch.tensor(np.load("/content/drive/MyDrive/Gen_AI/text_embeddings.npy"))
    image_embeddings = torch.tensor(np.load("/content/drive/MyDrive/Gen_AI/image_embeddings.npy"))

  text_embeddings = text_embeddings.numpy().astype('float32')
  image_embeddings = image_embeddings.numpy().astype('float32')
  combined_embeddings = np.concatenate([text_embeddings, image_embeddings], axis=1)

  # normalize for cosine similarity
  faiss.normalize_L2(text_embeddings)
  faiss.normalize_L2(image_embeddings)
  faiss.normalize_L2(combined_embeddings)

  # text index
  text_index = faiss.IndexFlatL2(512)
  text_index.add(text_embeddings)

  # image index
  image_index = faiss.IndexFlatL2(512)
  image_index.add(image_embeddings)

  # combined index
  combined_index = faiss.IndexFlatL2(1024)
  combined_index.add(combined_embeddings)

# Store Embedding in Qdrant

In [None]:
qdrant_on = False

In [None]:
if qdrant_on:
  from qdrant_client import QdrantClient
  from qdrant_client.models import PointStruct, VectorParams, Distance
  from sklearn.preprocessing import normalize

  client = QdrantClient(
      url="https://7e8950b7-f7cd-476b-9fe2-cfbabcc676d4.us-east4-0.gcp.cloud.qdrant.io:6333",
      api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.5-6feBUSirUh0qNrxH8ou2clwuKjY1e_lB_jE4DyUjA"
  )

  import torch
  from sklearn.preprocessing import normalize

  # load embeddings
  text_embeddings = torch.tensor(np.load("/content/drive/MyDrive/Gen_AI/text_embeddings.npy"))
  image_embeddings = torch.tensor(np.load("/content/drive/MyDrive/Gen_AI/image_embeddings.npy"))

  # convert to float for normalization
  text_embeddings = text_embeddings.numpy().astype('float32')
  image_embeddings = image_embeddings.numpy().astype('float32')
  combined_embeddings = np.concatenate([text_embeddings, image_embeddings], axis=1)

  # normalize for cosine similarity
  text_embeddings = normalize(text_embeddings, axis=1)
  image_embeddings = normalize(image_embeddings, axis=1)
  combined_embeddings = normalize(combined_embeddings, axis=1)

  # build payload
  def build_payload(row, index):
    return {

        "Product_Name": row["Product_Name"],
        "Product_Category": row["Product_Category"],
        "Selling_Price": row["Selling_Price"],
        "About_Product": row["About_Product"],
        "Product_Specification": row["Product_Specification"],
        "Technical_Details": row["Technical_Details"],
        "Image_URL": row["Image"],
        "index": index
    }

  from qdrant_client.models import CollectionStatus

  # upload embedding function
  def upload_embeddings(name, vectors, dim, df):
    if client.collection_exists(name):
        client.delete_collection(name)


    client.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=dim, distance=Distance.COSINE)
    )


    points = [
        PointStruct(
            id=i,
            vector=vectors[i].tolist(),
            payload=build_payload(df.iloc[i], i)
        ) for i in range(len(vectors))
    ]

    client.upload_points(collection_name=name, points=points)

  # upload
  upload_embeddings("text_products", text_embeddings, 512, df)
  upload_embeddings("image_products", image_embeddings, 512, df)
  upload_embeddings("combined_products", combined_embeddings, 1024, df)


In [None]:
# test if work properly
from qdrant_client.models import SampleQuery

hits = client.query_points(
    collection_name="combined_products",
    query=SampleQuery(sample="random"),
    limit=5,
    with_payload=True
)

for point in hits.points:
    print(f"Random Product: {point.payload['Product_Name']}")

Random Product: Megahouse Gintama: Ochatomo Galson & Mysterious Visitor Mini Figure Set
Random Product: Rubie's Costume Company Skull Print Tie for Pet
Random Product: Redcat Racing SH .28 Drill Start Backing Plate
Random Product: ECR4Kids Birch Multi-Section Storage Cabinet with 15 Scoop Front Bins, Assorted Colors
Random Product: Tiger Tribe Forest Fairies Colouring Set


In [None]:
from IPython.display import Image, display

for point in hits.points:
    name = point.payload["Product_Name"]
    image_url = point.payload["Image_URL"].split("|")[0]
    print(f"{name}")
    display(Image(url=image_url))

Megahouse Gintama: Ochatomo Galson & Mysterious Visitor Mini Figure Set


Rubie's Costume Company Skull Print Tie for Pet


Redcat Racing SH .28 Drill Start Backing Plate


ECR4Kids Birch Multi-Section Storage Cabinet with 15 Scoop Front Bins, Assorted Colors


Tiger Tribe Forest Fairies Colouring Set


In [None]:
# compare FAISS and Qdrant
compare = False
if compare:

  query_vec = combined_embeddings[0].reshape(1, -1).astype("float32")

  # FAISS
  _, faiss_ids = combined_index.search(query_vec, k=5)

  # Qdrant
  qdrant_hits = client.search(
      collection_name="combined_products",
      query_vector=query_vec.tolist()[0],
      limit=5
  )
  qdrant_ids = [hit.payload["index"] for hit in qdrant_hits]

  print("FAISS top5:", faiss_ids[0])
  print("Qdrant top5:", qdrant_ids)

# Evaluate Retrieval

In [None]:
from qdrant_client.models import Filter
import numpy as np

# query
query_text = "DB Longboards CoreFlex Crossbow 41 Bamboo Fiberglass Longboard Complete"
query_embedding = embed_texts([query_text]).cpu().numpy()[0]  # shape: (512,)
query_embedding = query_embedding / np.linalg.norm(query_embedding)

# retrieve
hits = client.query_points(
    collection_name="text_products",
    query=query_embedding.tolist(),
    limit=10,
    with_payload=True
)
retrieved_indices = [hit.payload['index'] for hit in hits.points]

# evaluate
ground_truth = {0}
def evaluate_retrieval(retrieved, ground_truth, k=5):
    top_k = retrieved[:k]
    correct = sum([1 for idx in top_k if idx in ground_truth])
    accuracy = 1.0 if retrieved[0] in ground_truth else 0.0
    recall = correct / len(ground_truth) if ground_truth else 0.0
    return accuracy, recall

# if the retrieved index contains the ground truth, accuracy = recall = 1
accuracy, recall_at_1 = evaluate_retrieval(retrieved_indices, ground_truth, k=1)
_, recall_at_5 = evaluate_retrieval(retrieved_indices, ground_truth, k=5)
_, recall_at_10 = evaluate_retrieval(retrieved_indices, ground_truth, k=10)

print(f"Accuracy: {accuracy:.2f}")
print(f"Recall@1: {recall_at_1:.2f}")
print(f"Recall@5: {recall_at_5:.2f}")
print(f"Recall@10: {recall_at_10:.2f}")


100%|██████████| 1/1 [00:00<00:00, 96.45it/s]


Accuracy: 1.00
Recall@1: 1.00
Recall@5: 1.00
Recall@10: 1.00


# RAG + LLM

## Load Models and Test Functions

In [None]:
# check vector databse
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sklearn.preprocessing import normalize

client = QdrantClient(
    url="https://7e8950b7-f7cd-476b-9fe2-cfbabcc676d4.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.5-6feBUSirUh0qNrxH8ou2clwuKjY1e_lB_jE4DyUjA"
)
collections = client.get_collections()
print(collections)

collections=[CollectionDescription(name='image_products'), CollectionDescription(name='text_products'), CollectionDescription(name='combined_products')]


In [None]:
# load CLIP
import open_clip
import torch

clip_model_name = "ViT-B-32"
clip_pretrained = "openai"
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(clip_model_name, pretrained=clip_pretrained)
clip_tokenizer = open_clip.get_tokenizer(clip_model_name)

clip_model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = clip_model.to(device)

def embed_query(text):
  tokenized = clip_tokenizer([text]).to(device)
  with torch.no_grad():
      features = clip_model.encode_text(tokenized)
  embedding = features[0].cpu().numpy()
  return embedding / np.linalg.norm(embedding)

from PIL import Image
def embed_image(image_file):
    image = Image.open(image_file).convert("RGB")
    processed = clip_preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = clip_model.encode_image(processed)
    embedding = features[0].cpu().numpy()
    return embedding / np.linalg.norm(embedding)

In [None]:
# load llm
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
login(token='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')

llama_model_id = "meta-llama/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_id)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_model_id,
    torch_dtype=torch.float16,
    device_map="auto"
).eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# generate answer function
def generate_llama_answer(prompt, max_tokens=512):
  inputs = llama_tokenizer(prompt, return_tensors="pt").to(llama_model.device)
  with torch.no_grad():
      outputs = llama_model.generate(
          **inputs,
          max_new_tokens=max_tokens,
          do_sample=True,
          temperature=0.7,
          top_p=0.9
      )
  return llama_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# test if retrieval works
query_text = "wireless headphones"
query_embedding = embed_query(query_text)

hits = client.query_points(
    collection_name="text_products",
    query=query_embedding.tolist(),
    limit=5,
    with_payload=True
)

for i, point in enumerate(hits.points):
    print(f"{i+1}. {point.payload.get('Product_Name', 'N/A')}")

1. Tune Into Environmental Sounds
2. -
3. Little Pretender Walkie Talkies for Kids, 2 Mile Range, 3 Channels, Built in Flash Light
4. Sort It and Store It
5. Crossfire


## LLM Prompt + Answer for Text-only User Query

In [None]:
def build_multimodal_prompt(user_query, hits, max_products=3):
    # Format product context
    context_blocks = []
    for i, h in enumerate(hits[:max_products]):
        p = h.payload
        name = p.get("Product_Name", "Unknown Product")
        price = p.get("Selling_Price", "N/A")
        about = p.get("About_Product", "No description provided.")
        image_url = p.get("Image_URL", "No image available.")

        block = f"""Product {i+1}:
Name: {name}
Price: ${price}
Key Info: {about}
Image: {image_url}"""
        context_blocks.append(block)

    context = "\n\n".join(context_blocks)

    # Few-shot fallback example
    fallback_example = """Example:
User Query: What are the features of the iPhone 15?
Context: No Apple or iPhone product appears in the list.

Answer:
Sorry, I couldn’t find product information related to “iPhone 15” in the provided context. Please try a different query or upload an image.
"""

    # Final prompt with instructions and example
    prompt = f"""You are a multimodal shopping assistant for an e-commerce platform.

A customer has asked a question related to one or more products. You are provided with relevant product information retrieved from a vision-language model. Use **only** this information to answer the query. Do not use any external knowledge or make assumptions.

Instructions:
- Your justifications for your product choice hould only include informations and reasonings that are relevant to the user query.
- If the product in question is not in the context, respond accordingly.
- Do not guess or hallucinate information outside context
- If helpful, you should better include product name and image URL.

{fallback_example}

Context:
{context}

User Query:
{user_query}

Answer:"""

    return prompt


In [None]:
# query
query_text = "I want a lightweight longboard for carving and cruising."

# query embedding
query_embedding = embed_query(query_text)

# get context
raw_result = client.query_points(
    collection_name="text_products",
    query=query_embedding.tolist(),
    limit=5,
    with_payload=True
)
hits = raw_result.points

# get prompt
prompt = build_multimodal_prompt(query_text, hits)

# get answer
answer = generate_llama_answer(prompt)
print(answer)

You are a multimodal shopping assistant for an e-commerce platform.

A customer has asked a question related to one or more products. You are provided with relevant product information retrieved from a vision-language model. Use **only** this information to answer the query. Do not use any external knowledge or make assumptions.

Instructions:
- Your justifications for your product choice hould only include informations and reasonings that are relevant to the user query.
- If the product in question is not in the context, respond accordingly.
- Do not guess or hallucinate information outside context
- If helpful, you should better include product name and image URL.

Example:
User Query: What are the features of the iPhone 15?
Context: No Apple or iPhone product appears in the list.

Answer:
Sorry, I couldn’t find product information related to “iPhone 15” in the provided context. Please try a different query or upload an image.


Context:
Product 1:
Name: SWAGSKATE NG2 A.I.-Powered El

Based on your query, I would recommend the Rayne Longboards Minotaur 34" Double Kick Cruiser Skateboard. It is a lightweight and versatile longboard that is ideal for carving and cruising. With its double kick design, it provides excellent stability and control, making it perfect for transition skating and high-speed alley ripping. Additionally, Rayne Longboards are committed to creating premium longboards and accessories from eco-friendly materials and processes, which aligns with your preference for a lightweight board

## LLM Prompt + Answer for Text+Image User Query

In [None]:
def build_multimodal_prompt_with_image(user_query, hits, image_uploaded=True, max_products=3):
    context_blocks = []
    for i, h in enumerate(hits[:max_products]):
        p = h.payload
        name = p.get("Product_Name", "Unknown Product")
        price = p.get("Selling_Price", "N/A")
        about = p.get("About_Product", "No description provided.")
        image_url = p.get("Image_URL", "No image available.")

        block = f"""Product {i+1}:
Name: {name}
Price: ${price}
Key Info: {about}
Image: {image_url}"""
        context_blocks.append(block)

    context = "\n\n".join(context_blocks)

    # Add fallback few-shot example for image queries
    fallback_example = """Example:
User Query: [Image of an iPhone]
Context: No Apple or iPhone product appears in the list.

Answer:
Sorry, I couldn’t identify the product in the uploaded image based on the current context. Please try uploading a clearer image or rephrasing your query.
"""

    # Adjust instructions based on image presence
    instructions = f"""You are a multimodal shopping assistant for an e-commerce platform.

A customer has asked a question that may involve an uploaded image, a text query, or both. You are given relevant product information retrieved that compares the image and/or text query against product images and descriptions.

Instructions:
- ONLY use the retrieved product information in the context.
- Do NOT guess or hallucinate any product details.
- If the product in the uploaded image is not among the retrieved results, say so clearly.
- If helpful, include product name and image URL in your answer.
- Be concise, accurate, and under 100 words.
"""

    prompt = f"""{instructions}

{fallback_example if image_uploaded else ""}

Context:
{context}

User Query:
{user_query}

Answer:"""

    return prompt


In [None]:
# images and query
image_path = "/content/drive/MyDrive/Gen_AI/LongBoards.jpg"
query_text = "Is this board good for cruising?"

# embedding
with open(image_path, "rb") as f:
  img_emb = embed_image(f)

txt_emb = embed_query(query_text)

# combine images and text embeddings
combined_emb = np.concatenate([txt_emb, img_emb])
combined_emb /= np.linalg.norm(combined_emb)

# retrieve top context
raw_result = client.query_points(
    collection_name="combined_products",
    query=combined_emb.tolist(),
    limit=5,
    with_payload=True
)
hits = raw_result.points

# get prompt
prompt = build_multimodal_prompt_with_image(query_text, hits, image_uploaded=True)

# get answer
answer = generate_llama_answer(prompt)
print(answer)

You are a multimodal shopping assistant for an e-commerce platform.

A customer has asked a question that may involve an uploaded image, a text query, or both. You are given relevant product information retrieved that compares the image and/or text query against product images and descriptions.

Instructions:
- ONLY use the retrieved product information in the context.
- Do NOT guess or hallucinate any product details.
- If the product in the uploaded image is not among the retrieved results, say so clearly.
- If helpful, include product name and image URL in your answer.
- Be concise, accurate, and under 100 words.


Example:
User Query: [Image of an iPhone]
Context: No Apple or iPhone product appears in the list.

Answer:
Sorry, I couldn’t identify the product in the uploaded image based on the current context. Please try uploading a clearer image or rephrasing your query.


Context:
Product 1:
Name: Bamboo Skateboards - Pintail Longboard Tiki Man 44" x 9.5" Deck
Price: $74.77
Key In

Based on the product information provided, the Prism Skate Co Biscuit Artist Series Skateboard is suitable for cruising. It has a small size and lightweight design, making it easy to maneuver and control. Additionally, the bamboo top sheet provides a smooth and responsive ride.