# Large Dataset Embeddings

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
# !pip install git+https://github.com/openai/CLIP.git
import clip

In [3]:
# Load the product catalogue and preview its first rows.
# Note: despite the variable name, the source file is a CSV, not an Excel workbook.
excel_path = './dataset_m.csv'
df = pd.read_csv(filepath_or_buffer=excel_path)
df.head(5)

Unnamed: 0,name,description,price,image_key
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49,2
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68,3
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49,4
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53,5
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57,6


In [4]:
# Dimensions of the loaded catalogue as a (rows, columns) tuple.
df.shape

(6219, 4)

In [5]:
# image_key values of the items whose images have been uploaded: two
# contiguous runs, 2..1374 and 5075..6220 (range() excludes its upper bound).
valid_image_indices = [*range(2, 1375), *range(5075, 6221)]

# Optional (currently disabled): keep only the rows that have an uploaded image.
# df = df[df['image_key'].isin(valid_image_indices)].reset_index(drop=True)

# print(f"New dataset shape: {df.shape}")

In [6]:
# Load CLIP (ViT-B/32) together with its matching image preprocessing transform.
#
# The notebook was written for a multi-GPU host and prefers GPU index 3.
# A hard-coded "cuda:3" fails with an invalid-device error on machines that
# expose fewer than four GPUs, so fall back to the first GPU in that case,
# and to the CPU when CUDA is not available at all.
PREFERRED_GPU_INDEX = 3

if torch.cuda.is_available():
    if torch.cuda.device_count() > PREFERRED_GPU_INDEX:
        device = f"cuda:{PREFERRED_GPU_INDEX}"
    else:
        device = "cuda:0"
else:
    device = "cpu"

# `device` is reused by later cells to move token/image tensors next to the model.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 112MiB/s]


In [13]:
# Folder that holds one image per product, named "<image_key>.png" / ".jpg".
image_folder = "./dataset"

# Create the embedding columns only when they do not exist yet.
# The embedding cell resumes from rows whose embeddings are still null, so
# unconditionally resetting the columns here would throw away every embedding
# already computed whenever this cell is re-run.
for embedding_col in ("text_embedding", "image_embedding"):
    if embedding_col not in df.columns:
        df[embedding_col] = None

In [15]:
# Setup
BATCH_SIZE = 32
SAVE_EVERY = 200  # only relevant to periodic checkpointing, which is currently disabled

# Add image_path column based on image_key
df['image_path'] = df['image_key'].apply(lambda k: os.path.join(image_folder, f"{k}.png"))

# Identify rows that still need processing
to_process = df[df["text_embedding"].isnull() | df["image_embedding"].isnull()].copy()
to_process.reset_index(inplace=True)  # Keep original index (column "index") for df.at[]


def _flush_batch(texts, images, df_rows):
    """Encode one batch with CLIP and store the embeddings in ``df``.

    Args:
        texts: list of strings, one per row in the batch.
        images: list of preprocessed image tensors, aligned with ``texts``.
        df_rows: list of ``df`` index labels, aligned with ``texts``/``images``.

    Does nothing when the batch is empty (``torch.stack`` would raise on an
    empty list).
    """
    if not df_rows:
        return

    with torch.no_grad():
        # truncate=True: clip.tokenize raises when an input exceeds the model's
        # context length; the character cap applied to the text does not
        # guarantee that for multi-byte input, so let the tokenizer truncate.
        text_tokens = clip.tokenize(texts, truncate=True).to(device)
        text_embeds = model.encode_text(text_tokens).cpu().numpy()

        image_input = torch.stack(images).to(device)
        image_embeds = model.encode_image(image_input).cpu().numpy()

    # image_path was already filled in for every row above, so only the
    # embeddings need to be written here.
    for j, df_idx in enumerate(df_rows):
        df.at[df_idx, "text_embedding"] = text_embeds[j].tolist()
        df.at[df_idx, "image_embedding"] = image_embeds[j].tolist()


batched_texts = []
batched_images = []
batched_df_rows = []

for i, row in tqdm(to_process.iterrows(), total=len(to_process)):
    idx_in_df = row["index"]  # Original index in df
    image_key = row['image_key']

    # Prepare image path
    jpg_path = os.path.join(image_folder, f"{image_key}.jpg")
    png_path = os.path.join(image_folder, f"{image_key}.png")

    # Convert .jpg to .png if needed (the .jpg is removed only after a successful save)
    if os.path.isfile(jpg_path) and not os.path.isfile(png_path):
        try:
            # Context manager closes the source file handle deterministically.
            with Image.open(jpg_path) as jpg_image:
                jpg_image.convert("RGB").save(png_path, "PNG")
            os.remove(jpg_path)
            print(f"Converted {jpg_path} → {png_path}")
        except Exception as e:
            print(f"Error converting {jpg_path}: {e}")

    if not os.path.isfile(png_path):
        print(f"Warning: No image found for {image_key}")
        continue

    # Prepare image first: if it cannot be loaded, nothing has been appended
    # yet, so the text / image / row lists can never get out of step.
    try:
        with Image.open(png_path) as raw_image:
            image_tensor = preprocess(raw_image.convert("RGB"))
    except Exception as e:
        print(f"Error loading image for {image_key}: {e}")
        continue

    # Prepare text (name + description, capped at 77 characters)
    text_str = f"{row['name']} {row['description']}"[:77]

    batched_texts.append(text_str)
    batched_images.append(image_tensor)
    batched_df_rows.append(idx_in_df)

    # Encode as soon as a full batch has been collected
    if len(batched_df_rows) >= BATCH_SIZE:
        _flush_batch(batched_texts, batched_images, batched_df_rows)
        batched_texts = []
        batched_images = []
        batched_df_rows = []

# Encode whatever is left over. Doing this after the loop (instead of testing
# for the last iteration inside it) guarantees the tail batch is processed even
# when the final rows were skipped because their image was missing or unreadable.
_flush_batch(batched_texts, batched_images, batched_df_rows)

# Final save
# NOTE: CSV stores each embedding list as its string representation; it must be
# parsed back into a list after reading the file.
df.to_csv('./embeddings_final.csv', index=False)
print("✅ Done and saved final version.")

 43%|████▎     | 2649/6187 [01:43<00:01, 2957.07it/s]



 76%|███████▌  | 4693/6187 [01:43<00:00, 6267.59it/s]

Converted ./dataset/5075.jpg → ./dataset/5075.png


100%|██████████| 6187/6187 [03:04<00:00, 33.46it/s]  


✅ Done and saved final version.


In [17]:
# Read the saved results back from disk and show the last rows as a sanity check.
# The embedding columns come back as strings (the CSV holds each list's text form).
output_df = pd.read_csv(filepath_or_buffer='./embeddings_final.csv')
output_df.tail(5)

Unnamed: 0,name,description,price,image_key,text_embedding,image_embedding,image_path
6214,SUITLTD Men Men Navy Striped Single-Breasted R...,SUITLTD Men Men Navy Striped Single Breasted R...,3276,6216,"[0.01517486572265625, -0.059661865234375, 0.04...","[-0.14111328125, -0.2071533203125, 0.163330078...",./dataset/6216.png
6215,Parx Men Blue Single-Breasted Urban Fit Formal...,Parx Men Blue Single Breasted Urban Fit Formal...,3999,6217,"[-0.0133209228515625, 0.06854248046875, 0.1406...","[-0.17431640625, 0.0230560302734375, 0.1593017...",./dataset/6217.png
6216,Peter England Elite Men Black Single-Breasted ...,Peter England Elite Men Black Single Breasted ...,5999,6218,"[0.32568359375, 0.1627197265625, 0.13818359375...","[0.1160888671875, -0.08514404296875, 0.0349426...",./dataset/6218.png
6217,Parx Men Brown Single-Breasted Urban Fit Forma...,Parx Men Brown Single Breasted Urban Fit Forma...,3199,6219,"[0.141357421875, 0.197998046875, 0.22387695312...","[-0.09588623046875, 0.031402587890625, 0.04428...",./dataset/6219.png
6218,SUITLTD Men Grey Striped Suit,"SUITLTD Men Grey Striped Suit, SUITLTD, Suits...",5004,6220,"[-0.2841796875, -0.124267578125, -0.3076171875...","[-0.10406494140625, -0.0645751953125, 0.025955...",./dataset/6220.png
