In [1]:
# Cell 1: Import Libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import torch
from tqdm.auto import tqdm # For a progress bar

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# Cell 2: Initialize Models and Pinecone
# --- IMPORTANT ---
# Replace with your actual Pinecone API Key and Environment
PINECONE_API_KEY = "pcsk_4vgmFt_4ABagzHLAoEzjBmZX1VoEPZZDGJUZ6wNnoiNDkRCT6s32h6fRQyWpSH35LJGqdu"
PINECONE_ENVIRONMENT = "us-east-1"
PINECONE_INDEX_NAME = "product-recommendations"

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to the index
index = pc.Index(PINECONE_INDEX_NAME)

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the multi-modal CLIP model
# This model can embed both text and images into the same vector space.
# Load the new, smaller model directly from the internet
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("CLIP model loaded successfully.")

Using device: cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

CLIP model loaded successfully.


In [3]:
# Cell 3: Load the Cleaned Dataset
# We use the cleaned dataset we saved in the previous notebook.
df = pd.read_csv('cleaned_dataset.csv')

# Let's fill any remaining NaNs in object columns just in case
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].fillna('N/A')

print("Cleaned dataset loaded.")
df.head()

Cleaned dataset loaded.


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,,5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
2,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
3,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,89.99,"['Home & Kitchen', 'Furniture', 'Game & Recrea...",['https://m.media-amazon.com/images/I/41p4d4VJ...,,"18.9""D x 14.2""W x 26""H",,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a
4,Folews Bathroom Organizer Over The Toilet Stor...,Folews Store,,63.99,"['Home & Kitchen', 'Furniture', 'Bathroom Furn...",['https://m.media-amazon.com/images/I/41ixgM73...,Folews,"12.6""D x 25.2""W x 68.5""H",China,,,aba4138e-6401-52ca-a099-02e30b638db4


In [4]:
# Cell 4: Prepare Data for Upsert
# Reasoning: We create a 'combined_text' field to generate a rich embedding.
# The metadata will store crucial product information that we want to retrieve
# along with the vector during a search query.

df['combined_text'] = (
    "Title: " + df['title'] + ". " +
    "Brand: " + df['brand'] + ". " +
    "Description: " + df['description'].str.slice(0, 200) + ". " + # Truncate long descriptions
    "Material: " + df['material'] + ". " +
    "Color: " + df['color'] + "."
)

# First, let's see how many rows we actually have
print(f"Total rows in the cleaned dataframe: {len(df)}")

# Now, sample the smaller of 2000 or the total length of the dataframe
sample_size = min(2000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

print(f"Processing a sample of {len(df_sample)} products.")

Total rows in the cleaned dataframe: 215
Processing a sample of 215 products.


In [5]:
# Cell 5: Generate Embeddings and Upsert to Pinecone in Batches (Corrected)

batch_size = 64
for i in tqdm(range(0, len(df_sample), batch_size)):
    # Get the current batch
    i_end = min(i + batch_size, len(df_sample))
    batch = df_sample.iloc[i:i_end]

    # Get unique IDs for the batch
    ids = [str(x) for x in batch['uniq_id']]

    # Create text embeddings
    texts = batch['combined_text'].tolist()
    text_embeddings = model.encode(texts).tolist()

    # Prepare metadata
    metadata = []
    for idx, row in batch.iterrows():
        meta = {
            'title': row['title'],
            'brand': row['brand'],
            'price': row['price'],
            'description': row['description'],
            # ✅ THIS IS THE CORRECTED LINE 👇
            'image_url': row['images'].split(',')[0].strip("[]' ") if isinstance(row['images'], str) else '',
            'material': row['material'],
            'color': row['color']
        }
        metadata.append(meta)

    # Prepare records for upsert
    records_to_upsert = zip(ids, text_embeddings, metadata)

    # Upsert to Pinecone
    index.upsert(vectors=list(records_to_upsert))

print("\nUpsert to Pinecone complete.")
print(f"Pinecone index stats: {index.describe_index_stats()}")

  0%|          | 0/4 [00:00<?, ?it/s]


Upsert to Pinecone complete.
Pinecone index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}
