In [None]:
!pip install -qU \
  pinecone-client==3.1.0 \
  pinecone-datasets==0.7.0 \
  sentence-transformers==2.2.2 \
  pinecone-notebooks==0.1.1

In [2]:
import os

print("Check environment\n---------------------")

pinecone_api_key = os.environ.get('PINECONE_API_KEY') or ""

print("pinecone_api_key:", pinecone_api_key)

Check environment
---------------------
pinecone_api_key: d4b84512-ffa0-41ab-b149-d7db7637c9e2


## Load Data

In [31]:
import pandas as pd
df_images = pd.read_csv('images/images.csv')

In [32]:
import json
import pandas as pd

# Path to your JSON file
file_path = 'listings/listings_0.json'

# Lists to store image IDs and item names
image_ids = []
img_paths = []
item_names = []
feature_list = []
# other_images_ids = []

count = 0
# Open and read the file line by line
with open(file_path, 'r') as file:
    for line in file:
        one_entry_img_paths = []
        # Parse the JSON data from each line
        json_data = json.loads(line)
        if 'main_image_id' in json_data and 'item_name' in json_data:
            if json_data['item_name'][0]['language_tag'].startswith('en'):
                image_id = json_data['main_image_id']
                item_name = json_data['item_name'][0]['value']
                image_path = df_images[df_images['image_id'] == image_id]['path'].tolist()[0]
                one_entry_img_paths.append(image_path)
                if 'other_image_id' in json_data:
                    other_ids = json_data['other_image_id']
                    for ids in other_ids:
                        im_path = df_images[df_images['image_id'] == image_id]['path'].tolist()[0]
                        one_entry_img_paths.append(im_path)
                else:
                    other_ids = ''
                # Collect bullet points if they exist and are in English
                if 'bullet_point' in json_data:
                    bullets = [bp['value'] for bp in json_data['bullet_point'] if bp['language_tag'].startswith('en')]
                    bullet_text = '; '.join(bullets)  # Join all bullet points into a single string
                else:
                    bullet_text = ''
                
                img_paths.append(one_entry_img_paths)
                image_ids.append(image_id)
                item_names.append(item_name)
                feature_list.append(bullet_text)
                # other_images_ids.append(other_ids)
                count+=1
                if count == 50:
                    break




In [33]:
metadata = {
    'primary_image_id': image_ids,
    'image_path': img_paths,
    'item_name': item_names,
    'features': feature_list
}

In [78]:
count_id = 0
main_json = {}

for i, p_id in enumerate(metadata['primary_image_id']):

    # pid_to_int = {}
    single_entry = {'item_name': metadata['item_name'][i], 'features': metadata['features'][i], 'image_paths': []}

    for path in metadata['image_path'][i]:

        single_entry['image_paths'].append(path)

    main_json[p_id] = single_entry

In [104]:
main_json

{'81NP7qh2L6L': {'item_name': 'AmazonBasics PETG 3D Printer Filament, 1.75mm, 1 kg Spool 1.75mm AMG10528516-10',
  'features': "3D printer filament with 1.75mm diameter + / - .05mm; designed to fit most common 3D printers (check spool size for compatibility); Translucent Yellow; 1 kg spool; Made of PETG plastic, known for its ease of use (like PLA) and durable strength (like ABS); no heating bed needed; offers easy bed adhesion, stiffness, and a glossy finish; Spool's built-in gauge shows percentage of material remaining and approximate length remaining; Engineered to reduce jamming; resealable storage bag included to protect filament between use; Measures 9.2 by 8.3 by 3 inches (LxWxH); weighs 2.2 pounds; backed by an AmazonBasics 1-year limited warranty",
  'image_paths': ['66/665cc994.jpg', '66/665cc994.jpg', '66/665cc994.jpg']},
 '61Rp4qOih9L': {'item_name': 'Stone & Beam Stone Brown Swatch, 25020039-01',
  'features': '',
  'image_paths': ['b4/b4f9d0cc.jpg']},
 '714CmIfKIYL': {'it

In [105]:
import json

# Specify the file name
filename = 'mainjson.json'

# Writing JSON data
with open(filename, 'w') as f:
    json.dump(main_json, f, indent=4)

In [90]:
count_id = 0

pid_to_int_id = {}
img_paths_to_int_id = {}
for i, pid in enumerate(main_json):

    for path in main_json[pid]['image_paths']:

        pid_to_int_id[count_id] = pid
        img_paths_to_int_id[count_id] = path

        count_id+=1



In [106]:
# Specify the file name
filename = 'pid_to_int_id.json'

# Writing JSON data
with open(filename, 'w') as f:
    json.dump(pid_to_int_id, f, indent=4)

In [107]:
# Specify the file name
filename = 'img_paths_to_int_id.json'

# Writing JSON data
with open(filename, 'w') as f:
    json.dump(img_paths_to_int_id, f, indent=4)

## Resnet50 feature extractor

In [93]:
import pandas as pd
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np



# Pre-trained ResNet50 model setup
model = models.resnet50(pretrained=True)
model = torch.nn.Sequential(*(list(model.children())[:-1]))  # Remove the last layer
model.eval()  # Set the model to evaluation mode

# Image preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract features
def extract_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # No need to compute gradients
        features = model(image)
    return features.flatten().numpy()






In [94]:
int_id_to_embedding = {}
for int_id, img_path in img_paths_to_int_id.items():
    # embeddings = []
    # for img_path in img_paths:
    emb = extract_features('images/small/'+img_path)
    int_id_to_embedding[int_id] = emb

## Pinecone Setup

In [99]:
from pinecone import ServerlessSpec, Pinecone

pc = Pinecone(api_key='d4b84512-ffa0-41ab-b149-d7db7637c9e2')

pc.list_indexes().names()



['pd-listing']

In [100]:
pc.describe_index(name="pd-listing")

{'dimension': 2048,
 'host': 'pd-listing-p3m338e.svc.aped-4627-b74a.pinecone.io',
 'metric': 'euclidean',
 'name': 'pd-listing',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [101]:
product_listing_index = pc.Index('pd-listing')

In [103]:
for int_id, emb in int_id_to_embedding.items():

    product_listing_index.upsert(
        vectors=[
            {
                "id": str(int_id), 
                "values": emb, 
                # "metadata": {"genre": "drama"}
            },
        ],
        namespace= "ns1"
    )

In [36]:
all_embeddings = []
for img_paths in metadata['image_path']:
    embeddings = []
    for img_path in img_paths:
        try:
            emb = extract_features('images/small/'+img_path)
            embeddings.append(emb)
        except Exception as e:
            print(f"Error processing image {img_path}: {e}")
            embeddings.append(np.nan)
    all_embeddings.append(embeddings)