In [1]:
import os
import openai


In [2]:

print("Check environment\n---------------------")

pinecone_api_key = os.environ.get('PINECONE_API_KEY') or ""

print("pinecone_api_key:", pinecone_api_key)


# openai doesn't need to be initialized, but need to set api key
openai.api_key=os.environ.get("OPENAI_API_KEY")

Check environment
---------------------
pinecone_api_key: f385511c-d71a-41c4-b148-2ab39def04b6


## Load image data

In [3]:
import pandas as pd
df_images = pd.read_csv('product-listing-dataset/images/images.csv')

In [4]:
df_images.head()

Unnamed: 0,image_id,height,width,path
0,010-mllS7JL,106,106,14/14fe8812.jpg
1,01dkn0Gyx0L,122,122,da/daab0cad.jpg
2,01sUPg0387L,111,111,d2/d2daaae9.jpg
3,1168jc-5r1L,186,186,3a/3a4e88e6.jpg
4,11RUV5Fs65L,30,500,d9/d91ab9cf.jpg


## generate a json to store tiltes, descrption and its attached images in a json 

In [7]:
from tqdm import tqdm
def embed(docs: list[str], batch_size=100) -> list[list[float]]:
    doc_embeds = []
    # Process documents in batches
    for i in tqdm(range(0, len(docs), batch_size)):
        batch = docs[i:i + batch_size]
        res = openai.embeddings.create(
        input=batch,
        model="text-embedding-ada-002"
        )
        # Extract embeddings for each document in the batch
        batch_embeds = [r.embedding for r in res.data]
        doc_embeds.extend(batch_embeds)
    return doc_embeds

In [6]:
import json
import pandas as pd

# Path to your JSON file
# file
file_paths = ['product-listing-dataset/listings/listings_0.json','product-listing-dataset/listings/listings_1.json','product-listing-dataset/listings/listings_3.json']


count = 0
main_json = {}
for file_path in file_paths:
    # Open and read the file line by line
    with open(file_path, 'r') as file:
        for line in file:
            one_entry_img_paths = []
            # Parse the JSON data from each line
            json_data = json.loads(line)
            if 'main_image_id' in json_data and 'item_name' in json_data:
                if json_data['item_name'][0]['language_tag'].startswith('en'):
                    image_id = json_data['main_image_id']
                    item_name = json_data['item_name'][0]['value']
                    image_path = df_images[df_images['image_id'] == image_id]['path'].tolist()[0]
                    one_entry_img_paths.append(image_path)
                    if 'other_image_id' in json_data:
                        other_ids = json_data['other_image_id']
                        for ids in other_ids:
                            im_path = df_images[df_images['image_id'] == image_id]['path'].tolist()[0]
                            one_entry_img_paths.append(im_path)
                    else:
                        other_ids = ''
                    # Collect bullet points if they exist and are in English
                    if 'bullet_point' in json_data:
                        bullets = [bp['value'] for bp in json_data['bullet_point'] if bp['language_tag'].startswith('en')]
                        bullet_text = '; '.join(bullets)  # Join all bullet points into a single string
                    else:
                        bullet_text = ''


                    single_entry = {'item_name': item_name, 'features': bullet_text, 'image_paths': one_entry_img_paths}

                    main_json[count]=single_entry

                    count+=1


                    
                    # img_paths.append(one_entry_img_paths)
                    # image_ids.append(image_id)
                    # item_names.append(item_name)
                    # feature_list.append(bullet_text)
        #             # other_images_ids.append(other_ids)
        #             if count == 100:
        #                 break
        # break

In [9]:
texts = [main_json[item]['item_name'] + main_json[item]['features'] for item in main_json.keys()]

In [12]:
embeddings=embed(texts)

100%|██████████| 200/200 [06:08<00:00,  1.84s/it]


In [None]:
for id,embedding in zip(main_json,embeddings):
    print(id, embedding)

In [14]:
for id,embedding in zip(main_json,embeddings):
    main_json[id]['text_embedding']=embedding
    

In [34]:
len(main_json)

19924

In [20]:
import json

# Specify the file name
filename = 'updated_struct_jsons/titles-descrp-paths.json'

# Writing JSON data
with open(filename, 'w') as f:
    json.dump(main_json, f, indent=4)

## create image_embedding

In [22]:
import pandas as pd
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np



# Pre-trained ResNet50 model setup
model = models.resnet50(pretrained=True)
model = torch.nn.Sequential(*(list(model.children())[:-1]))  # Remove the last layer
model.eval()  # Set the model to evaluation mode

# Image preprocessing
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract features
def extract_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = preprocess(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # No need to compute gradients
        features = model(image)
    return features.flatten().numpy()




## create a id mapping to images and its titles

In [24]:
count_id = 0

id_to_pid_image_pth_embed = {}
for pid in tqdm(main_json):

    for path in main_json[pid]['image_paths']:

        id_to_pid_image_pth_embed[count_id] = {'p_id':pid, 'image_pth': path}
        id_to_pid_image_pth_embed[count_id]['image_embedding'] = extract_features('product-listing-dataset/images/small/'+path)

        count_id+=1



100%|██████████| 19924/19924 [57:58<00:00,  5.73it/s] 


In [25]:
id_to_pid_image_pth_embed

{0: {'p_id': 0,
  'image_pth': '66/665cc994.jpg',
  'image_embedding': array([0.08097482, 0.4570974 , 0.05311731, ..., 0.91808164, 0.3820621 ,
         0.22633062], dtype=float32)},
 1: {'p_id': 0,
  'image_pth': '66/665cc994.jpg',
  'image_embedding': array([0.08097482, 0.4570974 , 0.05311731, ..., 0.91808164, 0.3820621 ,
         0.22633062], dtype=float32)},
 2: {'p_id': 0,
  'image_pth': '66/665cc994.jpg',
  'image_embedding': array([0.08097482, 0.4570974 , 0.05311731, ..., 0.91808164, 0.3820621 ,
         0.22633062], dtype=float32)},
 3: {'p_id': 1,
  'image_pth': 'b4/b4f9d0cc.jpg',
  'image_embedding': array([0.58037436, 0.        , 0.00552477, ..., 0.00571453, 0.02709544,
         0.785745  ], dtype=float32)},
 4: {'p_id': 2,
  'image_pth': '2b/2b1c2516.jpg',
  'image_embedding': array([0.2102012 , 0.3178227 , 0.34608513, ..., 0.4285842 , 2.1814303 ,
         1.3671106 ], dtype=float32)},
 5: {'p_id': 2,
  'image_pth': '2b/2b1c2516.jpg',
  'image_embedding': array([0.2102012 , 

In [27]:
def ndarray_to_list(obj):
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

In [28]:
# Specify the file name
filename = 'updated_struct_jsons/intid_to_pdid_path_imageEmbed.json'

# Writing JSON data
with open(filename, 'w') as f:
    json.dump(id_to_pid_image_pth_embed, f, indent=4,default=ndarray_to_list)




In [7]:
def ndarray_to_list(obj):
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

## split jsons seperating embeddings

In [1]:
import json
def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)


In [2]:
id_to_pid_image_pth_embed=load_json('updated_struct_jsons/intid_to_pdid_path_imageEmbed.json')

In [3]:
id_to_pid_image_pth_embed['0']

{'p_id': 0,
 'image_pth': '66/665cc994.jpg',
 'image_embedding': [0.08097482472658157,
  0.4570974111557007,
  0.0531173050403595,
  0.31813111901283264,
  0.666716456413269,
  0.15806016325950623,
  0.2719702124595642,
  2.2316596508026123,
  0.16540542244911194,
  0.28136956691741943,
  0.28358110785484314,
  0.3700280487537384,
  0.26680660247802734,
  0.8432878851890564,
  0.5482661128044128,
  0.027355656027793884,
  0.261583536863327,
  0.3337819576263428,
  0.3239371180534363,
  0.20092500746250153,
  0.3601188659667969,
  0.07882501929998398,
  0.09701230376958847,
  0.5052309632301331,
  0.1591760516166687,
  0.31101569533348083,
  0.609798014163971,
  0.6178173422813416,
  0.28462761640548706,
  0.7570529580116272,
  0.054478470236063004,
  1.0819370746612549,
  0.6098838448524475,
  0.2797328531742096,
  0.16604606807231903,
  0.28493598103523254,
  0.3661588430404663,
  0.216024711728096,
  0.07642373442649841,
  0.3476225435733795,
  0.6214969158172607,
  0.115883804857730

In [4]:
id_to_img_embed={}

for key in id_to_pid_image_pth_embed:
    id_to_img_embed[key]=id_to_pid_image_pth_embed[key]['image_embedding']



In [9]:
filename = 'updated_struct_jsons/intid_to_imageEmbed.json'


with open(filename, 'w') as f:
    json.dump(id_to_img_embed, f, indent=4,default=ndarray_to_list)

In [10]:
del id_to_img_embed

In [4]:
id_to_pid_image_pth_embed['0'].keys()

dict_keys(['p_id', 'image_pth', 'image_embedding'])

In [5]:
id_to_img_paths_pid={}

for key in id_to_pid_image_pth_embed:
    id_to_img_paths_pid[key]={'p_id':id_to_pid_image_pth_embed[key]['p_id'], 'image_pth': id_to_pid_image_pth_embed[key]['image_pth']}

In [8]:
filename = 'updated_struct_jsons/intid_to_imagepth_pid.json'


with open(filename, 'w') as f:
    json.dump(id_to_img_paths_pid, f, indent=4,default=ndarray_to_list)