In [None]:
import os
import pickle
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image

from models import data
from models import imagebind_model
from models.imagebind_model import ModalityType

In [None]:
# Number of results to keep per query. Not referenced in the cells shown here;
# presumably consumed by later search cells — TODO confirm.
topk = 3
# Name of the Pinecone index that is created (if missing) and filled with the image embeddings.
my_index_name = "pinterest-multimodal-search"

In [None]:
# Run on the first GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the pretrained ImageBind model.
model = imagebind_model.imagebind_huge(pretrained=True)
# Inference only: switch off train-mode behaviour so embeddings are deterministic.
model.eval()
# The inputs are loaded onto `device`, so the weights must live there too;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model = model.to(device)

In [None]:
import os
import requests

def download_image(url, save_directory, timeout=30):
    """Download ``url`` into ``save_directory`` and return the saved file path.

    Args:
        url: Address of the image to fetch.
        save_directory: Directory to write into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the caller indefinitely.

    Returns:
        The path of the written file, or ``""`` when the URL carries no file
        name, the server does not answer with HTTP 200, or the request fails.
    """
    from urllib.parse import urlparse

    os.makedirs(save_directory, exist_ok=True)
    print(url)

    # Derive the file name from the URL *path* only, so a query string or
    # fragment (e.g. "?w=200") never ends up in the file name. str() keeps
    # non-string inputs such as NaN on the "request fails -> ''" path below.
    filename = os.path.basename(urlparse(str(url)).path)
    if not filename:
        # A URL ending in "/" has no file name; writing would target the directory.
        return ""

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return ""

    if response.status_code != 200:
        return ""

    save_path = os.path.join(save_directory, filename)
    with open(save_path, "wb") as f:
        f.write(response.content)
    return save_path


In [None]:
def get_text_image_embeddings(df, save_directory):
    """Embed every row's description and image with ImageBind and pickle the result.

    Relies on the notebook-level ``model``, ``device`` and ``download_image``.

    Args:
        df: DataFrame with ``image_description`` and ``image_url`` columns.
            It is modified in place: ``text_embeddings`` and
            ``image_embeddings`` columns are added.
        save_directory: Scratch directory for the downloaded images; each
            image is deleted again once it has been embedded.

    Returns:
        The same ``df``. Rows whose image could not be downloaded keep
        ``None`` in both embedding columns. The frame is also pickled to
        ``data/ImageBind_multimodal_pinterestData_embeddings``.
    """
    from models import data
    import pickle
    import os

    df['text_embeddings'] = None
    df['image_embeddings'] = None

    for i, row in df.iterrows():
        print(i)
        text_description = [row['image_description']]
        image_path = download_image(row['image_url'], save_directory)
        if not image_path:
            # Download failed: leave this row's embeddings as None.
            continue

        try:
            inputs = {
                ModalityType.TEXT: data.load_and_transform_text(text_description, device),
                ModalityType.VISION: data.load_and_transform_vision_data([image_path], device),
            }

            with torch.no_grad():
                embeddings = model(inputs)
        finally:
            # The file is only needed while loading; remove it even when the
            # model raises so failed rows do not leave images behind.
            os.remove(image_path)

        # .cpu() is required before .numpy() when the model runs on a GPU.
        text_embedding = embeddings[ModalityType.TEXT].cpu().numpy()
        print(text_embedding.shape)
        image_embedding = embeddings[ModalityType.VISION].cpu().numpy()
        print(image_embedding.shape)
        df.at[i, 'text_embeddings'] = text_embedding
        df.at[i, 'image_embeddings'] = image_embedding

    df['text_embeddings'] = df['text_embeddings'].astype(object)
    df['image_embeddings'] = df['image_embeddings'].astype(object)

    # Create the output directory first so the (expensive) run is not lost to
    # a FileNotFoundError at the very last step.
    output_path = 'data/ImageBind_multimodal_pinterestData_embeddings'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'wb') as file:
        pickle.dump(df, file)

    return df

Download `pinterest-fashion-dataset.csv` from Kaggle (https://www.kaggle.com/datasets/samikshakolhe/pinterest-fashion-dataset) and place it in the notebook's working directory.

In [None]:
# Load the Pinterest fashion CSV from the working directory.
df = pd.read_csv('pinterest-fashion-dataset.csv')
# Scratch directory that receives each image while it is being embedded.
save_directory = 'images/'
# Compute text and image embeddings for every row; the function also pickles the
# resulting frame under data/ and returns the same (mutated) DataFrame.
new_df = get_text_image_embeddings(df, save_directory)

In [None]:
# Reload the pickled DataFrame written by get_text_image_embeddings
# (it holds the text and image embeddings per row).
# NOTE: pickle.load can execute arbitrary code — only open files you created yourself.
with open('data/ImageBind_multimodal_pinterestData_embeddings', 'rb') as file:
      image_data_df = pickle.load(file)

In [None]:
# Pinecone Connect
from pinecone import Pinecone, ServerlessSpec
import pickle
import os

# Rows whose image download failed carry no embedding (None), so inspect the
# first row that actually has one instead of assuming row 0 does.
first_embedding = image_data_df['image_embeddings'].dropna().iloc[0]
print(type(first_embedding))

# os.environ[...] raises a clear KeyError when the variable is missing;
# the key itself is never written into the notebook.
pinecone = Pinecone(
    api_key=os.environ['PINECONE_API_KEY']
)

# The second axis of a stored embedding is its dimensionality.
vector_dim = first_embedding.shape[1]
print(vector_dim)

# list_indexes() returns index descriptions, not plain strings; .names()
# yields the index names so the membership test actually works.
if my_index_name not in pinecone.list_indexes().names():
    # Create the index with the embedding dimension
    pinecone.create_index(
        name=my_index_name,
        dimension=vector_dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Connect to the index
my_index = pinecone.Index(name=my_index_name)

In [None]:
# Display the indexes visible to this Pinecone client (sanity check that my_index_name exists).
pinecone.list_indexes()

In [None]:
def convert_to_upsert(data):
    """Turn ``(id, values, metadata)`` triples into upsert dictionaries.

    ``values`` is a sequence of sequences; it is flattened into one list.
    Returns a list of ``{"id", "values", "metadata"}`` dicts, in input order.
    """
    records = []
    for vector_id, nested_values, meta in data:
        # Concatenate the inner sequences into one flat vector.
        flat_values = []
        for inner in nested_values:
            flat_values.extend(inner)
        records.append({"id": vector_id, "values": flat_values, "metadata": meta})
    return records

In [None]:
#Insert Image Embeddings into the Pinecone

# Keep only the rows that have an image embedding. The explicit copy makes the
# column assignment below act on an independent frame (no SettingWithCopy issues).
image_data_df = image_data_df.dropna(subset=['image_embeddings']).copy()
# The stringified DataFrame index label is used as the vector id.
image_data_df["vector_id"] = image_data_df.index.map(str)

# Get all the metadata
# NOTE(review): 'ID' is the row position after dropna, while vector_id is the
# original index label — the two differ once rows were dropped; confirm intent.
final_metadata = []
for position in range(len(image_data_df)):
    # Build the row once instead of once per field.
    row = image_data_df.iloc[position]
    final_metadata.append({
        'ID': position,
        'user_name': row.user_name,
        'age': int(row.age),
        'gender': row.gender,
        'category': row.category,
        'brand': row.brand,
        'image_url': row.image_url,
    })

image_IDs = image_data_df.vector_id.tolist()
# Each stored embedding is a NumPy array; convert to nested Python lists.
image_embeddings = [arr.tolist() for arr in image_data_df.image_embeddings.tolist()]

# Create the single list of dictionaries to insert
data_to_upsert = list(zip(image_IDs, image_embeddings, final_metadata))
data_to_upsert = convert_to_upsert(data_to_upsert)
# # Upload the final data

def chunks(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items each."""
    yield from (
        lst[start:start + chunk_size]
        for start in range(0, len(lst), chunk_size)
    )

# Upload the payload in batches of 100, using a pool of 30 threads.
with pinecone.Index(my_index_name, pool_threads=30) as index:
    # Fire off every batch without waiting for the previous one.
    async_results = []
    for ids_vectors_chunk in chunks(data_to_upsert, chunk_size=100):
        async_results.append(index.upsert(vectors=ids_vectors_chunk, async_req=True))

    # Block until each request has finished; .get() raises if a batch failed.
    for async_result in async_results:
        async_result.get()

# Show the index statistics after the upload.
my_index.describe_index_stats()