In [None]:
# Set to True when running on Google Colab: this flag guards the Colab-only
# cells below (installing omegaconf and mounting Google Drive).
colab = False

In [None]:
# Required for SPLADE package
# Installed only when the `colab` flag is set; other environments are expected
# to provide omegaconf already.
if colab:
    !pip install omegaconf

In [None]:
if colab:
    import os
    import zipfile
    from google.colab import drive

    fastcampus_dir = '/content/drive/MyDrive/fastcampus'

    # Release any existing Drive mount first, then mount it again.
    drive.flush_and_unmount()
    drive.mount('/content/drive')

    # Work from the course folder so the relative file names used by the
    # later cells resolve against it.
    os.chdir(fastcampus_dir)
    print("Current Working Directory: ", os.getcwd())

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json

In [None]:
from splade.splade.models.transformer_rep import Splade
from transformers import AutoTokenizer
import torch

sparse_model_id = 'naver/splade-cocondenser-ensembledistil'
# Alternative checkpoint: 'naver/splade-v3'

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)
sparse_model.eval()  # inference only; no training happens in this notebook

# The tokenizer is loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

In [None]:
# Input: JSON-lines file of image embeddings, read by the loading cell below.
image_emb_file = 'img_embeddings_fashion_fine_tuned.json'
# Output: JSON-lines file that receives one upsert record per row.
upsert_emb_file = 'upsert_vectors_fashion_fine_tuned.json'

In [None]:
# Load the image embeddings from a JSON-lines file: every non-empty line is a
# JSON object mapping image ids to their embedding (a list of numbers).
embeddings = {}

with open(image_emb_file, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would raise on an empty string.
            continue

        # Store each embedding as a NumPy array keyed by its image id.
        for img_name, emb_list in json.loads(line).items():
            embeddings[img_name] = np.array(emb_list)

# One row per image: `img_id` holds the dictionary key and `img_emb` the array.
# The variable name (including its spelling) is kept because the merge cell
# further down refers to it.
image_embedddings = (
    pd.DataFrame([embeddings])
    .T
    .reset_index()
    .rename(columns={"index": "img_id", 0: "img_emb"})
)

In [None]:
# Load the item table. Later cells read its `ImageId`, `entity_id` and `doc`
# columns.
new_df = pd.read_csv("clothes_final_sparse_doc.csv")

In [None]:
base_path = "imaterialist-fashion-2020-fgvc7/cropped_images/"

# "<ImageId>_<entity_id>" identifies one cropped image; build it once and reuse
# it for both derived columns.
entity_key = new_df['ImageId'].astype(str) + "_" + new_df['entity_id'].astype(str)

# Relative path of the cropped image file.
new_df['img_path'] = base_path + entity_key + ".jpg"
# Join key shared with the image-embedding DataFrame.
new_df['img_id'] = entity_key

In [None]:
# Attach each row's image embedding via the shared `img_id` key.
# Dropping a pre-existing `img_emb` column first keeps the cell idempotent:
# without it, re-running the cell would merge a second time and pandas would
# emit `img_emb_x` / `img_emb_y` instead of `img_emb`.
new_df = pd.merge(
    new_df.drop(columns=['img_emb'], errors='ignore'),
    image_embedddings,
    on='img_id',
    how='left',
)

# A left join keeps rows that have no matching embedding (img_emb is NaN).
# Those rows cannot be converted with `.tolist()` later, so surface them here.
missing_emb_count = int(new_df['img_emb'].isna().sum())
if missing_emb_count:
    print(f"Warning: {missing_emb_count} rows have no image embedding (img_emb is NaN)")

In [None]:
import torch

In [None]:
def gen_sparse_vector(text):
    """Encode ``text`` with the sparse model and return its non-zero entries.

    Uses the notebook-level ``tokenizer`` and ``sparse_model``.

    Args:
        text: Text to encode; a single string is expected.

    Returns:
        A tuple ``(indices, values)`` of two equal-length lists: the positions
        of the non-zero entries of the document representation and the weights
        stored at those positions. Both are always lists, also when the
        representation has exactly one (or no) non-zero entry.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Run on whichever device the model lives on rather than assuming CUDA.
    # NOTE(review): assumes sparse_model is a torch.nn.Module — confirm.
    device = next(sparse_model.parameters()).device

    with torch.no_grad():
        sparse_emb = sparse_model(
            d_kwargs=tokens.to(device)
        )['d_rep'].squeeze()

    # nonzero() on a 1-D tensor has shape (k, 1). Squeeze only the last axis so
    # that k == 1 still gives a length-1 vector; a bare squeeze() would collapse
    # it to a scalar and the function would return an int and a float.
    nonzero_idx = sparse_emb.nonzero().squeeze(-1)

    indices = nonzero_idx.cpu().tolist()
    values = sparse_emb[nonzero_idx].cpu().tolist()

    return indices, values

def upsert_format(id, text, img_emb):
    """Assemble one upsert record combining a dense and a sparse vector.

    Args:
        id: Record id; it is also concatenated into the image path, so it must
            be a string.
        text: Text passed to ``gen_sparse_vector`` to obtain the sparse part.
        img_emb: Dense image embedding, stored unchanged under ``"values"``.

    Returns:
        A dict with the keys ``"id"``, ``"values"``, ``"sparse_values"``
        (``"indices"`` / ``"values"``) and ``"metadata"`` (``"img_path"``).
    """
    sparse_indices, sparse_weights = gen_sparse_vector(text)
    img_path = "imaterialist-fashion-2020-fgvc7/cropped_images/" + id + ".jpg"

    return {
        "id": id,
        "values": img_emb,
        "sparse_values": {"indices": sparse_indices, "values": sparse_weights},
        "metadata": {"img_path": img_path},
    }

In [None]:
# open('upsert_vectors_fashion_fine_tuned.json', 'w').close()

In [None]:
# Build one upsert record per row, keeping it in memory (`upserts`) and
# mirroring it to a JSON-lines file (one record per line).
upserts = list()

# Open the output once for the whole loop. Mode 'w' starts from an empty file,
# so re-running the cell leaves exactly one line per row instead of appending
# duplicates, and the file always matches the freshly rebuilt `upserts` list.
with open(upsert_emb_file, 'w') as file:
    for _, row in tqdm(new_df.iterrows(), total=new_df.shape[0]):
        # Build the record exactly once per row (this runs the sparse model)
        # and reuse it for both the list and the file.
        record = upsert_format(row['img_id'], row['doc'], row['img_emb'].tolist())
        upserts.append(record)
        file.write(json.dumps(record) + '\n')

Check that the upsert file was written correctly

In [None]:
# Read the JSON-lines file back, one dictionary per non-empty line, to confirm
# that the records were written. The path comes from `upsert_emb_file` so this
# check always inspects the same file the generation cell wrote.
with open(upsert_emb_file, 'r') as file:
    # Blank lines are skipped; json.loads would raise on them.
    data_read = [json.loads(line) for line in file if line.strip()]

# data_read now holds every record read from the file.
print(f"Successfully read {len(data_read)} items from {upsert_emb_file}")

# Optionally, print the dictionaries to verify

In [None]:
!pip install pinecone-client

In [None]:
## Upsert to pineconeDB!!
import os

from pinecone import Pinecone

# Prefer the PINECONE_API_KEY environment variable so a real key never has to
# be saved inside the notebook; the placeholder string remains the fallback.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY"))
index = pc.Index("fastcampus")

# Display the current index statistics as the cell output.
index.describe_index_stats()

In [None]:
# Send the records in fixed-size batches instead of one request carrying the
# whole collection; a single oversized request can be rejected by the service.
# NOTE(review): 100 is a conservative choice — check Pinecone's current
# per-request limits and tune if needed.
upsert_batch_size = 100

for start in tqdm(range(0, len(upserts), upsert_batch_size)):
    index.upsert(upserts[start:start + upsert_batch_size])

# Show the index statistics after the upload as the cell output.
index.describe_index_stats()