In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-4.1.0-py3-none-any.whl (215 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/215.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m122.9/215.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-4.1.0


In [36]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [2]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never store the Pinecone API key in the notebook: take it from the
# PINECONE_API_KEY environment variable, or prompt for it (input is not echoed).
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=api_key)

In [3]:
index_name = "ecomm-product-desc"

# Only create the index when it is missing, so this cell can be re-run
# (e.g. after a kernel restart) without failing on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the length of the embedding vectors upserted below
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Handle used for all subsequent upserts and queries.
index = pc.Index(index_name)


In [4]:
# Load only the three columns this notebook uses from the pipe-delimited file.
products = pd.read_csv(
    'products.csv',
    sep='|',
    usecols=["ProductID", "SellerID", "product_description"],
)

# Number of rows that have no seller recorded.
products["SellerID"].isna().sum()

134

In [5]:
# Give rows without a seller the placeholder value "Unknown".
products = products.assign(SellerID=products["SellerID"].fillna("Unknown"))

In [6]:
# A row is only usable when it has both a description to embed and a product id.
required_columns = ["product_description", "ProductID"]
products = products.dropna(subset=required_columns)
# Renumber rows 0..n-1 after dropping.
products = products.reset_index(drop=True)

print('Products with description:', len(products))
products.sample(5)

Products with description: 1226


Unnamed: 0,product_description,SellerID,ProductID
806,Bluetooth 5.3 y conexión rápida：Los auriculare...,monyhigh,B0D2XW81QH
1216,Para el jugador experto llegan los auriculares...,ardistel,1540743
100,【Emparejamiento Automático Bluetooth 5.3】Los a...,Unknown,B0CQLQL87D
518,[Calidad de sonido estéreo de alta fidelidad] ...,tozo,B07RP6NF5J
87,【diseño de orejas de gato con LED brillantes】E...,Unknown,B0CTHBV8RT


In [7]:
MODEL_NAME = 'jinaai/jina-embeddings-v2-base-es'

# Use the GPU when one is present, otherwise fall back to CPU instead of
# crashing on `.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer takes no device argument (the former `device="gpu"` was not a
# documented option); input tensors are moved to the model's device at encode time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out tokens.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings.
        attention_mask: Tensor that is non-zero for real tokens and zero for
            padding; it is broadcast over the embedding dimension.

    Returns:
        One pooled embedding per sequence: the mask-weighted sum over dim 1
        divided by the mask total (floored at 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / mask_total

def encode(sentences, batch_size=32):
    """Embed sentences with the notebook's `tokenizer` and `model`.

    Sentences are processed in batches; each batch is tokenized, run through
    the model without gradient tracking, mean-pooled over tokens and
    L2-normalized.

    Args:
        sentences: A sequence of strings. A single string is treated as one
            sentence rather than being sliced character by character.
        batch_size: Number of sentences sent through the model per batch.

    Returns:
        A CPU tensor with one L2-normalized embedding row per input sentence,
        in input order.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        # Materialize so slicing yields plain lists for the tokenizer.
        sentences = list(sentences)

    # Follow whatever device the model lives on instead of assuming CUDA.
    target_device = next(model.parameters()).device

    embeddings_list = []
    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]
        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
        # Inputs must be on the same device as the model weights.
        encoded_input = {key: tensor.to(target_device) for key, tensor in encoded_input.items()}
        with torch.no_grad():
            model_output = model(**encoded_input)
            batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Normalize, then move to CPU so results don't accumulate in device memory.
        embeddings_list.append(F.normalize(batch_embeddings, p=2, dim=1).cpu())
    return torch.cat(embeddings_list, dim=0)

In [9]:
# Embed every product description; row i of `embedding` corresponds to the
# i-th row of `products`.
sentences = products["product_description"].tolist()
embedding = encode(sentences)

In [10]:
# Build one Pinecone record per product. Rows and embeddings are paired by
# position (zip) rather than by DataFrame index label, so this stays correct
# even if `products` does not carry a 0..n-1 index.
assert len(embedding) == len(products), "expected one embedding per product row"

# NOTE(review): the index later reports fewer vectors than there are rows here
# (1188 vs 1226), presumably because repeated ProductIDs overwrite each other on
# upsert -- confirm whether duplicates should be dropped upstream.
vectors = [
    {
        "id": product_id,
        "metadata": {"SellerID": seller_id, "ProductID": product_id},
        "values": values,
    }
    for product_id, seller_id, values in zip(
        products["ProductID"], products["SellerID"], embedding.tolist()
    )
]

In [11]:
import itertools

def chunks(iterable, batch_size=100):
    """Yield successive tuples of up to ``batch_size`` items from ``iterable``.

    The final tuple may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def take_batch():
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling take_batch until it returns the
    # empty tuple, i.e. until the source is exhausted.
    for batch in iter(take_batch, ()):
        yield batch

In [12]:
# Send the records to the index in requests of at most 100 vectors each.
for vector_batch in chunks(vectors, batch_size=100):
    index.upsert(vectors=vector_batch)

In [13]:
# Fetch and print the index statistics to check how many vectors were stored.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1188}},
 'total_vector_count': 1188}


In [14]:
# Peek at the first five cleaned product rows.
products.head(5)

Unnamed: 0,product_description,SellerID,ProductID
0,Diseño ergonómico a prueba de sudor: el diseño...,Unknown,B0CPSHKR37
1,♪♫【Entrega rápida-Amazon】:Gracias por elegirno...,Unknown,B09CKJ9T48
2,【Auriculares para dormir & Diadema Bluetooth& ...,Unknown,B0CRDS3MCN
3,Tecnología Bluetooth 5.3: Los auriculares inal...,Unknown,B0CSW88F32
4,Tecnología Bluetooth 5.3: Los auriculares inal...,Unknown,B0CSW88F32


In [26]:
# Scratch example: pull the ids of the first product and embed its description
# on its own, as a flat list of floats.
i = 0
first_product = products.loc[0]
product_id = first_product["ProductID"]
seller_id = first_product["SellerID"]
first_embedding = encode([first_product["product_description"]])
vector = first_embedding.ravel().tolist()

In [33]:
def get_similar_products(row, top_k=5, vector=None):
    """Find the indexed products most similar to ``row`` that have a different seller.

    Args:
        row: Mapping or Series providing "SellerID" and "product_description".
        top_k: Number of matches requested from the index (defaults to 5, the
            value that used to be hard-coded).
        vector: Optional precomputed embedding (flat list of floats) for the
            row's description. When omitted, the description is encoded here.

    Returns:
        A list of ``(product_id, score)`` tuples, in the order the index
        returned the matches.
    """
    if vector is None:
        # encode() returns a 1 x dim tensor for a single sentence; flatten it.
        vector = encode([row["product_description"]]).ravel().tolist()

    # NOTE(review): rows whose seller was filled with the "Unknown" placeholder
    # all share one SellerID, so this filter also hides every other product with
    # an unknown seller -- confirm that this is intended.
    query_results = index.query(
        vector=vector,
        top_k=top_k,
        filter={
            "SellerID": {"$ne": row["SellerID"]}
        }
    )
    return [(match['id'], match['score']) for match in query_results['matches']]

In [37]:
# Collect one record per (product, match) pair and build the DataFrame once at
# the end; concatenating inside the loop re-copies the whole frame on every
# iteration, which is quadratic in the number of products.
similar_records = []
for _, row in tqdm(products.iterrows(), total=products.shape[0]):
    product_id = row['ProductID']
    for matching_product_id, score in get_similar_products(row):
        similar_records.append({
            "ProductID": product_id,
            "Matching_ProductID": matching_product_id,
            "score": score,
        })

# Passing the columns explicitly keeps the schema fixed even with no matches.
similar_products_df = pd.DataFrame(
    similar_records, columns=["ProductID", "Matching_ProductID", "score"]
)
similar_products_df.info()
similar_products_df.sample(10)

  0%|          | 0/1226 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6130 entries, 0 to 6129
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ProductID           6130 non-null   object 
 1   Matching_ProductID  6130 non-null   object 
 2   score               6130 non-null   float64
dtypes: float64(1), object(2)
memory usage: 143.8+ KB


Unnamed: 0,ProductID,Matching_ProductID,score
1516,B091CQH6VT,B0D1C46R58,0.770321
6100,1478286,B0CPMZX3P6,0.770498
1856,B0CL94BQ56,B0B2DTNYRJ,0.765542
3999,B0BVY3MXS4,B0CZRWDMLF,0.817633
2299,B0CQF7NX81,B0CHYDX493,0.92818
5148,B0CSSRYVTM,B0BCKGV29W,0.902429
1545,B00NBR6RDS,B0763L841F,0.685303
1783,B0CT4VHRBQ,B0B8HXXNFZ,0.850429
2718,B01N0Z1YKE,B07YCDW6JP,0.690574
3905,B084PB8DKV,B0CZHKMMZM,0.787633


In [39]:
# Show the first five match rows.
similar_products_df.head(5)

Unnamed: 0,ProductID,Matching_ProductID,score
0,B0CPSHKR37,B0CZQ7CSMX,0.906745
1,B0CPSHKR37,B0CW5JB9C9,0.885579
2,B0CPSHKR37,B0CR5C2RCD,0.881963
3,B0CPSHKR37,B0CR5LN8Q5,0.881963
4,B0CPSHKR37,B0CHJN5CJB,0.871771


In [38]:
# Write the matches to a CSV file without the DataFrame index column.
output_path = "similar_products.csv"
similar_products_df.to_csv(output_path, index=False)