In [1]:
import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from pathlib import Path
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from src.data import load_data
from src.settings import Settings
from src.utils import gen_batches

In [2]:
DATA_PATH: Path = Path("../data")
SETTINGS: Settings = Settings()

#### Load data

In [3]:
query_df: pd.DataFrame
product_df: pd.DataFrame
label_df: pd.DataFrame
query_df, product_df, label_df= load_data(datapath=DATA_PATH)

QueryDF: Rows [480], Columns: [3]
ProductDF: Rows [42,994], Columns: [9]
LabelDF: Rows [233,448], Columns: [4]


#### Select product name or combination of features.

In [4]:
embedding_model: str = "text-embedding-3-small"
tokenizer: tiktoken.Encoding = tiktoken.encoding_for_model(model_name=embedding_model)
price_per_million_tokens: float = 0.13

In [5]:
total_cost: float = (len(tokenizer.encode(text="".join(product_df["product_name"].tolist()))) * price_per_million_tokens) / 1e6
print(f"Cost of generating the embeddings of all the products: {total_cost} USD")

Cost of generating the embeddings of all the products: 0.05395585 USD


#### OpenAI embeddings

In [6]:
client: AsyncOpenAI = AsyncOpenAI(api_key=SETTINGS.openai_api_key)

In [7]:
results: list = []
batch_size: int = 1_024
total: int = len(product_df["product_name"].tolist()) // batch_size + 1

In [8]:
query_df

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners
...,...,...,...
475,483,rustic twig,Faux Plants and Trees
476,484,nespresso vertuo next premium by breville with...,Espresso Machines
477,485,pedistole sink,Kitchen Sinks
478,486,54 in bench cushion,Furniture Cushions


In [8]:
for idx, batch in enumerate(gen_batches(iterable=product_df["product_name"].tolist(), n=batch_size), start=1):

    if any([idx % 20 == 0, idx // total == 1]):
        print(f"Batch {idx}/{total} processing...")

    vectors: list = await client.embeddings.create(
        model=embedding_model,
        input=batch
    )
    results.extend([embedding.embedding for embedding in vectors.data])

    if any([idx % 20 == 0, idx // total == 1]):
        print(f"Batch {idx}/{total} done!")

Batch 20/42 processing...
Batch 20/42 done!
Batch 40/42 processing...
Batch 40/42 done!
Batch 42/42 processing...
Batch 42/42 done!


In [43]:
np_result = np.array(results)

In [44]:
query = "salon chair"
query_embedded = await client.embeddings.create(model=embedding_model, input=query)
# query_embedded.data[0].embedding

In [46]:
cosine_similarities = cosine_similarity(
    np.array(query_embedded.data[0].embedding).reshape(1, -1), np_result
)

In [64]:
preds = cosine_similarities.flatten().argsort()[-10:][::-1]

In [65]:
actual = label_df[(label_df["query_id"] == 0) & (label_df["label"] == "Exact")]["product_id"].tolist()

In [66]:
hits = 0
for pred in preds.flatten():
    if pred in actual:
        hits += 1

In [None]:
import numpy as np
from typing import List, Dict

def get_top_k_relevant_products(
    query: str,
    embeddings_map: Dict[str, List[float]],
    k: int = 5
) -> List[str]:
    """
    Find the top k product IDs most relevant to the given query.

    Args:
    query (str): The input query string.
    embeddings_map (Dict[str, List[float]]): A dictionary mapping product IDs to their embeddings.
    k (int): The number of top relevant products to return. Default is 5.

    Returns:
    List[str]: A list of the top k relevant product IDs.
    """
    # Convert embeddings_map to a more efficient structure
    product_ids = np.array(list(embeddings_map.keys()))
    embeddings = np.array(list(embeddings_map.values()))

    # Transform query to embedding (placeholder)
    query_embedding = np.random.rand(3072)  # Replace with actual transformation

    # Compute cosine similarity between query and all product embeddings
    similarity_scores = np.dot(embeddings, query_embedding) / (
        np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)
    )

    # Get indices of top k similar products
    top_k_indices = np.argsort(similarity_scores)[-k:][::-1]

    # Return the corresponding product IDs
    return product_ids[top_k_indices].tolist()