In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from sklearn.metrics.pairwise import cosine_similarity

from src.data import load_data
from src.evaluation import naive_map_at_k, weighted_map_at_k
from src.settings import Settings
from src.utils import gen_batches

In [2]:
# Define variables that will be used in the notebook
DATA_PATH: Path = Path("../data")
SETTINGS: Settings = Settings()

#### Load data

In [24]:
# Load all the dataframes
query_df: pd.DataFrame
product_df: pd.DataFrame
label_df: pd.DataFrame
query_df, product_df, label_df= load_data(datapath=DATA_PATH)

QueryDF: Rows [480], Columns: [3]
ProductDF: Rows [42,994], Columns: [9]
LabelDF: Rows [233,448], Columns: [4]


#### Cost expectation 

In [4]:
# Using tiktoken, we can calculate the cost of the embeddings
embedding_model: str = "text-embedding-3-large"
tokenizer: tiktoken.Encoding = tiktoken.encoding_for_model(model_name=embedding_model)
price_per_million_tokens: float = 0.13

In [5]:
total_cost: float = (len(tokenizer.encode(text="".join(product_df["product_name"].tolist()))) * price_per_million_tokens) / 1e6
print(f"Cost of generating the embeddings of all the products: {total_cost} USD")

Cost of generating the embeddings of all the products: 0.05395585 USD


In [6]:
total_cost: float = (len(tokenizer.encode(text="".join(query_df["query"].tolist()))) * price_per_million_tokens) / 1e6
print(f"Cost of generating the embeddings of all the queries: {total_cost} USD")

Cost of generating the embeddings of all the queries: 0.00029263 USD


## Generate the embeddings

In [7]:
# Define OpenAI asynchronous client
client: AsyncOpenAI = AsyncOpenAI(api_key=SETTINGS.openai_api_key)

#### Generating product names embeddings

In [8]:
# The embeddings of products are going to be generated in batches because there are more than 40k products 
product_names_embeddings: list = []
batch_size: int = 1_024
total: int = len(product_df["product_name"].tolist()) // batch_size + 1

In [9]:
# Batch execution
for idx, batch in enumerate(gen_batches(iterable=product_df["product_name"].tolist(), n=batch_size), start=1):

    if any([idx % 20 == 0, idx // total == 1]):
        print(f"Batch {idx}/{total} processing...")

    vectors: list = await client.embeddings.create(
        model=embedding_model,
        input=batch
    )
    product_names_embeddings.extend([embedding.embedding for embedding in vectors.data])

    if any([idx % 20 == 0, idx // total == 1]):
        print(f"Batch {idx}/{total} done!")

Batch 20/42 processing...
Batch 20/42 done!
Batch 40/42 processing...
Batch 40/42 done!
Batch 42/42 processing...
Batch 42/42 done!


In [10]:
# Convert embeddings to numpy arrays
product_names_embeddings: np.ndarray = np.array(object=product_names_embeddings)
product_names_embeddings.shape

(42994, 3072)

#### Generate queries embeddings

In [11]:
# Queries embeddings
vectors = await client.embeddings.create(model=embedding_model, input=query_df["query"].to_list())
queries_embeddings: np.ndarray = np.array(object=[embedding.embedding for embedding in vectors.data])
queries_embeddings.shape

(480, 3072)

## Calculate cosine similarities

In [12]:
# Compute cosine similarities for all queries
cosine_similarities: np.ndarray = cosine_similarity(X=queries_embeddings, Y=product_names_embeddings)
cosine_similarities.shape

(480, 42994)

In [13]:
# Get top 10 predictions for each query
top_10_indices: np.ndarray = np.argpartition(a=cosine_similarities, kth=-10, axis=1)[:, -10:]
top_10_indices.shape

(480, 10)

In [14]:
# Sort the top 10 predictions by similarity score
top_10_sorted: np.ndarray = np.array([row[np.argsort(-cosine_similarities[i, row])] for i, row in enumerate(top_10_indices)])
top_10_sorted.shape

(480, 10)

## Calculate MAP@10

In [55]:
# Get preds and actual values 
query_df["preds"] = top_10_sorted.tolist()

query_df = query_df.merge(
    how="left",
    right=label_df[label_df["label"] == "Exact"].groupby(by="query_id")["product_id"].unique().rename("actuals_exact"),
    left_on="query_id",
    right_index=True
)
query_df = query_df.merge(
    how="left",
    right=label_df.groupby(by="query_id")["product_id"].unique().rename("actuals"),
    left_on="query_id",
    right_index=True
)

In [56]:
query_df["actuals_exact"] = query_df["actuals_exact"].fillna("").apply(list)

In [57]:
# Calculate naive map@10
query_df['map@k'] = query_df.apply(lambda x: naive_map_at_k(x['actuals_exact'], x['preds'], k=10), axis=1)
query_df.loc[:, 'map@k'].mean()

0.38698099233906524

In [28]:
def apply_weighted_map(row, label_df, k=10, partial_weight=0.5):
    return weighted_map_at_k(query_id=row['query_id'], true_ids=row["actuals"], predicted_ids=row['preds'], label_df=label_df, k=k, partial_weight=partial_weight)

In [49]:
query_df['weighted_map@k'] = query_df.apply(lambda x: apply_weighted_map(row=x, label_df=label_df, k=10, partial_weight=0.5), axis=1)
weighted_map_at_k_score = query_df['weighted_map@k'].mean()
print(f"Weighted MAP@K: {weighted_map_at_k_score}")

Weighted MAP@K: 0.5927448330026456
