In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from src.data import load_data
from src.evaluation import naive_map_at_k, apply_weighted_map

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Notebook-wide configuration.
# Location of the data directory, expressed relative to the notebook's folder.
DATA_PATH: Path = Path("..") / "data"

#### Load data

In [3]:
# Read the query, product and label tables from DATA_PATH in a single call.
# The names are annotated up front because tuple unpacking cannot carry
# inline annotations; the unpacking order follows load_data's return value.
query_df: pd.DataFrame
product_df: pd.DataFrame
label_df: pd.DataFrame
(query_df, product_df, label_df) = load_data(datapath=DATA_PATH)

QueryDF: Rows [480], Columns: [3]
ProductDF: Rows [42,994], Columns: [9]
LabelDF: Rows [233,448], Columns: [4]


#### Load model

In [4]:
# Identifier of the embedding model handed to SentenceTransformer.
model_id: str = "Snowflake/snowflake-arctic-embed-l"

# NOTE(review): the device is pinned to "mps"; on a machine without that
# backend this call will presumably need a different device string -- confirm
# before running elsewhere.
model: SentenceTransformer = SentenceTransformer(
    model_name_or_path=model_id,
    device="mps",
)

## Generate the embeddings

#### Generate query embeddings

In [5]:
# Encode every query string with the loaded model, 32 sentences per batch.
# The resulting rows follow the row order of query_df.
queries_embeddings: np.ndarray = model.encode(
    sentences=query_df["query"].tolist(),
    show_progress_bar=True,
    batch_size=32,
)

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [6]:
# Sanity check: display the shape of the query embedding matrix
# (the recorded run shows 480 rows, matching the 480 rows of query_df).
queries_embeddings.shape

(480, 1024)

#### Generate product name embeddings

In [7]:
# Encode every product name with the same model and batch size used for the
# queries, so both sets of vectors live in the same embedding space.
# The resulting rows follow the row order of product_df.
product_names_embeddings: np.ndarray = model.encode(
    sentences=product_df["product_name"].to_list(),
    show_progress_bar=True,
    batch_size=32,
)

Batches:   0%|          | 0/1344 [00:00<?, ?it/s]

In [8]:
# Sanity check: display the shape of the product-name embedding matrix
# (the recorded run shows 42,994 rows, matching the rows of product_df).
product_names_embeddings.shape

(42994, 1024)

## Calculate cosine similarities

In [9]:
# Score every query against every product name.
# Rows follow the query embeddings and columns follow the product-name
# embeddings, so entry [i, j] compares query i with product j.
cosine_similarities: np.ndarray = cosine_similarity(
    X=queries_embeddings,
    Y=product_names_embeddings,
)
cosine_similarities.shape

(480, 42994)

In [10]:
# Pick, for each query, the column positions of its 10 highest similarities.
# argpartition only guarantees that the last 10 positions of each row hold the
# 10 largest values; it does not order them, which is why they are sorted in
# a later step.
top_10_indices: np.ndarray = np.argpartition(
    cosine_similarities,
    -10,
    axis=1,
)[:, -10:]
top_10_indices.shape

(480, 10)

In [11]:
# Sort the top 10 predictions by similarity score (best match first).
#
# Done with whole-array operations instead of a per-row Python loop:
#   1. gather the similarity of each candidate,
#   2. compute, per row, the order that sorts those similarities descending,
#   3. reorder the candidate indices with that same order.
# Unlike rebuilding an array from a list of rows, this keeps the (n, 10)
# shape even when there are no queries at all.
top_10_scores: np.ndarray = np.take_along_axis(cosine_similarities, top_10_indices, axis=1)
descending_order: np.ndarray = np.argsort(-top_10_scores, axis=1)
top_10_sorted: np.ndarray = np.take_along_axis(top_10_indices, descending_order, axis=1)
top_10_sorted.shape

(480, 10)

## Calculate MAP@10

In [12]:
# Attach the predictions and the ground-truth product ids to every query.
#
# The columns are filled with plain assignments (no merge) so the cell can be
# re-run safely. Merging `query_df` with itself-updated columns on a second
# execution would produce `actuals_exact_x` / `actuals_exact_y` style columns
# and the follow-up column access would then fail with a KeyError.
#
# NOTE(review): `preds` holds column positions of `cosine_similarities`, i.e.
# row positions of the product-name embeddings, while the actuals hold
# `product_id` values from `label_df`. Comparing them is only meaningful if a
# product's id equals its row position in `product_df` -- TODO confirm.


def _relevant_product_ids(labels: pd.DataFrame, accepted_labels: list) -> pd.Series:
    """Collect the unique ``product_id`` values per ``query_id`` for the given labels.

    Args:
        labels: Frame with ``query_id``, ``product_id`` and ``label`` columns.
        accepted_labels: Values of ``label`` that count as relevant.

    Returns:
        Series indexed by ``query_id`` whose values are arrays of product ids.
    """
    is_accepted = labels["label"].isin(accepted_labels)
    return labels.loc[is_accepted].groupby(by="query_id")["product_id"].unique()


def _lookup_as_lists(query_ids: pd.Series, ids_by_query: pd.Series) -> pd.Series:
    """Look up each query id and return its product ids as a list.

    Queries without any entry in ``ids_by_query`` get an empty list.
    """
    matched = query_ids.map(ids_by_query)
    # Unmatched queries come back as NaN (a scalar), everything else is array-like.
    return matched.apply(lambda ids: list(ids) if pd.api.types.is_list_like(ids) else [])


query_df["preds"] = top_10_sorted.tolist()

# Products labelled "Exact" only.
query_df["actuals_exact"] = _lookup_as_lists(
    query_df["query_id"],
    _relevant_product_ids(label_df, ["Exact"]),
)

# Products labelled either "Exact" or "Partial".
query_df["actuals"] = _lookup_as_lists(
    query_df["query_id"],
    _relevant_product_ids(label_df, ["Exact", "Partial"]),
)

In [15]:
# Naive MAP@10: score each query's top-10 predictions against its "Exact"
# products only, store the per-query score, then report the mean over queries.
query_df["map@k"] = query_df.apply(
    lambda row: naive_map_at_k(row["actuals_exact"], row["preds"], k=10),
    axis=1,
)
print(f"Naive MAP@K: {query_df['map@k'].mean()}")

Naive MAP@K: 0.3212957129997061


In [14]:
# Weighted MAP@10: score each query row with apply_weighted_map, passing the
# full label table, k=10 and partial_weight=0.5, then report the mean.
# NOTE(review): partial_weight presumably down-weights "Partial" matches
# relative to "Exact" ones -- confirm against src.evaluation.
query_df["weighted_map@k"] = query_df.apply(
    lambda row: apply_weighted_map(row=row, label_df=label_df, k=10, partial_weight=0.5),
    axis=1,
)
weighted_map_at_k_score = query_df["weighted_map@k"].mean()
print(f"Weighted MAP@K: {weighted_map_at_k_score}")

Weighted MAP@K: 0.5212010995370371
