#### WARNING! This Notebook uses a GPU

In [1]:
!nvidia-smi

Mon Jul 22 20:38:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX 6000 Ada Gene...    On  | 00000000:C1:00.0 Off |                  Off |
| 30%   37C    P8              22W / 300W |      1MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import sys
import os

# Quick fix to use package from external instance with GPU
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import InputExample, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

from src.data import load_data
from src.evaluation import naive_map_at_k, apply_weighted_map
from src.fine_tuning import create_product_dict, create_query_dict, create_relevance_dict, prepare_data

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Define variables that will be used in the notebook
DATA_PATH: Path = Path("../data")
SEED: int = 1399

#### Load data

In [6]:
# Load all the dataframes
query_df: pd.DataFrame
product_df: pd.DataFrame
label_df: pd.DataFrame
query_df, product_df, label_df= load_data(datapath=DATA_PATH)

QueryDF: Rows [480], Columns: [3]
ProductDF: Rows [42,994], Columns: [9]
LabelDF: Rows [233,448], Columns: [4]


#### Load model

In [7]:
model_id: str = "Snowflake/snowflake-arctic-embed-l"
model: SentenceTransformer = SentenceTransformer(model_name_or_path=model_id, device="cuda")

## Fine Tuning

#### Queries, Corpus and Relevant Documents

In [8]:
# Create queries (query_id, query), corpus (product_id, product_name) and relevant_documents (exact matches) dictionaries.
queries: dict[str, str]
corpus: dict[str, str]
relevant_documents: dict[str, list[str]]
queries, corpus, relevant_documents = prepare_data(query_df=query_df, product_df=product_df, label_df=label_df)

In [9]:
len(queries), len(corpus), len(relevant_documents)

(480, 42994, 379)

#### Train and test splits

In [10]:
train_queries: list[str]
test_queries: list[str]
train_queries, test_queries = train_test_split(list(queries.keys()), test_size=0.2, random_state=SEED, shuffle=True)

In [11]:
train_queries_dict: dict[str, str]
test_queries_dict: dict[str, str]
train_queries_dict, test_queries_dict = {k: queries[k] for k in train_queries}, {k: queries[k] for k in test_queries}

In [12]:
train_relevant_documents: dict[str, list[str]]
test_relevant_documents: dict[str, list[str]]
train_relevant_documents, test_relevant_documents = {k: relevant_documents[k] for k in train_queries if k in relevant_documents}, {k: relevant_documents[k] for k in test_queries if k in relevant_documents}

#### Dataloader

In [13]:
# Create train input examples
examples: list[InputExample] = []
for query_id, query in train_queries_dict.items():
    if query_id in train_relevant_documents:
        for product_id in train_relevant_documents[query_id]:
            examples.append(InputExample(texts=[query, corpus[product_id]]))

In [14]:
# Create Dataloader
BATCH_SIZE: int = 128
loader: DataLoader = DataLoader(dataset=examples, batch_size=BATCH_SIZE)

#### Define Loss and Evaluator

In [15]:
# Loss
loss = losses.MultipleNegativesRankingLoss(model)

In [16]:
# Evaluator
evaluator: InformationRetrievalEvaluator = InformationRetrievalEvaluator(
    queries=test_queries_dict,
    corpus=corpus,
    relevant_docs=test_relevant_documents,
    show_progress_bar=True
)

#### Train

In [17]:
EPOCHS = 2
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_model',
    show_progress_bar=True,
    evaluator=evaluator,
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
150,No log,No log,0.571429,0.688312,0.727273,0.792208,0.571429,0.506494,0.467532,0.431169,0.049945,0.093425,0.123041,0.193934,0.512839,0.6391,0.387754,0.571429,0.688312,0.727273,0.792208,0.571429,0.502165,0.467532,0.431169,0.049945,0.0931,0.123041,0.193934,0.512267,0.636936,0.387759
300,No log,No log,0.571429,0.675325,0.701299,0.779221,0.571429,0.506494,0.45974,0.433766,0.049861,0.093165,0.111268,0.183292,0.50768,0.63221,0.377196,0.571429,0.675325,0.701299,0.779221,0.571429,0.506494,0.45974,0.433766,0.049861,0.093165,0.111268,0.183292,0.50768,0.63221,0.377196


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:26<00:00, 26.09s/it]


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:26<00:00, 26.05s/it]


## Generate the embeddings

#### Generate queries embeddings

In [18]:
queries_embeddings: np.ndarray = model.encode(
    sentences=query_df["query"].to_list(),
    batch_size=128,
    show_progress_bar=True
)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
queries_embeddings.shape

(480, 1024)

#### Generating product names embeddings

In [20]:
product_names_embeddings: np.ndarray = model.encode(
    sentences=product_df["product_name"].tolist(),
    batch_size=128,
    show_progress_bar=True
)

Batches:   0%|          | 0/336 [00:00<?, ?it/s]

In [21]:
product_names_embeddings.shape

(42994, 1024)

## Calculate cosine similarities

In [22]:
# Compute cosine similarities for all queries
cosine_similarities: np.ndarray = cosine_similarity(X=queries_embeddings, Y=product_names_embeddings)
cosine_similarities.shape

(480, 42994)

In [23]:
# Get top 10 predictions for each query
top_10_indices: np.ndarray = np.argpartition(a=cosine_similarities, kth=-10, axis=1)[:, -10:]
top_10_indices.shape

(480, 10)

In [24]:
# Sort the top 10 predictions by similarity score
top_10_sorted: np.ndarray = np.array([row[np.argsort(-cosine_similarities[i, row])] for i, row in enumerate(top_10_indices)])
top_10_sorted.shape

(480, 10)

## Calculate MAP@10

In [25]:
# Get preds and actual values 
query_df["preds"] = top_10_sorted.tolist()

query_df = query_df.merge(
    how="left",
    right=label_df[label_df["label"] == "Exact"].groupby(by="query_id")["product_id"].unique().rename("actuals_exact"),
    left_on="query_id",
    right_index=True
)
query_df["actuals_exact"] = query_df["actuals_exact"].fillna("").apply(list)

query_df = query_df.merge(
    how="left",
    right=label_df[label_df["label"].isin(["Exact", "Partial"])].groupby(by="query_id")["product_id"].unique().rename("actuals"),
    left_on="query_id",
    right_index=True
)
query_df["actuals"] = query_df["actuals"].fillna("").apply(list)

In [26]:
# Calculate naive map@10
query_df['map@k'] = query_df.apply(lambda x: naive_map_at_k(x['actuals_exact'], x['preds'], k=10), axis=1)
print(f"Naive MAP@K: {query_df.loc[:, 'map@k'].mean()}")

Naive MAP@K: 0.39397622492283957


In [27]:
query_df['weighted_map@k'] = query_df.apply(lambda x: apply_weighted_map(row=x, label_df=label_df, k=10, partial_weight=0.5), axis=1)
weighted_map_at_k_score = query_df['weighted_map@k'].mean()
print(f"Weighted MAP@K: {weighted_map_at_k_score}")

Weighted MAP@K: 0.6125091903659611
