In [38]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import InputExample, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
import torch

from src.data import load_data
from src.evaluation import naive_map_at_k
from src.fine_tuning import create_product_dict, create_query_dict, create_relevance_dict, prepare_data

In [2]:
# Define variables that will be used in the notebook
DATA_PATH: Path = Path("../data")
SEED: int = 1399

#### Load data

In [3]:
# Load all the dataframes
query_df: pd.DataFrame
product_df: pd.DataFrame
label_df: pd.DataFrame
query_df, product_df, label_df= load_data(datapath=DATA_PATH)

QueryDF: Rows [480], Columns: [3]
ProductDF: Rows [42,994], Columns: [9]
LabelDF: Rows [233,448], Columns: [4]


#### Load model

In [36]:
model_id: str = "Snowflake/snowflake-arctic-embed-l"
model: SentenceTransformer = SentenceTransformer(model_name_or_path=model_id, device="mps", model_kwargs={"torch_dtype": torch.float})

#### Prepare data

In [22]:
# Create queries (query_id, query), corpus (product_id, product_name) and relevant_documents (exact matches) dictionaries.
queries, corpus, relevant_documents = prepare_data(query_df=query_df, product_df=product_df, label_df=label_df)
len(queries), len(corpus), len(relevant_documents)

(480, 42994, 379)

In [23]:
# Split train, test
train_queries, test_queries = train_test_split(list(queries.keys()), test_size=0.2, random_state=SEED)
train_queries_dict, test_queries_dict = {k: queries[k] for k in train_queries}, {k: queries[k] for k in test_queries}
train_relevant_documents, test_relevant_documents = {k: relevant_documents[k] for k in train_queries if k in relevant_documents}, {k: relevant_documents[k] for k in test_queries if k in relevant_documents}
len(train_queries), len(test_queries)

(384, 96)

In [24]:
# Create train input examples
examples = []
for query_id, query in train_queries_dict.items():
    if query_id in train_relevant_documents:
        for product_id in train_relevant_documents[query_id]:
            examples.append(InputExample(texts=[query, corpus[product_id]]))

In [25]:
# Create Dataloader
BATCH_SIZE: int = 64
loader = DataLoader(dataset=examples, batch_size=BATCH_SIZE)

In [26]:
# Loss
loss = losses.MultipleNegativesRankingLoss(model)

In [27]:
# Evaluator
evaluator = InformationRetrievalEvaluator(queries=queries, corpus=corpus, relevant_docs=relevant_documents, show_progress_bar=True)

In [29]:
EPOCHS = 5
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
    use_amp=True
)

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [19]:
from sentence_transformers import InputExample

def create_training_examples(queries: Dict[str, str], corpus: Dict[str, str], relevant_docs: Dict[str, List[str]]) -> List[InputExample]:
    examples = []
    for query_id, query in queries.items():
        if query_id in relevant_docs:
            for product_id in relevant_docs[query_id]:
                examples.append(InputExample(texts=[query, corpus[product_id]], label=1.0))
            
            # Add negative examples
            negative_samples = set(corpus.keys()) - set(relevant_docs[query_id])
            for neg_product_id in list(negative_samples)[:len(relevant_docs[query_id])]:  # 1:1 ratio of positive to negative samples
                examples.append(InputExample(texts=[query, corpus[neg_product_id]], label=0.0))
    
    return examples

In [20]:
def prepare_data_for_fine_tuning(query_df: pd.DataFrame, product_df: pd.DataFrame, label_df: pd.DataFrame) -> Tuple[List[InputExample], Dict[str, str], Dict[str, str], Dict[str, List[str]], Dict[str, str], Dict[str, str], Dict[str, List[str]]]:
    # Prepare the data
    queries, corpus, relevant_docs = prepare_data(query_df, product_df, label_df)
    
    # Split the data
    train_queries, train_corpus, train_relevant_docs, test_queries, test_corpus, test_relevant_docs = split_data(queries, corpus, relevant_docs)
    
    # Create training examples
    train_examples = create_training_examples(train_queries, train_corpus, train_relevant_docs)
    
    return train_examples, train_queries, train_corpus, train_relevant_docs, test_queries, test_corpus, test_relevant_docs

# Usage
train_examples, train_queries, train_corpus, train_relevant_docs, test_queries, test_corpus, test_relevant_docs = prepare_data_for_fine_tuning(query_df, product_df, label_df)

## Fine tuning

In [17]:
# Function to create InputExamples
def create_examples(row):
    query = row['query_id']
    product = row['product_id']
    label = 1.0 if row['label'] == 'Exact' else 0.5 if row['label'] == 'Partial' else 0.0
    return InputExample(texts=[query, product], label=label)

# Create training examples
train_examples = label_df.apply(create_examples, axis=1).tolist()

In [21]:
label_df["label"].value_counts()

label
Partial       146633
Irrelevant     61201
Exact          25614
Name: count, dtype: int64

In [23]:
product_df["product_id"].nunique()

42994

In [27]:
label_df.groupby("query_id")["product_id"].nunique()

query_id
0      3302
1      1436
2       165
3       997
4       239
       ... 
483     248
484      53
485     255
486      87
487     946
Name: product_id, Length: 480, dtype: int64

In [14]:
label_df[label_df["label"] == "Exact"].groupby(by="query_id")["product_id"].nunique()

query_id
0       39
1        8
2      151
3       55
4       23
      ... 
478      1
481      1
482     12
483      4
487      2
Name: product_id, Length: 379, dtype: int64

In [7]:
BATCH_SIZE: int = 64

## Generate the embeddings

#### Generate queries embeddings

In [5]:
queries_embeddings: np.ndarray = model.encode(
    sentences=query_df["query"].to_list(),
    batch_size=32,
    show_progress_bar=True
)

Batches: 100%|██████████| 15/15 [00:03<00:00,  4.70it/s]


In [6]:
queries_embeddings.shape

(480, 1024)

#### Generating product names embeddings

In [7]:
product_names_embeddings: np.ndarray = model.encode(
    sentences=product_df["product_name"].tolist(),
    batch_size=32,
    show_progress_bar=True
)

Batches: 100%|██████████| 1344/1344 [08:01<00:00,  2.79it/s]


In [8]:
product_names_embeddings.shape

(42994, 1024)

## Calculate cosine similarities

In [9]:
# Compute cosine similarities for all queries
cosine_similarities: np.ndarray = cosine_similarity(X=queries_embeddings, Y=product_names_embeddings)
cosine_similarities.shape

(480, 42994)

In [10]:
# Get top 10 predictions for each query
top_10_indices: np.ndarray = np.argpartition(a=cosine_similarities, kth=-10, axis=1)[:, -10:]
top_10_indices.shape

(480, 10)

In [11]:
# Sort the top 10 predictions by similarity score
top_10_sorted: np.ndarray = np.array([row[np.argsort(-cosine_similarities[i, row])] for i, row in enumerate(top_10_indices)])
top_10_sorted.shape

(480, 10)

## Calculate MAP@10

In [12]:
# Get preds and actual values 
query_df["preds"] = top_10_sorted.tolist()
query_df = query_df.merge(
    right=label_df[label_df["label"] == "Exact"].groupby(by="query_id")["product_id"].unique().rename("actuals"),
    left_on="query_id",
    right_index=True
)

In [13]:
# Calculate naive map@10
query_df['map@k'] = query_df.apply(lambda x: naive_map_at_k(x['actuals'], x['preds'], k=10), axis=1)
query_df.loc[:, 'map@k'].mean()

0.4069180534033216