# Embedding Model Selection

# Setup

In [None]:
import pandas as pd
import numpy as np
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
colours = sns.color_palette("Set2")
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")
from model_tester import test_embedding_models

#put the parent directory on the import path so the utils module can be imported
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

#read the Supabase credentials from the environment after load_dotenv()
load_dotenv()
url, key = (os.getenv(name) for name in ("SUPABASE_URL", "SUPABASE_KEY"))

## Recipient Objectives

Until this point, the database was limited to recipient names and activities. I have chosen to extend it to include recipient objectives, which should enrich the data and provide an additional text field to test during embedding model selection. I have updated the database builders so that objectives can be imported from the start in future iterations, and I have also created `2_recipients_table_builder/recipient_objectives_importer.py` so that I do not have to rebuild the entire database at this point in the project.

----

# Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve the relevant records in order to isolate the text data that I will embed to select the best model. For this purpose, I will use the 12 funder-recipient pairs that I have curated for my evaluation app.

In [None]:
#fetch the curated evaluation pairs and collect the distinct ids on each side
evaluation_pairs = get_table_from_supabase(url, key, "evaluation_pairs")
funder_ids = evaluation_pairs["funder_registered_num"].unique()
eval_recip_ids = evaluation_pairs["recipient_id"].unique()

#my own rating for each pair, keyed by pair id (1-12), to compare against model scores later
ratings = dict(enumerate(
    [0.25, 0.60, 0.25, 0.25, 0.10, 0.60, 0.60, 0.25, 0.25, 0.25, 0.80, 0.30],
    start=1,
))
evaluation_pairs["my_rating"] = evaluation_pairs["id"].map(ratings)

In [None]:
#pull both tables and keep only the rows that appear in the evaluation pairs
funders = get_table_from_supabase(url, key, "funders")
funders = funders.loc[funders["registered_num"].isin(funder_ids)]

recipients = get_table_from_supabase(url, key, "recipients")
recipients = recipients.loc[recipients["recipient_id"].isin(eval_recip_ids)]

#work on independent copies, each with an empty embeddings column
funders_df = funders.copy()
recipients_df = recipients.copy()
for frame in (funders_df, recipients_df):
    frame["embeddings"] = None

#attach the funder name (as funder_name) and the recipient name to each pair
evaluation_pairs = (
    evaluation_pairs
    .merge(
        funders_df[["registered_num", "name"]],
        left_on="funder_registered_num",
        right_on="registered_num",
        how="left",
    )
    .rename(columns={"name": "funder_name"})
    .drop(columns=["registered_num"])
    .merge(
        recipients_df[["recipient_id", "recipient_name"]],
        on="recipient_id",
        how="left",
    )
)

In [None]:
#check dfs - print the (rows, columns) shape of each dataframe as a quick sanity check
print(f"Recipients: {recipients_df.shape} | Funders: {funders_df.shape} | Evaluation Pairs: {evaluation_pairs.shape}")

In [None]:
#create checkpoint - save dfs to pickle
# recipients_df.to_pickle("recipients_df.pkl")
# funders_df.to_pickle("funders_df.pkl")
# evaluation_pairs.to_pickle("evaluation_pairs.pkl")

In [None]:
#restore the three dataframes from their pickle checkpoint files
recipients_df, funders_df, evaluation_pairs = map(
    pd.read_pickle,
    ("recipients_df.pkl", "funders_df.pkl", "evaluation_pairs.pkl"),
)

----

# Embedding Model Evaluation by Sections

I have chosen to test four models on my small evaluation dataset: 
- `all-MiniLM-L6-v2`
- `snowflake-arctic-embed-s`
- `all-roberta-large-v1`
- `bge-large-en-v1.5` 

Snowflake's model is fine-tuned primarily for retrieval, but I have decided to include it because it has performed so strongly on benchmarks and against major competitors (Merrick et al., 2024). The other models are consistent with the analysis by Pavlyshenko and Stasiuk (2025), who found these architectures to be reliable on semantic similarity tasks.

In [None]:
#identifiers of the four candidate embedding models, in the order they are tested
models = [
    "all-MiniLM-L6-v2",
    "Snowflake/snowflake-arctic-embed-s",
    "all-roberta-large-v1",
    "BAAI/bge-large-en-v1.5",
]

## Test 1 - Activities Only

In [None]:
#build the text column for each side from the activities field only:
#missing values become empty strings and everything is lowercased
for frame, target, source in (
    (recipients_df, "recipients_text", "recipient_activities"),
    (funders_df, "funders_text", "activities"),
):
    frame[target] = frame[source].fillna("").str.lower()

results_act = []

In [None]:
#test models
#runs every model in `models` for Test 1 (activities only) and keeps both returned objects
#NOTE(review): presumably the helper reads the funders_text / recipients_text columns set in the previous cell - confirm in model_tester
results_act_df, pairs_act = test_embedding_models(
    models_list=models,
    funders_df=funders_df,
    recipients_df=recipients_df,
    evaluation_pairs=evaluation_pairs
)

In [None]:
#show the results table returned for the activities-only test
results_act_df

In [None]:
#view pairs with scores from each model (activities-only test)
pairs_act

## Test 2 - Objectives Only

In [None]:
#build the text column for each side from the objectives field only:
#missing values become empty strings and everything is lowercased
for frame, target, source in (
    (recipients_df, "recipients_text", "recipient_objectives"),
    (funders_df, "funders_text", "objectives"),
):
    frame[target] = frame[source].fillna("").str.lower()

results_obj = []

In [23]:
#test models
#runs every model in `models` for Test 2 (objectives only) and keeps both returned objects
#NOTE(review): presumably the helper reads the funders_text / recipients_text columns set in the previous cell - confirm in model_tester
results_obj_df, pairs_obj = test_embedding_models(
    models_list=models,
    funders_df=funders_df,
    recipients_df=recipients_df,
    evaluation_pairs=evaluation_pairs
)

In [None]:
#show the results table returned for the objectives-only test
results_obj_df

In [None]:
#show the pairs returned for the objectives-only test
pairs_obj

## Test 3 - Activities and Objectives (API Only)

In [None]:
#prepare text columns
#Join activities and objectives with a single space so the last word of one
#field does not fuse with the first word of the next, and lowercase the WHOLE
#combined string. The parentheses matter: a method call binds tighter than +,
#so without them .str.lower() only applies to the final field, leaving the
#activities text in its original case (inconsistent with Tests 1 and 2).
#.str.strip() removes the stray separator when one of the fields is empty.
recipients_df["recipients_text"] = (
    recipients_df["recipient_activities"].fillna("")
    + " "
    + recipients_df["recipient_objectives"].fillna("")
).str.lower().str.strip()
funders_df["funders_text"] = (
    funders_df["activities"].fillna("")
    + " "
    + funders_df["objectives"].fillna("")
).str.lower().str.strip()

#placeholder only; the name is rebound to the results table when the models are tested
results_ao_api = []

In [None]:
#test models
#runs every model in `models` for Test 3 (activities and objectives, API only) and keeps both returned objects
#NOTE(review): presumably the helper reads the funders_text / recipients_text columns set in the previous cell - confirm in model_tester
results_ao_api, pairs_ao_api = test_embedding_models(
    models_list=models,
    funders_df=funders_df,
    recipients_df=recipients_df,
    evaluation_pairs=evaluation_pairs
)

In [None]:
#show the results returned for Test 3 (activities and objectives, API only)
results_ao_api

In [None]:
#show the pairs returned for Test 3 (activities and objectives, API only)
pairs_ao_api

## Test 4 - Activities and Objectives (Extracted and API)

In [None]:
#prepare text columns
#Join the fields with a single space so the last word of one field does not
#fuse with the first word of the next, and lowercase the WHOLE combined
#string. The parentheses matter: a method call binds tighter than +, so
#without them .str.lower() only applies to the final field and the earlier
#fields keep their original case (inconsistent with Tests 1 and 2).
#.str.strip() removes a leading/trailing separator left by empty fields.
recipients_df["recipients_text"] = (
    recipients_df["recipient_activities"].fillna("")
    + " "
    + recipients_df["recipient_objectives"].fillna("")
).str.lower().str.strip()
#funders additionally include the objectives_activities column in this test
funders_df["funders_text"] = (
    funders_df["activities"].fillna("")
    + " "
    + funders_df["objectives"].fillna("")
    + " "
    + funders_df["objectives_activities"].fillna("")
).str.lower().str.strip()

#placeholder only; the name is rebound to the results table when the models are tested
results_ao_ext = []

In [None]:
#test models
#runs every model in `models` for Test 4 (activities and objectives, extracted and API) and keeps both returned objects
#NOTE(review): presumably the helper reads the funders_text / recipients_text columns set in the previous cell - confirm in model_tester
results_ao_ext, pairs_ao_ext = test_embedding_models(
    models_list=models,
    funders_df=funders_df,
    recipients_df=recipients_df,
    evaluation_pairs=evaluation_pairs
)

In [None]:
#show the results returned for Test 4 (activities and objectives, extracted and API)
results_ao_ext

In [None]:
#show the pairs returned for Test 4 (activities and objectives, extracted and API)
pairs_ao_ext

In [None]:
#make scatterplots: one panel per model, my rating (x) against the model's similarity score (y)
#NOTE(review): expects evaluation_pairs to hold one "<model>_sim" column per model - confirm
#that test_embedding_models adds these columns to the frame it is given
panels = [
    ("all-MiniLM-L6-v2_sim", "MiniLM"),
    ("Snowflake/snowflake-arctic-embed-s_sim", "Snowflake Arctic Embed"),
    ("all-roberta-large-v1_sim", "Roberta"),
    #label corrected: the plotted model is bge-large-en-v1.5, not BGE-M3
    ("BAAI/bge-large-en-v1.5_sim", "BGE Large"),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

#the x values are the same for every panel, so look them up and sort them once
my_ratings = evaluation_pairs["my_rating"]
sorted_ratings = my_ratings.sort_values()

for idx, (model_col, title) in enumerate(panels):
    ax = axes[idx]
    scores = evaluation_pairs[model_col]
    ax.scatter(my_ratings, scores, alpha=0.6, s=100, color=colours[idx])

    #show line of best fit
    z = np.polyfit(my_ratings, scores, 1)
    p = np.poly1d(z)
    ax.plot(sorted_ratings, p(sorted_ratings), "r-", alpha=0.5, linewidth=2)

    #Pearson correlation computed directly from the plotted points. This replaces a
    #lookup on `results_df`, a name that is never defined in this notebook, and it
    #guarantees the title describes exactly the data shown in the panel.
    #NOTE(review): the results tables may report a different statistic - confirm
    correlation = my_ratings.corr(scores)

    ax.set_xlabel("My Rating")
    ax.set_ylabel("Model Similarity Score")
    ax.set_title(f"{title}\nCorrelation: {correlation:.3f}")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()