# Embedding Model Selection

# Setup

In [106]:
import pandas as pd
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
colours = sns.color_palette("Set2")
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

-----

# Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve the relevant records, in order to isolate the text data that I will use to embed and select the best model. For this purpose, I will use the 12 funder-recipient pairs that I have curated for my evaluation app.

In [111]:
#get evaluation pairs and extract ids
evaluation_pairs = get_table_from_supabase(url, key, "evaluation_pairs")
funder_ids = evaluation_pairs["funder_registered_num"].unique()
eval_recip_ids = evaluation_pairs["recipient_id"].unique()

#add my ratings to compare later
ratings = {
    1: 0.25,
    2: 0.60,
    3: 0.25,
    4: 0.25,
    5: 0.10,
    6: 0.80,
    7: 0.60,
    8: 0.25,
    9: 0.25,
    10: 0.50,
    11: 0.80,
    12: 0.30
}
evaluation_pairs["my_rating"] = evaluation_pairs["id"].map(ratings)

In [112]:
#get the 12 funders and recipients
funders = get_table_from_supabase(url, key, "funders")
funders = funders[funders["registered_num"].isin(funder_ids)]

recipients = get_table_from_supabase(url, key, "recipients")
recipients = recipients[recipients["recipient_id"].isin(eval_recip_ids)]

#create dataframes
recipients_df = recipients.copy()
funders_df = funders.copy()

#add embedding columns to funder/recipient dfs
funders_df["embeddings"] = None
recipients_df["embeddings"] = None

#add funder and recipient names to pairs df
evaluation_pairs = evaluation_pairs.merge(
    funders_df[["registered_num", "name"]],
    left_on="funder_registered_num",
    right_on="registered_num",
    how="left"
)
evaluation_pairs = evaluation_pairs.rename(columns={"name": "funder_name"})
evaluation_pairs = evaluation_pairs.drop(columns=["registered_num"])
evaluation_pairs = evaluation_pairs.merge(
    recipients_df[["recipient_id", "recipient_name"]],
    on="recipient_id",
    how="left"
)

In [113]:
#check dfs
print(f"Recipients: {recipients_df.shape} | Funders: {funders_df.shape} | Evaluation Pairs: {evaluation_pairs.shape}")

Recipients: (12, 5) | Funders: (12, 14) | Evaluation Pairs: (12, 6)


----

# Embeddings Creation

In [114]:
models = ["all-MiniLM-L6-v2", "Snowflake/snowflake-arctic-embed-s"]
results = []

#concat columns and replace NAs with empty strings
recipients_df["recipients_text"] = recipients_df["recipient_activities"].fillna("").str.lower()
funders_df["funders_text"] = funders_df["activities"].fillna("").str.lower()

In [115]:
#test both models
for model_name in models:
    model = SentenceTransformer(model_name)
    
    #make embeddings
    funders_ems = model.encode(funders_df["funders_text"].tolist())
    recipients_ems = model.encode(recipients_df["recipients_text"].tolist())
    
    #calculate similarities
    similarities = []
    for i in range(len(funders_df)):
        similarity = util.cos_sim(funders_ems[i], recipients_ems[i]).item()
        similarities.append(similarity)
    
    evaluation_pairs[f"{model_name}_sim"] = similarities
    
    #calculate correlation with my ratings
    correlation = evaluation_pairs["my_rating"].corr(evaluation_pairs[f"{model_name}_sim"])
    
    results.append({
        "model": model_name,
        "correlation": correlation
    })

#make dfs
evaluation_pairs = evaluation_pairs[["funder_name", "recipient_name","my_rating", 
                "all-MiniLM-L6-v2_sim", 
                "Snowflake/snowflake-arctic-embed-s_sim"]]
results_df = pd.DataFrame(results)

In [116]:
#view pairs with scores from each model
evaluation_pairs

Unnamed: 0,funder_name,recipient_name,my_rating,all-MiniLM-L6-v2_sim,Snowflake/snowflake-arctic-embed-s_sim
0,BACON CHARITABLE TRUST,CATTON GROVE COMMUNITY CENTRE CIO,0.25,0.206052,0.584426
1,BEAVERBROOK FOUNDATION,FAITH IN LATER LIFE LTD,0.6,0.152812,0.752805
2,JESSIE SPENCER TRUST,COMBAT STRESS,0.25,0.178033,0.72273
3,GRUT TRUST,NORTHERN CANCER VOICES,0.25,0.331746,0.713746
4,JOHN WHIPPY FOUNDATION,ANIMAL RESCUE CYMRU,0.1,0.243448,0.644499
5,TESLER FOUNDATION,SEVENTH HEAVEN,0.6,0.328495,0.704759
6,FRIENDS OF FAWLEY CHURCH,"BERKSHIRE, BUCKINGHAMSHIRE AND OXFORDSHIRE WIL...",0.25,0.163585,0.617711
7,3 TS CHARITABLE TRUST,WELDMAR HOSPICECARE,0.25,0.206725,0.674962
8,DAVID AND RUTH BEHREND FUND,FREEDOM FROM TORTURE,0.5,0.144311,0.629727
9,MRS WATERHOUSE CHARITABLE TRUST,KIDZ KLUB - LEEDS,0.8,0.487164,0.716023


In [117]:

results_df

Unnamed: 0,model,correlation
0,all-MiniLM-L6-v2,0.629824
1,Snowflake/snowflake-arctic-embed-s,0.299794
