# Embedding Model Selection

# Setup

In [58]:
import pandas as pd
import os
import sys
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")

#put the parent directory on the import path, ahead of the local `utils` import that follows
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

#fail fast with a clear message: os.getenv returns None for an unset variable,
#which would otherwise only surface later inside the Supabase helper
missing_vars = [
    var_name
    for var_name, var_value in (("SUPABASE_URL", url), ("SUPABASE_KEY", key))
    if not var_value
]
if missing_vars:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

# Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve the relevant records in order to isolate the text data that I will embed when comparing candidate models and selecting the best one. For this purpose, I will use the 12 funder-recipient pairs that I have curated for my evaluation app.

In [59]:
#load the evaluation pairs, then pull out the distinct ids on each side of the pairing
evaluation_pairs = get_table_from_supabase(url, key, "evaluation_pairs")
funder_ids, eval_recip_ids = (
    evaluation_pairs[id_col].unique()
    for id_col in ("funder_registered_num", "recipient_id")
)

In [60]:
#funders: keep only the ones referenced by the evaluation pairs
funders = get_table_from_supabase(url, key, "funders")
is_eval_funder = funders["registered_num"].isin(funder_ids)
funders = funders.loc[is_eval_funder]

#funder-grant link rows belonging to those funders
funder_grants = get_table_from_supabase(url, key, "funder_grants")
eval_funder_grants = funder_grants.loc[funder_grants["registered_num"].isin(funder_ids)]

#grants: keep only the ones linked to the evaluation funders
grant_ids = eval_funder_grants["grant_id"].unique()
grants = get_table_from_supabase(url, key, "grants")
grants = grants.loc[grants["grant_id"].isin(grant_ids)]

In [61]:
#get recipients - join table first
recipient_grants = get_table_from_supabase(url, key, "recipient_grants")
eval_recip_grants = recipient_grants[recipient_grants["grant_id"].isin(grant_ids)]

#fetch the recipients table once and reuse it for both subsets below
#(previously the same table was downloaded twice with identical arguments)
recipients_table = get_table_from_supabase(url, key, "recipients")

#get all recipients of 12 funders' grants
all_recipient_ids = eval_recip_grants["recipient_id"].unique()
all_recipients = recipients_table[recipients_table["recipient_id"].isin(all_recipient_ids)]

#get chosen 12 recipients
recipients = recipients_table[recipients_table["recipient_id"].isin(eval_recip_ids)]

In [62]:
#take independent copies of the filtered tables to use as working frames
recipients_df, funders_df, grants_df = (
    table.copy() for table in (recipients, funders, grants)
)

In [63]:
#add funder and recipient names to grants
#start from `grants` (not `grants_df`) so re-running this cell never merges onto
#columns that a previous run already added (which would create _x/_y suffixes)
grants_df = (
    grants
    .merge(eval_funder_grants[["grant_id", "registered_num"]], on="grant_id", how="left")
    .merge(funders[["registered_num", "name"]], on="registered_num", how="left")
    .rename(columns={"name": "funder_name"})
    .merge(eval_recip_grants[["grant_id", "recipient_id"]], on="grant_id", how="left")
    .merge(all_recipients[["recipient_id", "recipient_name"]], on="recipient_id", how="left")
)

#filter to columns to be embedded
#.copy() makes each frame own its data, so adding columns to it later does not
#trigger pandas' SettingWithCopyWarning
recipients_df = recipients[["recipient_id", "recipient_name", "recipient_activities"]].copy()
funders_df = funders[["registered_num", "name", "activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]].copy()
grants_df = grants_df[["grant_id", "grant_title", "grant_desc", "registered_num", "funder_name", "recipient_id", "recipient_name"]].copy()

In [65]:
#add an empty "<column>_em" placeholder (None) next to each listed text column
embedding_targets = (
    (recipients_df, ("recipient_name", "recipient_activities")),
    (funders_df, ("name", "activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy")),
    (grants_df, ("grant_title", "grant_desc", "recipient_name")),
)
for frame, text_cols in embedding_targets:
    for col in text_cols:
        frame[col + "_em"] = None

In [66]:
#sanity-check the (rows, columns) shape of the three working frames
shape_summary = f"Recipients: {recipients_df.shape} | Funders: {funders_df.shape} | Grants: {grants_df.shape}"
print(shape_summary)

Recipients: (12, 5) | Funders: (12, 13) | Grants: (727, 10)


In [67]:
#preview the first recipient row, including the still-empty *_em columns
recipients_df.head(n=1)

Unnamed: 0,recipient_id,recipient_name,recipient_activities,recipient_name_em,recipient_activities_em
3714,1194238,KIDZ KLUB - LEEDS,KIDZ KLUB LEEDS EXISTS TO PROMOTE AND WORK TO ...,,


In [68]:
#preview the first funder row, including the still-empty *_em columns
funders_df.head(n=1)

Unnamed: 0,registered_num,name,activities,objectives,objectives_activities,achievements_performance,grant_policy,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em
11,298633,BACON CHARITABLE TRUST,MAKING DONATIONS TO SUCH CHARITABLE INSTITUTIO...,TO APPLY THE INCOME AND ALL OR SUCH PART OR PA...,,,,,,,,,


In [69]:
#preview the first grant row, including the still-empty *_em columns
grants_df.head(n=1)

Unnamed: 0,grant_id,grant_title,grant_desc,registered_num,funder_name,recipient_id,recipient_name,grant_title_em,grant_desc_em,recipient_name_em
0,2023_219289_82,,,219289,JESSIE SPENCER TRUST,PDF-002796,ADDINGTON FRAMEWORK KNITTERS' MUSEUM,,,
