# Evaluation

# Setup

In [1]:
import pandas as pd
from pathlib import Path
import os
import sys
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")
from sentence_transformers import SentenceTransformer

#put the parent directory on sys.path so the two local imports below can be resolved from it
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase, build_relationship_cols, build_financial_history, extract_classifications
from evaluation_utils import get_recipients_by_id

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

#os.getenv returns None for unset variables - fail here with a clear message
#instead of letting the first supabase call fail with an unrelated-looking error
missing_env = [name for name, value in (("SUPABASE_URL", url), ("SUPABASE_KEY", key)) if not value]
if missing_env:
    raise EnvironmentError(f"Missing environment variable(s): {', '.join(missing_env)}. Set them in the .env file or the shell environment.")

----

# Retrieving Data from Checkpoints

In [2]:
#folder holding the pickled dataframes
checkpoint_folder = Path("./13.1_checkpoints/")

#load every checkpoint in one pass; the order of file names matches the order of the targets
funders_df, grants_df, areas_df, hierarchies_df, eval_df = (
    pd.read_pickle(checkpoint_folder / file_name)
    for file_name in (
        "funders_df.pkl",
        "grants_df.pkl",
        "areas_df.pkl",
        "hierarchies_df.pkl",
        "eval_df.pkl",
    )
)

-----

# Retrieving Data from Supabase and Building Dataframes

I have imported the evaluation pairs dataframe that was built as part of the logic development, but it is missing data for rows where `is_recipient=False`. This is due to the way the database was built: these charities had not been linked to any grants from the 360Giving API or extracted from accounts, so they were not fed through the entire database-building process. I will run the same embedding and classification extraction processes on these rows that were used for the user preparation in prior notebooks, to ensure that the evaluation funder-recipient pairs have all the data necessary to calculate their alignment scores.

In [3]:
#collect the distinct recipient ids that appear in the evaluation pairs
recipient_ids = eval_df.loc[:, "recipient_id"].unique()

#fetch the matching recipient records
recipients_df = get_recipients_by_id(url, key, recipient_ids)

### Creation of Embeddings

In [4]:
#model used to embed the recipient text columns
model = SentenceTransformer("all-roberta-large-v1")
recipient_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#work on a copy of the rows flagged is_recipient == False so recipients_df is left untouched
#(== False is kept deliberately: rows with a missing flag are not selected)
recipients_false = recipients_df[recipients_df["is_recipient"] == False].copy()
recipients_false["recipient_concat_em"] = None

if len(recipients_false) > 0:
    #one embedding per text column, stored as <column>_em
    for col in recipient_cols:
        #replace nans with empty string
        texts = recipients_false[col].fillna("").tolist()
        #list(...) turns the encoder output into one entry per row
        recipients_false[f"{col}_em"] = list(model.encode(texts))

    #join the text columns with single spaces; built in a local series so no
    #temporary column has to be added to and dropped from the dataframe
    concat_text = recipients_false[recipient_cols[0]].fillna("")
    for col in recipient_cols[1:]:
        concat_text = concat_text + " " + recipients_false[col].fillna("")

    #make lowercase and create embeddings for the concatenated text
    texts = concat_text.str.lower().tolist()
    recipients_false["recipient_concat_em"] = list(model.encode(texts))

    #names of all embedding columns created above
    #(the former loop that re-assigned each of these columns to itself was a no-op,
    #because every row in recipients_false already has is_recipient == False)
    embedding_cols = [f"{col}_em" for col in recipient_cols] + ["recipient_concat_em"]

### Creation of Extracted Classes

In [5]:
#read the ukcat classifications csv
ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
ukcat_df = pd.read_csv(ukcat_url)

#text columns to scan, paired with the dataframe and a label for the log line
recipient_sections = ["recipient_name", "recipient_objectives", "recipient_activities"]
keyword_data = [(recipients_false, recipient_sections, "recipient")]

#temporary column holding a copy of recipient_areas
recipients_false["extracted_class"] = recipients_false["recipient_areas"].copy()

#run the extraction row by row and report how long it took
for frame, text_cols, label in keyword_data:
    started = time.time()
    frame["recipient_extracted_class"] = frame.apply(
        extract_classifications,
        axis=1,
        args=(text_cols, ukcat_df, areas_df),
    )
    elapsed_time = time.time() - started
    print(f"Classification extraction complete for {label}. Total time: {elapsed_time:.2f}s")

Classification extraction complete for recipient. Total time: 0.28s


In [8]:
def _clean_classifications(classifications):
    """Return the classifications upper-cased with "GRANT MAKING" removed.

    Any value that is not a list is replaced by an empty list.
    """
    if not isinstance(classifications, list):
        return []
    upper_phrases = [phrase.upper() for phrase in classifications]
    return [phrase for phrase in upper_phrases if phrase != "GRANT MAKING"]

#capitalise extracted classifications and remove "grant making" in a single pass
recipients_false["recipient_extracted_class"] = recipients_false["recipient_extracted_class"].apply(_clean_classifications)

#drop extra column; errors="ignore" keeps this cell re-runnable once the column is already gone
recipients_false = recipients_false.drop(columns=["extracted_class"], errors="ignore")

### The Final Evaluation Pairs Dataframe

In [9]:
#swap the rows that were just processed into recipients_df: remove their old
#versions, then append the processed rows
was_reprocessed = recipients_df["recipient_id"].isin(recipients_false["recipient_id"])
recipients_df = pd.concat([recipients_df[~was_reprocessed], recipients_false], ignore_index=True)

#old recipient cols in eval_df that the merge will replace
recipient_cols = [col for col in eval_df.columns if col.startswith("recipient_") and col != "recipient_id"]

#any other column shared with recipients_df (e.g. is_recipient after a previous run of
#this cell) must go too, otherwise the merge produces _x/_y suffixed duplicates
overlap_cols = [
    col for col in eval_df.columns
    if col in recipients_df.columns and col not in recipient_cols and col != "recipient_id"
]

#drop old cols and add new to eval_df; the left join keeps every evaluation pair
eval_df = eval_df.drop(columns=recipient_cols + overlap_cols)
eval_df = eval_df.merge(recipients_df, on="recipient_id", how="left")

In [11]:
#show every column when displaying dataframes
pd.options.display.max_columns = None

#preview the first evaluation pair
eval_df.iloc[:1]

Unnamed: 0,id,funder_registered_num,recipient_id,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class,causes,areas,beneficiaries,income_history,expenditure_history,list_entries,recipient_name,recipient_activities,is_recipient,recipient_objectives,recipient_name_em,recipient_activities_em,recipient_objectives_em,recipient_concat_em,recipient_extracted_class,recipient_areas,recipient_causes,recipient_beneficiaries
0,1,298633,1207372,BACON CHARITABLE TRUST,,MAKING DONATIONS TO SUCH CHARITABLE INSTITUTIO...,TO APPLY THE INCOME AND ALL OR SUCH PART OR PA...,72879.0,72879.0,,,,False,False,False,"[0.004460917,0.035826597,-0.01591086,-0.011070...","[-0.0064024967,0.037508424,-0.0025785645,-0.02...","[-0.024516284,0.022869943,-0.032995474,-0.0348...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.021681184,0.0020920401,0.009623636,-0.0578...","[""GRANT MAKING""]","[General Charitable Purposes, Environment/cons...","[Lincolnshire, Norfolk]",[Other Charities Or Voluntary Bodies],"{2020: 23696.0, 2021: 27920.0, 2022: 242325.0,...","{2020: 23696.0, 2021: 27920.0, 2022: 242325.0,...",[],CATTON GROVE COMMUNITY CENTRE CIO,CATTON GROVE COMMUNITY CENTRE SUPPORTS A WIDE ...,False,THE OBJECTS OF THE CIO ARE TO FURTHER BENEFIT ...,"[0.006319203, 0.0067654094, -0.029557364, -0.0...","[0.0027825288, -0.043152414, 0.016655156, -0.0...","[-0.011579754, 0.0077819657, -0.030523552, -0....","[-0.02640823, 0.0003374529, 0.019409541, -0.04...","[NORFOLK, COMMUNITY ASSOCIATION, COMMUNITY DEV...",[Norfolk],[Recreation],"[Children/young People, Elderly/old People, Pe..."
