# The Scoring Logic - Development

# Setup

In [1]:
import pandas as pd
from pathlib import Path
import os
import sys
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")
from sentence_transformers import SentenceTransformer

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase, build_relationship_cols, build_financial_history, extract_areas, extract_classifications
from logic_utils import get_name_from_id, get_id_from_name, get_granularity_weight, check_if_parent, calculate_similarity_score

#get keys from env
#load_dotenv() is expected to populate the process environment from a local .env file, keeping credentials out of the notebook
#NOTE: os.getenv returns None when a variable is unset; url/key are only consumed by the Supabase retrieval cells
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

As with my EDA, I will connect to Supabase and retrieve all records. I will create one dataframe for funder information, and another for grants and recipients information. This will allow me to easily access funders' giving history, plus the classifications for both funders and recipients, to be used as part of the calculation of the alignment score.

In [None]:
#get tables and build dataframes
#each table is bound to a notebook-level variable of the same name via globals() (e.g. `funders`, `causes`)
#NOTE: funder_areas, funder_beneficiaries and funder_causes are later rebound to plain lists in Steps 5-7,
#so this cell must be re-run before rebuilding funders_df in the same kernel session
tables = ["funders", "causes", "areas", "beneficiaries", "grants",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "financials", "funder_financials",
               "embedding_pairs", "evaluation_pairs", "logic_pairs",
               "area_hierarchy"]

for table in tables:
    globals()[table] = get_table_from_supabase(url, key, table)

#get recipients with filter
recipients = get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True)
all_recipient_ids = set(recipients["recipient_id"].unique())

#get and filter recipient join tables
#only rows whose recipient_id is present in the filtered recipients table are kept
recipient_join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
for table in recipient_join_tables:
    df = get_table_from_supabase(url, key, table)
    globals()[table] = df[df["recipient_id"].isin(all_recipient_ids)]

## The Funders Dataframe

### Main Table

In [None]:
#work on a copy so the raw `funders` table stays untouched
funders_df = funders.copy()

#define table relationships for funders
#each entry names a join table, the lookup table it points at, the shared key,
#the lookup column holding the label, and the column name to create on funders_df
funder_rels = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#add relationship columns
#presumably one list-valued column per relationship, keyed on registered_num - TODO confirm in utils.build_relationship_cols
funders_df = build_relationship_cols(funders_df, "registered_num", funder_rels)

#round to 2 decimal places
#NOTE: set_option changes the float display format for the whole notebook session, not just this dataframe
funders_df = funders_df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

### Financial History Table

In [None]:
#attach financial history to each funder, keyed on registered_num
#presumably this produces the income_history / expenditure_history columns shown in the funder output further down - TODO confirm in utils
funders_df = build_financial_history(funders_df, "registered_num", funder_financials, financials)

### The List Entries

In [None]:
#get list entries
#funder_list links funders to list entries; merging on list_id attaches each entry's list_type
list_entries = get_table_from_supabase(url, key, "list_entries")
funder_list = get_table_from_supabase(url, key, "funder_list")
list_with_info = funder_list.merge(list_entries, on="list_id")

#get list of entries for each funder
#NOTE: the name `list_entries` is reused below as a column name holding the list_type values per funder
list_grouped = list_with_info.groupby("registered_num")["list_type"].apply(list).reset_index()
list_grouped.columns = ["registered_num", "list_entries"]

#merge with funders and replace nans
#the left merge leaves NaN for funders with no entries; these are normalised to empty lists
funders_df = funders_df.merge(list_grouped, on="registered_num", how="left")
funders_df["list_entries"] = funders_df["list_entries"].apply(lambda x: x if isinstance(x, list) else [])

In [None]:
#get checkpoint folder (relative to the notebook's working directory)
checkpoint_folder = Path("./10.1_checkpoints/")

#create checkpoint - save df to pickle
#the save is left commented out; uncomment to refresh the stored funders_df
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# print("Saved funders_df to checkpoint")

## The Grants Dataframe

### Main Table

In [None]:
#work on a copy so the raw `grants` table stays untouched
grants_df = grants.copy()

#add funder info
#merges without `how` are inner joins, so grants with no funder link or unknown funder are dropped
grants_df = grants_df.merge(funder_grants, on="grant_id")
grants_df = grants_df.merge(funders[["registered_num", "name"]], on="registered_num")
grants_df = grants_df.rename(columns={"name": "funder_name", "registered_num": "funder_num"})

#add recipient info
#inner join on recipient_grants (already filtered to kept recipients), then left join for recipient details
grants_df = grants_df.merge(recipient_grants, on="grant_id")
grants_df = grants_df.merge(recipients[["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives", 
                                        "recipient_name_em", "recipient_activities_em", "recipient_objectives_em", "recipient_concat_em", "is_recipient", "recipient_extracted_class"]], 
                        on="recipient_id", 
                        how="left")

#define relationships for recipients
#same structure as the funder relationships: join table, lookup table, key, label column, output column
recipient_rels = [
    {
        "join_table": recipient_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "recipient_areas"
    },
    {
        "join_table": recipient_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "recipient_causes"
    },
    {
        "join_table": recipient_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "recipient_beneficiaries"
    }
]

#add relationship columns
grants_df = build_relationship_cols(grants_df, "recipient_id", recipient_rels)

#add source of grant
#heuristic: grant ids starting with "2" are labelled "Accounts", everything else "360Giving"
grants_df["source"] = grants_df["grant_id"].apply(lambda x: "Accounts" if str(x).startswith("2") else "360Giving")

#round to 2 decimal places
grants_df = grants_df.round(2)

In [None]:
#create checkpoint - save df to pickle
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")
# print("Saved grants_df to checkpoint")

## The Pairs Dataframe

In [None]:
#start from a copy of the logic_pairs table so the enrichment below does not touch the raw table
pairs_df = logic_pairs.copy()

In [None]:
#merge to enrich with funder data
#left join keeps every pair; clashing column names coming from funders_df receive the "_funder" suffix
pairs_enriched = pairs_df.merge(
    funders_df,
    left_on="funder_registered_num",
    right_on="registered_num",
    how="left",
    suffixes=("", "_funder")
)

#drop duplicate col
pairs_enriched = pairs_enriched.drop("registered_num", axis=1)

#merge to enrich with recipient data
#grants_df has one row per grant, so it is reduced to one row per recipient_id before merging
pairs_enriched = pairs_enriched.merge(
    grants_df[["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives",
                "recipient_areas", "recipient_causes",
                "recipient_beneficiaries", "recipient_extracted_class"]].drop_duplicates(subset=["recipient_id"]),
    on="recipient_id",
    how="left"
)

pairs_df = pairs_enriched.copy()

In [None]:
#create checkpoint - save df to pickle
# pairs_df.to_pickle(checkpoint_folder / "pairs_df.pkl")
# print("Saved pairs_df to checkpoint")

## The Areas Dataframes

In [None]:
#get checkpoint folder
checkpoint_folder = Path("./10.1_checkpoints/")

#NOTE: with these lines commented out, areas_df and hierarchies_df are only ever defined by the
#checkpoint-loading cell; uncomment them (after fetching `areas` and `area_hierarchy`) to rebuild from Supabase
# areas_df = areas.copy()
# hierarchies_df = area_hierarchy.copy()

#create checkpoint - save dfs to pickle
# areas_df.to_pickle(checkpoint_folder / "areas_df.pkl")
# hierarchies_df.to_pickle(checkpoint_folder / "hierarchies_df.pkl")
# print("Saved areas_df and hierarchies_df to checkpoint")

---

# Retrieving Data from Checkpoints

In [2]:
#get checkpoint folder
checkpoint_folder = Path("./10.1_checkpoints/")

#get checkpoint
#these overwrite any dataframes built from Supabase earlier in the session
#NOTE: pickle files can execute code on load - only read checkpoints created by this project
funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")
pairs_df = pd.read_pickle(checkpoint_folder / "pairs_df.pkl")
areas_df = pd.read_pickle(checkpoint_folder / "areas_df.pkl")
hierarchies_df = pd.read_pickle(checkpoint_folder / "hierarchies_df.pkl")

----

# Preparation of User and Funder Data

## Users

I will use the data for the recipient at index 0 in `pairs_df` as a proxy for a real user's input, to simulate the functionality of the final artefact as I build the logic. 

First, the user will input information about their charity (the applicant), then embeddings will be created for the inputted text data. For the purposes of this development notebook, I will simulate the user's keyword input by using extracted classifications from recipients' data, but in the final artefact, the user will be asked to enter their own keywords.

In [3]:
#simulate user's input
#the first row of pairs_df stands in for a real applicant; .copy() gives user_df its own data so the
#column assignments that follow never target a frame derived from pairs_df by chained indexing
user_df = pairs_df.iloc[[0]][["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives", "recipient_areas", "recipient_causes", "recipient_beneficiaries"]].copy()
#registered number of the funder the simulated user wants to be scored against
user_df["funder_num"] = "1124856"

In [4]:
#save registered numbers
#user_num is the simulated applicant's recipient_id; funder_num is the funder chosen above
user_num = user_df["recipient_id"].iloc[0]
funder_num = user_df["funder_num"].iloc[0]

### Creation of Embeddings from User Input

In [5]:
#load the sentence-embedding model; the same `model` object is reused for keyword scoring in Step 9
model = SentenceTransformer("all-roberta-large-v1")
user_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#one embedding column per text column, named "<col>_em"
for col in user_cols:
    #replace nans with empty string
    texts = user_df[col].fillna("").tolist()
    embeddings = model.encode(texts)
    
    #add to df
    user_df[f"{col}_em"] = list(embeddings)

#join name, activities and objectives into a single space-separated string
user_df["concat_text"] = user_df[user_cols[0]].fillna("")
for col in user_cols[1:]:
    user_df["concat_text"] += " " + user_df[col].fillna("")

#make lowercase
#NOTE: only the concatenated text is lowercased; the per-column embeddings above use the original casing
user_df["concat_text"] = user_df["concat_text"].str.lower()

#create embeddings
texts = user_df["concat_text"].tolist()
embeddings = model.encode(texts)
user_df["concat_em"] = list(embeddings)

#drop concatenated text
user_df = user_df.drop(columns=["concat_text"])

#change recipient_ to user_
#columns without the "recipient_" prefix (e.g. funder_num, concat_em) keep their names
user_df = user_df.rename(columns=lambda col: f"user_{col[len('recipient_'):]}" if col.startswith("recipient_") else col)

### Creation of Extracted Classes from User Input

In [6]:
#load classifications data
#NOTE: the CSV is read from the `main` branch at run time, so results can change if the upstream file changes
ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
ukcat_df = pd.read_csv(ukcat_url)

#define elements to process
user_sections = ["user_name", "user_objectives", "user_activities"]
keyword_data = [(user_df, user_sections, "user")]
#seed an "extracted_class" column from the user's areas; it is dropped again once extraction is finished
#presumably extract_classifications reads this column - TODO confirm in utils
user_df["extracted_class"] = user_df["user_areas"].copy()

#extract classifications
#NOTE: the output column name is hard-coded to "user_extracted_class"; `name` is only used in the log message
for df, sections, name in keyword_data:
    start_time = time.time()
    df["user_extracted_class"] = df.apply(lambda row: extract_classifications(row, sections, ukcat_df, areas_df), axis=1)
    elapsed_time = time.time() - start_time
    print(f"Classification extraction complete for {name}. Total time: {elapsed_time:.2f}s")

Classification extraction complete for user. Total time: 0.08s


In [7]:
#capitalise extracted classifications and remove "grant making" in a single pass
#any value that is not a list becomes an empty list
user_df["user_extracted_class"] = user_df["user_extracted_class"].apply(
    lambda classifications: [
        phrase.upper()
        for phrase in classifications
        if phrase.upper() != "GRANT MAKING"
    ] if isinstance(classifications, list) else []
)

#drop extra column
user_df = user_df.drop(columns=["extracted_class"])

In [8]:
#show every column (session-wide display setting) so the embedding and classification columns are visible
pd.set_option("display.max_columns", None)
user_df.head()

Unnamed: 0,user_id,user_name,user_activities,user_objectives,user_areas,user_causes,user_beneficiaries,funder_num,user_name_em,user_activities_em,user_objectives_em,concat_em,user_extracted_class
0,328729,ASYLUM AID,THE PROVISION OF LEGAL ADVICE AND REPRESENTATI...,2. OBJECTS2.1 THE CHARITY IS ESTABLISHED FOR T...,[Throughout England And Wales],"[Education/training, The Prevention Or Relief ...",[People Of A Particular Ethnic Or Racial Origi...,1124856,"[-0.008021089, 0.01503942, -0.022991765, -0.02...","[-0.022860372, 0.029296849, -0.017596042, -0.0...","[0.016410686, 0.00963383, -0.04015383, -0.0136...","[-0.0008344314, -0.01810797, -0.0055338936, -0...","[THROUGHOUT ENGLAND AND WALES, ASYLUM SEEKERS ..."


## Funders

I will next build a dataframe for the funder selected by the user, and a separate dataframe to store details of previous grants given by, and recipients of, this funder.

In [9]:
#get funder data from number inputted by user
funder_df = funders_df[funders_df["registered_num"] == funder_num].copy()
funder_df.head()

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class,causes,areas,beneficiaries,income_history,expenditure_history,list_entries
257,1124856,ROSA FUND,https://www.rosauk.org,ROSA IS THE FIRST UK-WIDE FUND FOR WOMEN'S INI...,THE OBJECTS OF THE CHARITY ARE TO FURTHER ANY ...,1407453.0,1372296.0,,,,False,False,False,"[0.036068745,0.02428467,-0.026885081,-0.001224...","[0.005698055,0.011768709,-0.015513399,0.004063...","[-0.023482107,-0.02466424,0.0036000705,-0.0259...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[0.00031545621,0.014991585,-0.003386881,0.0022...","[""UK"",""WALES"",""GIRLS"",""WOMEN"",""CHARITY AND VCS...",[General Charitable Purposes],[Throughout England And Wales],"[Other Charities Or Voluntary Bodies, Other De...","{2020: 155612.0, 2021: 4478996.0, 2022: 237267...","{2020: 974678.0, 2021: 2118687.0, 2022: 266530...",[]


In [10]:
#get grants for selected funder
funder_grants_df = grants_df[grants_df["funder_num"] == funder_num].copy()
funder_grants_df.head(1)

Unnamed: 0,grant_title,grant_desc,amount,year,grant_id,source,grant_title_em,grant_desc_em,grant_concat_em,grant_extracted_class,funder_num,funder_grants_id,funder_name,recipient_id,recipient_grants_id,recipient_name,recipient_activities,recipient_objectives,recipient_name_em,recipient_activities_em,recipient_objectives_em,recipient_concat_em,is_recipient,recipient_extracted_class,recipient_areas,recipient_causes,recipient_beneficiaries
30403,,THE PROJECT WILL USE PART OF THE FUNDING TO PU...,22500.0,2024,360G-RosaUK 3260-01-169603259,360Giving,"[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.027255304,0.06465661,-0.018874627,-0.03348...","[-0.045676723,0.0343797,0.006716128,-0.0356264...",[],1124856,42699,ROSA FUND,invalid_2299,41509,HOLLOWAY NEIGHBOURHOOD GROUP,WE RUN THE OLD FIRE STATION - A BUSY COMMUNITY...,,"[-0.03290081,-0.003398436,-0.01662999,-0.01840...","[-0.03992139,-0.010387247,-0.04818863,-0.02297...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.057219584,-0.038814045,-0.013197874,-0.033...",True,"[""THE FINSBURY PARK AREA"",""SOCIAL CLUB"",""COMMU...",[],[],[]


----

# Binary Criteria (Stated Preferences)

## Step 1: Single-Beneficiary Funders

In [11]:
#check if funder has a single beneficiary
is_sbf = funder_df["is_potential_sbf"].iloc[0]
print(f"Funder has a single beneficiary: {is_sbf}")

Funder has a single beneficiary: False


## Step 2: No Unsolicited Applications

In [12]:
#check if funder states no unsolicited applications
is_nua = funder_df["is_nua"].iloc[0]
print(f"Funder states no unsolicited applications: {is_nua}")
print(f"NUA score: {is_nua * 1.0}")

Funder states no unsolicited applications: False
NUA score: 0.0


## Step 3: The List

In [29]:
#check if funder is on the list
def check_is_on_list(funder_df):

    is_on_list = funder_df["is_on_list"].iloc[0]
    reasoning = []

    if is_on_list:
        reasoning = set(funder_df["list_entries"].iloc[0])
    else:
        reasoning = None
        
    return is_on_list, reasoning

#run the check for the selected funder; list_reasoning is None when the funder is not on The List
is_on_list, list_reasoning = check_is_on_list(funder_df)
print(f"Funder is on The List: {is_on_list}")
print(f"Type of List entry: {list_reasoning}")

Funder is on The List: False
Type of List entry: None


## Step 4: Existing Relationship

In [14]:
#check if funder has ever given a grant to applicant
existing_relationship = False
relationship = grants_df[
    (grants_df["funder_num"] == funder_num) &
    (grants_df["recipient_id"] == user_num)
]
num_grants = len(relationship)

if num_grants > 0:
    existing_relationship = True

print(f"Funder and user have existing relationship: {existing_relationship}")
print(f"Funder has given {num_grants} grant(s) to user")

Funder and user have existing relationship: True
Funder has given 1 grant(s) to user


----

# Classification Criteria (Stated Preferences)

## Step 5: Areas

Scoring for areas uses hierarchical matching to account for parent-child geographic relationships. The granularity of each area affects its weight - specific locations like local authorities score higher (1.0) than broad regions (0.7). 

I will check three types of matches:
- Exact matches where both funder and user state that they work in the same area
- Hierarchical matches where the funder's area contains the user's (e.g., funder says "Throughout England", user works in "Bristol")
- Hierarchical matches where the user's area contains the funder's specific location (e.g., user works throughout "Africa", funder focuses on "Kenya"). 

Each match will be weighted differently to reflect the strength of the match. The final score will average only the matched areas, ignoring non-matches, so that having some high-quality geographic alignment is valued over penalising for coverage gaps.

In [15]:
def check_areas(funder_list, user_list, areas_df, hierarchies_df):
    """
    Calculates a score based on matches between the funder's and user's stated areas.

    Each user area is compared with the funder's areas in priority order:
      1. exact match - granularity weight of the user area x 1.0
      2. user area sits within a funder area - granularity weight of that funder area x 0.6
      3. a funder area sits within the user area - granularity weight of the user area x 0.4
      4. otherwise no match (0.0)

    Area names that cannot be resolved to an id are ignored.

    Returns a tuple (score, reasoning): score is the mean of the matched (non-zero)
    area scores, or 0.0 when nothing matched; reasoning has one message per resolved
    user area. Returns (0.0, []) when no user area can be resolved.
    """

    def _names_to_ids(names):
        #resolve each name once and skip names with no id
        ids = []
        for name in (names if names is not None else []):
            area_id = get_id_from_name(name, areas_df)
            if area_id is not None:
                ids.append(area_id)
        return ids

    #convert names to ids
    funder_ids = _names_to_ids(funder_list)
    user_ids = _names_to_ids(user_list)

    #avoid zero division
    if len(user_ids) == 0:
        return 0.0, []

    #store ids as set and scores/reasoning as lists
    funder_set = set(funder_ids)
    scores = []
    reasoning = []

    for user_area in user_ids:
        user_area_name = get_name_from_id(user_area, areas_df)

        #check for exact match
        if user_area in funder_set:
            scores.append(get_granularity_weight(user_area, areas_df) * 1.0)
            reasoning.append(f"Exact match: {user_area_name}")
            continue

        #check if user area is within funder area (first funder area that contains it)
        #compare with None rather than truthiness so an id of 0 still counts as a match
        containing_funder_area = next(
            (funder_area for funder_area in funder_ids
             if check_if_parent(funder_area, user_area, hierarchies_df)),
            None
        )
        if containing_funder_area is not None:
            parent_name = get_name_from_id(containing_funder_area, areas_df)
            scores.append(get_granularity_weight(containing_funder_area, areas_df) * 0.6)
            reasoning.append(f"Hierarchical match: {user_area_name} (user) within {parent_name} (funder)")
            continue

        #check if funder area is within user area (first funder area contained by it)
        contained_funder_area = next(
            (funder_area for funder_area in funder_ids
             if check_if_parent(user_area, funder_area, hierarchies_df)),
            None
        )
        if contained_funder_area is not None:
            child_name = get_name_from_id(contained_funder_area, areas_df)
            scores.append(get_granularity_weight(user_area, areas_df) * 0.4)
            reasoning.append(f"Hierarchical match: {child_name} (funder) within {user_area_name} (user)")
            continue

        #no match
        scores.append(0.0)
        reasoning.append(f"No match: {user_area_name}")

    #average only the matched areas so coverage gaps are not penalised
    matched_scores = [s for s in scores if s > 0]
    if len(matched_scores) > 0:
        score = sum(matched_scores) / len(matched_scores)
    else:
        score = 0.0

    return max(0.0, score), reasoning

#get lists
#NOTE: `funder_areas` rebinds the Supabase join table of the same name loaded in the data-retrieval cell
funder_areas = funder_df["areas"].iloc[0].copy()
user_areas = user_df["user_areas"].iloc[0].copy()

#get score and reasoning
areas_score, areas_reasoning = check_areas(funder_areas, user_areas, areas_df, hierarchies_df)
print(f"Areas score: {areas_score:.2f}\n\nReasoning:")
for reason in areas_reasoning:
    print(reason)

Areas score: 0.70

Reasoning:
Exact match: Throughout England And Wales


## Step 6: Beneficiaries

Scoring for beneficiaries is simpler than for areas. I will exclude the generic "Other Charities Or Voluntary Bodies" as it is likely that almost all funders will fall into this category, adding noise to the scoring. I will use a hierarchical scoring approach but without the granularity weighting, as the higher level categories in this classification are too broad to offer real value to the calculation.

In [16]:
def check_beneficiaries(funder_list, user_list):
    """
    Calculates a score based on matches between the funder's and user's stated beneficiaries.

    "Other Charities Or Voluntary Bodies" is ignored on both sides. A user beneficiary
    scores 1.0 when the funder names the same specific group, 0.2 when the funder only
    lists a broad category, and 0.0 otherwise.

    Returns (score, reasoning): score is the mean of the non-zero beneficiary scores
    (0.0 when there are none); reasoning has one message per user beneficiary considered.
    """

    broad_groups = {"Other Defined Groups", "The General Public/mankind"}
    ignored = {"Other Charities Or Voluntary Bodies"}

    #drop the generic category from both sides
    funder_kept = [b for b in funder_list if b not in ignored]
    user_kept = [b for b in user_list if b not in ignored]

    #nothing left to compare
    if not user_kept:
        return 0.0, []

    #split the funder's groups into specific ones and a "supports broad categories" flag
    specific_groups = {b for b in funder_kept if b not in broad_groups}
    funder_is_broad = any(b in broad_groups for b in funder_kept)

    def _rate(ben):
        #score one user beneficiary and explain why
        if ben in specific_groups:
            return 1.0, f"Exact match: {ben}"
        if funder_is_broad:
            return 0.2, f"Weak match: user states '{ben}' and funder supports broad categories"
        return 0.0, f"No match: {ben}"

    rated = [_rate(b) for b in user_kept]
    reasoning = [message for _, message in rated]

    #average the matches only
    positives = [value for value, _ in rated if value > 0]
    score = sum(positives) / len(positives) if positives else 0.0

    return max(0.0, score), reasoning

#get lists
#NOTE: `funder_beneficiaries` rebinds the Supabase join table of the same name loaded in the data-retrieval cell
funder_beneficiaries = funder_df["beneficiaries"].iloc[0].copy()
user_beneficiaries = user_df["user_beneficiaries"].iloc[0].copy()

#get score and reasoning
beneficiaries_score, beneficiaries_reasoning = check_beneficiaries(funder_beneficiaries, user_beneficiaries)
print(f"Beneficiaries score: {beneficiaries_score:.2f}\n\nReasoning:")
for reason in beneficiaries_reasoning:
    print(reason)

Beneficiaries score: 0.20

Reasoning:
Weak match: user states 'People Of A Particular Ethnic Or Racial Origin' and funder supports broad categories
Weak match: user states 'Other Defined Groups' and funder supports broad categories


## Step 7: Causes

For causes, I will exclude "Other Charitable Purposes" as it adds noise. However, I will not exclude "General Charitable Purposes" (GCP) as this is used by funders to indicate that they would be willing to consider any causes. I will use it as a fallback similar to how "Throughout England" works for areas. 

The scoring checks for exact matches between the user's and funder's causes first, which score 1.0. If no exact match exists but the funder lists GCP, this scores 0.6 as a weak indicator that the funder might support the cause. Non-matches score 0.0.

In [17]:
def check_causes(funder_list, user_list):
    """
    Calculates a score based on matches between the funder's and user's stated causes.

    "Other Charitable Purposes" is ignored on both sides. A user cause scores 1.0 when
    the funder names the same cause, 0.6 when the funder only offers
    "General Charitable Purposes" as a catch-all, and 0.0 otherwise.

    Returns (score, reasoning): score is the mean of the non-zero cause scores
    (0.0 when there are none); reasoning has one message per user cause considered.
    """
    general_purposes = "General Charitable Purposes"
    ignored = {"Other Charitable Purposes"}

    #drop the noisy category from both sides
    funder_kept = [c for c in funder_list if c not in ignored]
    user_kept = [c for c in user_list if c not in ignored]

    #nothing left to compare
    if not user_kept:
        return 0.0, []

    #separate the funder's named causes from the catch-all
    named_causes = {c for c in funder_kept if c != general_purposes}
    accepts_any_cause = general_purposes in funder_kept

    def _rate(cause):
        #score one user cause and explain why
        if cause in named_causes:
            return 1.0, f"Exact match: {cause}"
        if accepts_any_cause:
            return 0.6, f"Weak match: user states '{cause}' and funder supports general charitable purposes"
        return 0.0, f"No match: {cause}"

    rated = [_rate(c) for c in user_kept]
    reasoning = [message for _, message in rated]

    #average the matches only
    positives = [value for value, _ in rated if value > 0]
    score = sum(positives) / len(positives) if positives else 0.0

    return max(0.0, score), reasoning

#get lists
#NOTE: `funder_causes` rebinds the Supabase join table of the same name loaded in the data-retrieval cell
funder_causes = funder_df["causes"].iloc[0].copy()
user_causes = user_df["user_causes"].iloc[0].copy()

#get score and reasoning
causes_score, causes_reasoning = check_causes(funder_causes, user_causes)
print(f"Causes score: {causes_score:.2f}\n\nReasoning:")
for reason in causes_reasoning:
    print(reason)

Causes score: 0.60

Reasoning:
Weak match: user states 'Education/training' and funder supports general charitable purposes
Weak match: user states 'The Prevention Or Relief Of Poverty' and funder supports general charitable purposes


-----

# Semantic Similarity (Stated Preferences)

## Step 8: Text Sections

In [18]:
#get embeddings
funder_embedding = funder_df["concat_em"].iloc[0]
user_embedding = user_df["concat_em"].iloc[0]

#get score
text_similarity_score = calculate_similarity_score(funder_embedding, user_embedding)
print(f"Text semantic similarity score: {text_similarity_score:.2f}")

Text semantic similarity score: 0.53


## Step 9: Keywords

In [19]:
def check_keywords(funder_keywords, user_keywords, model, strong_threshold=0.80, top_n=10):
    """
    Calculates semantic similarity between funder (extracted) and user (inputted) keywords.

    Every distinct funder keyword is compared with every distinct user keyword.
    Pairs scoring at or above strong_threshold are reported separately as strong
    matches and trigger the bonus flag; the score is the mean of the top_n pairs
    below the threshold.

    Parameters:
        funder_keywords: list of keywords, or a JSON string encoding such a list.
        user_keywords: list of keywords, or a JSON string encoding such a list.
        model: object with an encode(text) method returning an embedding.
        strong_threshold: similarity at or above which a pair counts as a strong match.
        top_n: number of below-threshold pairs averaged into the score.

    Returns a tuple (score, strong_matches, reasoning, gets_bonus):
        score: mean similarity of the top_n below-threshold pairs (never below 0.0).
        strong_matches: dict mapping "funder & user" to similarity for strong pairs.
        reasoning: one formatted line per pair that contributed to the score.
        gets_bonus: True when at least one strong match exists.
    """

    def _as_keyword_list(value):
        #accept JSON strings, lists, and missing values (None / NaN / blank string)
        if isinstance(value, str):
            if not value.strip():
                return []
            value = json.loads(value)
        if value is None or isinstance(value, float):
            return []
        return list(value)

    #parse json and handle empty/nans
    funder_keywords = _as_keyword_list(funder_keywords)
    user_keywords = _as_keyword_list(user_keywords)

    if len(funder_keywords) == 0 or len(user_keywords) == 0:
        return 0.0, {}, ["No keywords to compare"], False

    #create embeddings for each keyword (dict keys also remove duplicate keywords)
    funder_keywords_em = {keyword: model.encode(keyword) for keyword in funder_keywords}
    user_keywords_em = {keyword: model.encode(keyword) for keyword in user_keywords}

    #compare every funder keyword to every user keyword
    all_scores = [
        {
            "funder_keyword": funder_kw,
            "user_keyword": user_kw,
            "similarity": calculate_similarity_score(funder_em, user_em)
        }
        for funder_kw, funder_em in funder_keywords_em.items()
        for user_kw, user_em in user_keywords_em.items()
    ]

    #sort best first, then split on the strong-match threshold
    all_scores.sort(key=lambda x: x["similarity"], reverse=True)
    strong = [match for match in all_scores if match["similarity"] >= strong_threshold]
    below_threshold = [match for match in all_scores if match["similarity"] < strong_threshold]

    #check for bonus and get dictionary of strong matches
    gets_bonus = len(strong) > 0
    strong_matches = {
        f"{match['funder_keyword']} & {match['user_keyword']}": match["similarity"]
        for match in strong
    }

    #average the top matches strictly below the threshold
    top_matches = below_threshold[:top_n]
    if len(top_matches) > 0:
        score = sum(match["similarity"] for match in top_matches) / len(top_matches)
    else:
        score = 0.0

    #build reasoning from exactly the matches that were averaged
    reasoning = [
        f"'{match['funder_keyword']}' & '{match['user_keyword']}': {match['similarity']:.3f}"
        for match in top_matches
    ]

    return max(0.0, score), strong_matches, reasoning, gets_bonus

#get keyword lists
#the funder's keywords may arrive as a JSON string (check_keywords parses strings with json.loads);
#the user's are the upper-cased list built earlier
funder_keywords = funder_df["extracted_class"].iloc[0]
user_keywords = user_df["user_extracted_class"].iloc[0]

#get score
#reuses the sentence-transformer `model` loaded when the user embeddings were created
keyword_similarity_score, keyword_strong_matches, keyword_reasoning, keyword_gets_bonus = check_keywords(funder_keywords, user_keywords, model)

print(f"Keyword similarity score: {keyword_similarity_score:.2f}")
print(f"Eligible for keyword bonus: {keyword_gets_bonus}")


Keyword similarity score: 0.44
Eligible for keyword bonus: True


-----

# Semantic Similarity (Revealed Preferences)

## Step 10: Name Score

In [20]:
def check_name_rp(recipients_embedding_dict, user_embedding, user_name):
    """
    Calculates semantic similarity between the user's name and the names of the funder's previous recipients.

    recipients_embedding_dict maps recipient name to name embedding. The entry whose
    name equals user_name is skipped so the user is never compared with itself.

    Returns (score, reasoning): score is the mean similarity of the ten most similar
    recipients (0.0 when there are none, never below 0.0); reasoning lists those
    recipients as "name: similarity".
    """

    #compare every other recipient's name to the user's name
    scored = [
        {
            "recipient_name": name,
            "similarity": calculate_similarity_score(embedding, user_embedding)
        }
        for name, embedding in recipients_embedding_dict.items()
        if name != user_name
    ]

    #keep the ten most similar recipients
    best = sorted(scored, key=lambda m: m["similarity"], reverse=True)[:10]

    #average them, guarding against an empty list
    score = sum(m["similarity"] for m in best) / len(best) if best else 0.0

    #one explanatory line per recipient that contributed to the score
    reasoning = [f"{m['recipient_name']}: {m['similarity']:.3f}" for m in best]

    return max(0.0, score), reasoning

#get embeddings
#build a name -> name-embedding lookup; a recipient with several grants appears once (later rows overwrite earlier ones)
recipients_name_all_em = dict(zip(funder_grants_df["recipient_name"], funder_grants_df["recipient_name_em"]))
user_name_em = user_df["user_name_em"].iloc[0]
user_name = user_df["user_name"].iloc[0]

#get score
#user_name is passed so the user's own entry is excluded from the comparison
name_score_rp, name_rp_reasoning = check_name_rp(recipients_name_all_em, user_name_em, user_name)
print(f"Name (RP) similarity score: {name_score_rp:.2f}\n\nReasoning:")
for reason in name_rp_reasoning:
    print(reason)

Name (RP) similarity score: 0.45

Reasoning:
AAWAZ: 0.513
WELSH WOMEN'S AID: 0.482
SOLACE WOMEN'S AID: 0.479
SOUTHAMPTON WOMEN'S AID: 0.472
JUNO WOMEN'S AID: 0.447
APNA HAQ: 0.443
SIKH WOMEN'S AID: 0.434
GUIDE ASSOCIATION: 0.426
CARDIFF WOMEN'S AID: 0.421
EVA WOMENS AID LTD: 0.420


## Step 11: Grants Score

In [21]:
def check_grants_rp(grants_embedding_dict, user_embedding, user_name):
    """
    Calculates semantic similarity between the user's text sections and the funder's previous grants.

    grants_embedding_dict maps the grant recipient's name to the grant's embedding.
    Grants whose recipient name equals user_name are skipped.

    Returns (score, reasoning): score is the mean similarity of the ten most similar
    grants (0.0 when there are none, never below 0.0); reasoning lists those grants
    as "recipient name: similarity".
    """

    #score every grant that was not awarded to the user
    scored = []
    for grant_recipient, grant_embedding in grants_embedding_dict.items():
        if grant_recipient == user_name:
            continue
        scored.append((grant_recipient, calculate_similarity_score(grant_embedding, user_embedding)))

    #ten most similar grants, best first
    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:10]

    #average them, guarding against an empty list
    if best:
        score = sum(similarity for _, similarity in best) / len(best)
    else:
        score = 0.0

    #one explanatory line per grant that contributed to the score
    reasoning = [f"{grant_recipient}: {similarity:.3f}" for grant_recipient, similarity in best]

    return max(0.0, score), reasoning

#get embeddings
#keep only grants that have a non-empty title or a non-empty description
non_empty_grants = funder_grants_df[
    (funder_grants_df["grant_title"].notna() & (funder_grants_df["grant_title"] != "")) |
    (funder_grants_df["grant_desc"].notna() & (funder_grants_df["grant_desc"] != ""))
]

#NOTE(review): the dict is keyed by recipient name, so when a recipient has several grants
#only the last grant's embedding survives - confirm this is intended for a per-grant comparison
grants_all_em = dict(zip(non_empty_grants["recipient_name"], non_empty_grants["grant_concat_em"]))
#user values are read from the first row of user_df
user_concat_em = user_df["concat_em"].iloc[0]
user_name = user_df["user_name"].iloc[0]

#get score
#returns the mean similarity of the top 10 grants plus one reasoning line per grant
grants_rp_score, grants_rp_reasoning = check_grants_rp(grants_all_em, user_concat_em, user_name)
print(f"Grants (RP) similarity score: {grants_rp_score:.2f}\n\nReasoning:")
for reason in grants_rp_reasoning:
    print(reason)

Grants (RP) similarity score: 0.57

Reasoning:
REFUGEE WOMEN'S ASSOCIATION: 0.617
ANGELS OF HOPE FOR WOMEN: 0.594
REFUGEE WOMEN CONNECT: 0.588
RAPE CRISIS ENGLAND & WALES: 0.582
TRANSFORM FORTH VALLEY: 0.558
TRANSACTUAL: 0.556
APNA GHAR: 0.547
BAWSO LTD: 0.545
WOMEN FOR REFUGEE WOMEN: 0.541
TEEN ACTION: 0.540


## Step 12: Recipients Score

In [22]:
def check_recipients_rp(recipients_embedding_dict, user_embedding, user_name):
    """
    Scores how closely the user's text matches the text of the funder's previous recipients.

    Each entry of recipients_embedding_dict (recipient name -> embedding) is compared with
    user_embedding through calculate_similarity_score; the entry keyed by user_name is skipped.

    Returns a tuple of:
        - the mean similarity of the 10 best matches, floored at 0.0 (0.0 when nothing is compared)
        - a list of "recipient name: similarity" strings for those matches, best first
    """

    #pair each other recipient with its similarity to the user
    others = (
        (name, embedding)
        for name, embedding in recipients_embedding_dict.items()
        if name != user_name
    )
    ranked = [(name, calculate_similarity_score(embedding, user_embedding)) for name, embedding in others]

    #strongest first, then keep the leading 10
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    leaders = ranked[:10]

    #nothing to compare against
    if not leaders:
        return 0.0, []

    mean_similarity = sum(similarity for _, similarity in leaders) / len(leaders)
    reasoning = [f"{name}: {similarity:.3f}" for name, similarity in leaders]

    return max(0.0, mean_similarity), reasoning

#get embeddings
#lookup of recipient name -> recipient text embedding; dict keys are unique, so a recipient
#that appears on several grant rows is kept once (the last row's embedding wins)
recipients_all_em = dict(zip(funder_grants_df["recipient_name"], funder_grants_df["recipient_concat_em"]))
#user values are read from the first row of user_df
user_concat_em = user_df["concat_em"].iloc[0]
user_name = user_df["user_name"].iloc[0]

#get score
#returns the mean similarity of the top 10 recipients plus one reasoning line per recipient
recipients_rp_score, recipients_rp_reasoning = check_recipients_rp(recipients_all_em, user_concat_em, user_name)
print(f"Recipients (RP) similarity score: {recipients_rp_score:.2f}\n\nReasoning:")
for reason in recipients_rp_reasoning:
    print(reason)

Recipients (RP) similarity score: 0.66

Reasoning:
VOICE OF DOMESTIC WORKERS: 0.717
SUFFOLK REFUGEE SUPPORT: 0.699
WOMEN FOR REFUGEE WOMEN: 0.695
AFRICAN WOMEN'S CARE: 0.667
SAFETY4SISTERS NORTH WEST: 0.664
LESBIAN IMMIGRATION SUPPORT GROUP: 0.656
BASIS YORKSHIRE LTD: 0.624
MOTHER AND CHILD WELFARE ORGANISATION: 0.623
SANDWELL AFRICAN WOMEN ASSOCIATION: 0.618
ETHIOPIAN WOMEN'S EMPOWERMENT GROUP: 0.617


----

# Penalties and Bonuses (Stated Preferences)

## Step 13: SBF Penalty

In [23]:
#multiplier of 0.2 when is_sbf (defined outside this cell) is truthy, otherwise 1.0 (no change)
sbf_penalty = 0.2 if is_sbf else 1.0
print(f"Single-beneficiary penalty: *{sbf_penalty}")

Single-beneficiary penalty: *1.0


## Step 14: Keywords Bonus

In [24]:
def calculate_keywords_bonus(strong_matches, ukcat_df):
    """
    Calculates a bonus multiplier from the strong keyword matches.

    Each match score is weighted by the UKCAT level of its tag (level 1: 0.3, level 2: 0.7,
    level 3: 1.0, any other level: 1.0; keywords not found in ukcat_df: 0.3). The weighted
    scores are averaged and mapped onto a multiplier.

    Args:
        strong_matches: dict of keyword -> match score.
        ukcat_df: dataframe with 'tag' and 'level' columns; tags are compared case-insensitively.

    Returns:
        A multiplier clamped to the range 1.1-1.3, or 1.0 (no bonus) when strong_matches is empty.
    """

    #no strong matches means no bonus; averaging an empty list would divide by zero
    if not strong_matches:
        return 1.0

    #weight by specificity of ukcat level
    level_weights = {
        1: 0.3,
        2: 0.7,
        3: 1.0
    }

    #build the upper-cased tag -> level lookup once; the first row wins for duplicated tags
    level_by_tag = {}
    for tag, level in zip(ukcat_df['tag'], ukcat_df['level']):
        if isinstance(tag, str):
            level_by_tag.setdefault(tag.upper(), level)

    #NOTE(review): the keys printed by this notebook look like "HEALTH & MENTAL HEALTH" pairs;
    #a key of that shape will not equal a single tag, so it always takes the 0.3 fallback - confirm
    weighted_scores = []
    for keyword, score in strong_matches.items():
        tag_key = keyword.upper()
        if tag_key in level_by_tag:
            weight = level_weights.get(level_by_tag[tag_key], 1.0)
        else:
            #keyword is not a known tag
            weight = 0.3
        weighted_scores.append(score * weight)

    avg_weighted = sum(weighted_scores) / len(weighted_scores)

    #calculate bonus and clamp it to the allowed range
    bonus = 1.1 + (avg_weighted * 0.2)
    bonus = min(max(bonus, 1.1), 1.3)

    return bonus

#get bonus and reasoning
#keyword_strong_matches (keyword -> score) and ukcat_df are defined outside this cell
keywords_bonus = calculate_keywords_bonus(keyword_strong_matches, ukcat_df)

#list every strong match with its score, then the resulting multiplier
print(f"{len(keyword_strong_matches)} strong keyword matches:")
for match, score in keyword_strong_matches.items():
    print(f"{match}: {score:.2f}")
print(f"Keywords bonus: *{keywords_bonus:.2f}")

2 strong keyword matches:
HEALTH & HEALTH: 1.00
HEALTH & MENTAL HEALTH: 0.82
Keywords bonus: *1.15


## Step 15: Existing Relationship Bonus

In [25]:
def calculate_relationship_bonus(relationship_df, current_year=None):
    """
    Calculates a bonus multiplier from the time since the funder's last grant to the user.

    Args:
        relationship_df: dataframe of previous grants with a 'year' column.
        current_year: year to measure from; defaults to the current calendar year.

    Returns:
        Tuple of (time_lapsed, bonus, last_grant_year). The bonus is 1.5 for a lapse of up to
        2 years, 1.4 up to 3, 1.3 up to 5, 1.2 up to 10 and 1.1 beyond that, plus 0.1 when
        there are 5 or more grant rows. When there is no usable year (empty dataframe or
        all years missing) the result is (None, 1.0, None), i.e. no bonus.
    """

    #without a known grant year there is no relationship to reward; a missing year would
    #otherwise fail every band comparison and still collect the 1.1 bonus
    years = relationship_df["year"].dropna()
    if years.empty:
        return None, 1.0, None

    #get time lapsed since last gift
    last_grant_year = int(years.max())
    if current_year is None:
        current_year = datetime.now().year
    time_lapsed = current_year - last_grant_year

    #assign bands: (maximum lapse in years, bonus), checked from most to least recent
    bands = ((2, 1.5), (3, 1.4), (5, 1.3), (10, 1.2))
    bonus = next((band_bonus for max_lapse, band_bonus in bands if time_lapsed <= max_lapse), 1.1)

    #add uplift for recurring relationship
    num_grants = len(relationship_df)
    if num_grants >= 5:
        bonus += 0.1

    return time_lapsed, bonus, last_grant_year

#get bonus and reasoning
#relationship is defined outside this cell; the function returns (years lapsed, multiplier, last grant year)
time_lapsed, relationship_bonus, last_grant_year = calculate_relationship_bonus(relationship)

print(f"Last gift given in {last_grant_year} ({time_lapsed} years ago)")
print(f"Relationship bonus: *{relationship_bonus}")

Last gift given in 2013 (12 years ago)
Relationship bonus: *1.1


-----

# Penalties and Bonuses (Revealed Preferences)

## Step 16: Areas Bonus

In [26]:
def calculate_areas_bonus_rp(funder_grants_df, user_areas, areas_df, hierarchies_df):
    """
    Calculates a bonus multiplier from how well the user's areas match the areas of the
    funder's previous recipients.

    Args:
        funder_grants_df: grants dataframe with a 'recipient_areas' column holding lists of area names.
        user_areas: the user's areas, passed straight to check_areas.
        areas_df: areas table, passed to check_areas, get_id_from_name and get_granularity_weight.
        hierarchies_df: area hierarchy table, passed to check_areas.

    Returns:
        Tuple of (bonus, reasoning). bonus is 1.0 + 0.2 * the match score from check_areas,
        or the neutral multiplier 1.0 when there is no grant history or no area data.
        reasoning lists up to 10 of the most frequent areas whose granularity weight is at
        least 0.9, or a single explanatory message.
    """

    #the result is a multiplier, so "nothing to compare" is 1.0 (no change) rather than 0.0
    if funder_grants_df.empty:
        return 1.0, ["No grants history available"]

    #flatten recipient areas, ignoring rows that do not hold a list
    all_areas = [
        area_name
        for areas_list in funder_grants_df["recipient_areas"]
        if isinstance(areas_list, list)
        for area_name in areas_list
    ]

    if len(all_areas) == 0:
        return 1.0, ["No area data available"]

    #count mentions per area; dict order is first-seen order, which keeps the unique list
    #reproducible between runs (set ordering of strings is not)
    mention_counts = {}
    for area_name in all_areas:
        mention_counts[area_name] = mention_counts.get(area_name, 0) + 1

    recipient_areas = list(mention_counts)

    #check areas
    match_score, _ = check_areas(recipient_areas, user_areas, areas_df, hierarchies_df)

    #convert to bonus multiplier
    bonus = 1.0 + (match_score * 0.2)

    #get reasoning from top 10 (low level tiers only); each unique area is looked up once
    area_count = {}
    for area_name, count in mention_counts.items():
        area_id = get_id_from_name(area_name, areas_df)
        if area_id:
            granularity = get_granularity_weight(area_id, areas_df)
            if granularity >= 0.9:
                area_count[area_name] = count

    if len(area_count) == 0:
        reasoning = ["Only broad geographic areas found"]
    else:
        sorted_areas = sorted(area_count.items(), key=lambda x: x[1], reverse=True)
        total_low_level = sum(area_count.values())

        #percentages are shares of the low-level area mentions only
        reasoning = [
            f"{area_name}: {count} grants ({(count / total_low_level) * 100:.1f}%)"
            for area_name, count in sorted_areas[:10]
        ]

    return bonus, reasoning

#get list
#work on a copy of the user's areas list taken from the first row of user_df
user_areas = user_df["user_areas"].iloc[0].copy()

#get bonus and reasoning
areas_rp_bonus, areas_rp_reasoning = calculate_areas_bonus_rp(funder_grants_df, user_areas, areas_df, hierarchies_df)
#NOTE(review): the label below says "score", but the value printed is the bonus multiplier
print(f"Areas score: {areas_rp_bonus:.2f}\n\nReasoning:")
for reason in areas_rp_reasoning:
    print(reason)

Areas score: 1.14

Reasoning:
Manchester City: 23 grants (2.2%)
Newcastle Upon Tyne City: 21 grants (2.0%)
Scotland: 21 grants (2.0%)
Birmingham City: 20 grants (1.9%)
South Tyneside: 19 grants (1.8%)
Rochdale: 17 grants (1.6%)
Gateshead: 17 grants (1.6%)
Northern Ireland: 16 grants (1.5%)
Sandwell: 16 grants (1.5%)
Haringey: 15 grants (1.4%)


## Step 17: Keywords Bonus

In [27]:
def _parse_keyword_list(raw_keywords):
    """
    Returns raw_keywords as a list: JSON strings are decoded, lists/tuples/sets are copied,
    and anything else (None, NaN, blank strings) becomes an empty list.
    """
    if isinstance(raw_keywords, str):
        raw_keywords = json.loads(raw_keywords) if raw_keywords.strip() else None
    if isinstance(raw_keywords, (list, tuple, set)):
        return list(raw_keywords)
    return []


def calculate_keywords_bonus_rp(funder_grants_df, user_keywords):
    """
    Calculates a bonus based on exact keyword matches between user and funder's recipients.

    Args:
        funder_grants_df: grants dataframe with a 'recipient_extracted_class' column holding
            keyword lists or JSON-encoded keyword lists.
        user_keywords: the user's keywords as a list or a JSON-encoded list.

    Returns:
        Tuple of (bonus, reasoning). bonus is 1.1 when at least 90% of the user's distinct
        keywords appear among the recipients' keywords, 1.05 when at least 50% do, and
        otherwise 1.0 + 0.1 * match share, so the bonus never falls as the share rises.
        It is 1.0 when there is nothing to compare. reasoning lists up to 10 matched
        keywords with their occurrence counts, most frequent first, or a single message.
    """

    if funder_grants_df.empty:
        return 1.0, ["No grants history available"]

    #parse json and drop repeats so a duplicated keyword is neither double counted
    #nor allowed to dilute the match percentage
    user_keywords = list(dict.fromkeys(_parse_keyword_list(user_keywords)))

    if len(user_keywords) == 0:
        return 1.0, ["No user keywords to match"]

    #count every recipient keyword once up front instead of rescanning a list per user keyword
    recipient_keyword_counts = {}
    for recipient_keywords in funder_grants_df["recipient_extracted_class"]:
        for keyword in _parse_keyword_list(recipient_keywords):
            recipient_keyword_counts[keyword] = recipient_keyword_counts.get(keyword, 0) + 1

    if len(recipient_keyword_counts) == 0:
        return 1.0, ["No recipient keywords available"]

    #find exact matches and their frequency, in the user's keyword order
    matched_keywords = {
        user_kw: recipient_keyword_counts[user_kw]
        for user_kw in user_keywords
        if user_kw in recipient_keyword_counts
    }

    #calculate match percentage
    match_percentage = len(matched_keywords) / len(user_keywords)

    #calculate bonus
    if match_percentage >= 0.9:
        bonus = 1.1
    elif match_percentage >= 0.5:
        bonus = 1.05
    else:
        #scaled by 0.1 so this band tops out just below the 1.05 awarded at 50%
        bonus = 1.0 + (match_percentage * 0.1)

    #build reasoning from top 10
    if len(matched_keywords) == 0:
        reasoning = ["No exact keyword matches found"]
    else:
        sorted_matches = sorted(matched_keywords.items(), key=lambda x: x[1], reverse=True)
        reasoning = [f"{keyword}: {count} occurrences" for keyword, count in sorted_matches[:10]]

    return bonus, reasoning

#get list
#user keywords are read from the first row of user_df
user_keywords = user_df["user_extracted_class"].iloc[0]

#get bonus and reasoning
#returns the multiplier plus one reasoning line per matched keyword
keywords_rp_bonus, keywords_rp_reasoning = calculate_keywords_bonus_rp(funder_grants_df, user_keywords)
print(f"Keywords (RP) bonus: *{keywords_rp_bonus:.2f}\n\nReasoning:")
for reason in keywords_rp_reasoning:
    print(reason)

Keywords (RP) bonus: *1.10

Reasoning:
EDUCATION: 321 occurrences
INDIVIDUAL POVERTY: 251 occurrences
HEALTH: 247 occurrences
TRAINING: 219 occurrences
ADVICE AND INDIVIDUAL ADVOCACY: 183 occurrences
ABUSE: 157 occurrences
POLICY CAMPAIGNING AND ADVOCACY: 110 occurrences
MENTAL HEALTH: 96 occurrences
ASYLUM SEEKERS AND REFUGEES: 73 occurrences
MIGRANTS: 52 occurrences


## Step 18: Low-Variance Penalty

In [28]:
def calculate_lv_penalty(funder_grants_df):
    """
    Returns a penalty multiplier for funders whose grants go to a narrow pool of recipients.

    Funders with fewer than 10 grants are never penalised. Otherwise the share of unique
    recipient names among all grants is measured: below 0.3 the multiplier is 0.7,
    otherwise it is 1.0.
    """

    grant_count = len(funder_grants_df)

    #too little giving history to judge variety
    if grant_count < 10:
        return 1.0

    #unique recipients as a share of all grants
    distinct_recipients = funder_grants_df['recipient_name'].nunique()
    recipient_share = distinct_recipients / grant_count

    #penalise only when the same few recipients dominate
    return 0.7 if recipient_share < 0.3 else 1.0

#get penalty
#multiplier is 0.7 when the funder's giving is concentrated on few recipients, otherwise 1.0
lv_penalty = calculate_lv_penalty(funder_grants_df)
print(f"Low variance (RP) penalty: *{lv_penalty:.2f}")

Low variance (RP) penalty: *1.00
