# The Scoring Logic

# Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
import time
import json
from datetime import datetime
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")
from sentence_transformers import SentenceTransformer, util

#make the parent of the current working directory importable so the local helper modules (utils, logic_utils) resolve
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase, build_relationship_cols, build_financial_history
from logic_utils import extract_classifications, get_name_from_id, get_id_from_name, get_granularity_weight, check_if_parent

#get keys from env
#os.getenv returns None for a variable that is not set, so url/key are None if the environment is not configured
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

As with my EDA, I will connect to Supabase and retrieve all records. I will create one dataframe for funder information, and another for grants and recipients information. This will allow me to easily access funders' giving history, plus the classifications for both funders and recipients, to be used as part of the calculation of the alignment score.

In [2]:
#get tables and build dataframes
#each name in this list becomes a module-level variable of the same name (see the globals() assignment in the loop below)
tables = ["funders", "causes", "areas", "beneficiaries", "grants",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "financials", "funder_financials",
               "embedding_pairs", "evaluation_pairs", "logic_pairs",
               "area_hierarchy"]

for table in tables:
    #bind the fetched table to a global named after it, e.g. `funders`, `grants`, `area_hierarchy`
    globals()[table] = get_table_from_supabase(url, key, table)

#get recipients with filter
#NOTE(review): batch_size and filter_recipients are interpreted inside get_table_from_supabase (utils) - their exact effect is not visible in this notebook
recipients = get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True)
#ids of the retrieved recipients, used below to restrict the recipient join tables
all_recipient_ids = set(recipients["recipient_id"].unique())

#get and filter recipient join tables
recipient_join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
for table in recipient_join_tables:
    df = get_table_from_supabase(url, key, table)
    #keep only rows whose recipient_id is in the filtered recipients table, then bind to a global named after the table
    globals()[table] = df[df["recipient_id"].isin(all_recipient_ids)]

## The Funders Dataframe

### Main Table

In [3]:
#work on a copy so the raw `funders` table stays as retrieved
funders_df = funders.copy()

#define table relationships for funders
#one dict per classification; the keys are consumed by build_relationship_cols (utils)
#NOTE(review): presumably join_table links funders to lookup_table via `key`, and the `value_col` names are collected into a list column called `result_col` - confirm against utils
funder_rels = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#add relationship columns
funders_df = build_relationship_cols(funders_df, "registered_num", funder_rels)

#round to 2 decimal places
funders_df = funders_df.round(2)
#display-only setting; it applies to every dataframe shown for the rest of the session
pd.set_option("display.float_format", "{:.2f}".format)

### Financial History Table

In [4]:
#add financial history per funder using the funder_financials join table and the financials table
#NOTE(review): judging by the funder_df output shown later, this adds income_history / expenditure_history columns of {year: amount} dicts - confirm against utils
funders_df = build_financial_history(funders_df, "registered_num", funder_financials, financials)

### The List Entries

In [5]:
#get list entries
list_entries = get_table_from_supabase(url, key, "list_entries")
funder_list = get_table_from_supabase(url, key, "funder_list")
#attach the entry details to each funder/list link (default inner merge on list_id)
list_with_info = funder_list.merge(list_entries, on="list_id")

#get list of entries for each funder
list_grouped = list_with_info.groupby("registered_num")["list_type"].apply(list).reset_index()
list_grouped.columns = ["registered_num", "list_entries"]

#merge with funders and replace nans
#left merge keeps every funder; funders with no entries get NaN, which is normalised to an empty list below
funders_df = funders_df.merge(list_grouped, on="registered_num", how="left")
funders_df["list_entries"] = funders_df["list_entries"].apply(lambda x: x if isinstance(x, list) else [])

In [6]:
#get checkpoint folder
#relative path, resolved against the current working directory
checkpoint_folder = Path("./10.1_checkpoints/")

#create checkpoint - save df to pickle
#NOTE(review): the save is commented out, so running this cell as written writes nothing; the stored cell output comes from an earlier run with these lines enabled
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# print("Saved funders_df to checkpoint")

Saved funders_df to checkpoint


## The Grants Dataframe

### Main Table

In [9]:
#work on a copy so the raw `grants` table stays as retrieved
grants_df = grants.copy()

#add funder info
#these merges use the pandas default (inner), so grants with no funder link or no matching funder are dropped
grants_df = grants_df.merge(funder_grants, on="grant_id")
grants_df = grants_df.merge(funders[["registered_num", "name"]], on="registered_num")
grants_df = grants_df.rename(columns={"name": "funder_name", "registered_num": "funder_num"})

#add recipient info
#inner merge on the recipient link, then a left merge so grants are kept even when the recipient has no row in `recipients`
grants_df = grants_df.merge(recipient_grants, on="grant_id")
grants_df = grants_df.merge(recipients[["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives", 
                                        "recipient_name_em", "recipient_activities_em", "recipient_objectives_em", "recipient_concat_em", "is_recipient"]], 
                        on="recipient_id", 
                        how="left")

#define relationships for recipients
#same structure as the funder relationships: one dict per classification, consumed by build_relationship_cols (utils)
recipient_rels = [
    {
        "join_table": recipient_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "recipient_areas"
    },
    {
        "join_table": recipient_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "recipient_causes"
    },
    {
        "join_table": recipient_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "recipient_beneficiaries"
    }
]

#add relationship columns
grants_df = build_relationship_cols(grants_df, "recipient_id", recipient_rels)

#add source of grant
#grant ids whose string form starts with "2" are labelled "Accounts"; everything else is labelled "360Giving"
grants_df["source"] = grants_df["grant_id"].apply(lambda x: "Accounts" if str(x).startswith("2") else "360Giving")

#round to 2 decimal places
grants_df = grants_df.round(2)

In [11]:
#create checkpoint - save df to pickle
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")
# print("Saved grants_df to checkpoint")

Saved grants_df to checkpoint


## The Pairs Dataframe

In [12]:
#start from a copy of the logic_pairs table so the retrieved table is not modified
pairs_df = logic_pairs.copy()

In [13]:
#merge to enrich with funder data
#left merge keeps every pair even if the funder is missing from funders_df; clashing funder column names receive the "_funder" suffix
pairs_enriched = pairs_df.merge(
    funders_df,
    left_on="funder_registered_num",
    right_on="registered_num",
    how="left",
    suffixes=("", "_funder")
)

#drop duplicate col
#registered_num repeats funder_registered_num after the merge
pairs_enriched = pairs_enriched.drop("registered_num", axis=1)

#merge to enrich with recipient data
#grants_df has one row per grant, so keep a single row per recipient before merging to avoid multiplying pairs
pairs_enriched = pairs_enriched.merge(
    grants_df[["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives",
                "recipient_areas", "recipient_causes",
"recipient_beneficiaries"]].drop_duplicates(subset=["recipient_id"]),
    on="recipient_id",
    how="left"
)

pairs_df = pairs_enriched.copy()

In [14]:
#create checkpoint - save df to pickle
# pairs_df.to_pickle(checkpoint_folder / "pairs_df.pkl")
# print("Saved pairs_df to checkpoint")

Saved pairs_df to checkpoint


## The Areas Dataframes

In [4]:
#get checkpoint folder
checkpoint_folder = Path("./10.1_checkpoints/")

#NOTE(review): areas_df and hierarchies_df are only ever created by the commented-out lines below; as written, this cell defines neither, and later cells rely on the pickled copies being present in the checkpoint folder
# areas_df = areas.copy()
# hierarchies_df = area_hierarchy.copy()

#create checkpoint - save dfs to pickle
# areas_df.to_pickle(checkpoint_folder / "areas_df.pkl")
# hierarchies_df.to_pickle(checkpoint_folder / "hierarchies_df.pkl")
# print("Saved areas_df and hierarchies_df to checkpoint")

Saved areas_df and hierarchies_df to checkpoint


---

# Retrieving Data from Checkpoints

In [2]:
#get checkpoint folder
checkpoint_folder = Path("./10.1_checkpoints/")

#get checkpoint
#these rebind the dataframe names to the pickled versions, so the Supabase cells above do not need to be re-run
#NOTE: unpickling executes code embedded in the file - only load checkpoints that were written by this notebook
funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")
pairs_df = pd.read_pickle(checkpoint_folder / "pairs_df.pkl")
areas_df = pd.read_pickle(checkpoint_folder / "areas_df.pkl")
hierarchies_df = pd.read_pickle(checkpoint_folder / "hierarchies_df.pkl")

----

# Preparation of User and Funder Data

## Users

I will use the data for the recipient at index 0 in `pairs_df` as a proxy for a real user's input, to simulate the functionality of the final artefact as I build the logic. 

First, the user will input information about their charity (the applicant), then embeddings will be created for the inputted text data. For the purposes of this development notebook, I will simulate the user's keyword input by using the same method for extracting classifications from funders' data, but in the final artefact, the user will be asked to enter their own keywords.

In [94]:
#simulate user's input
#take an explicit copy: the chained iloc/column selection is derived from pairs_df, and adding a column to it
#without .copy() is the pattern pandas reports as SettingWithCopyWarning (hidden here by the global warnings filter)
user_df = pairs_df.iloc[[0]][["recipient_id", "recipient_name", "recipient_activities", "recipient_objectives", "recipient_areas", "recipient_causes", "recipient_beneficiaries"]].copy()
#registered number of the funder the simulated user has selected
user_df["funder_num"] = "1124856"

In [95]:
#save registered numbers
#user_df holds a single row, so iloc[0] reads the simulated user's own id and their chosen funder's number
user_num = user_df["recipient_id"].iloc[0]
funder_num = user_df["funder_num"].iloc[0]

### Creation of Embeddings from User Input

In [96]:
#sentence-transformers model used to embed the user's text
#NOTE(review): presumably the same model that produced the stored funder/recipient embeddings - confirm, as similarity scores are only comparable within one embedding space
model = SentenceTransformer("all-roberta-large-v1")
user_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#one embedding column per text column, named "<column>_em"
for col in user_cols:
    #replace nans with empty string
    texts = user_df[col].fillna("").tolist()
    embeddings = model.encode(texts)
    
    #add to df
    user_df[f"{col}_em"] = list(embeddings)

#build one combined string per row: name, activities and objectives joined by single spaces
user_df["concat_text"] = user_df[user_cols[0]].fillna("")
for col in user_cols[1:]:
    user_df["concat_text"] += " " + user_df[col].fillna("")

#make lowercase
#NOTE(review): only the concatenated text is lowercased; the per-column embeddings above are built from the text as stored - confirm this difference is intended
user_df["concat_text"] = user_df["concat_text"].str.lower()

#create embeddings
texts = user_df["concat_text"].tolist()
embeddings = model.encode(texts)
user_df["concat_em"] = list(embeddings)

#drop concatenated text
user_df = user_df.drop(columns=["concat_text"])

### Simulation of User Keyword Input

In [97]:
#text columns handed to extract_classifications
section_cols = ["recipient_activities", "recipient_objectives"]
#placeholder column, left as None here
#NOTE(review): presumably expected by extract_classifications (logic_utils) - confirm
user_df["extracted_class"] = None

#load classifications data
#fetched over the network from GitHub each time the cell runs
ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
ukcat_df = pd.read_csv(ukcat_url)

#extract keywords from user data
#applied row by row; the result for each row is stored in user_keywords
start_time = time.time()
user_df["user_keywords"] = user_df.apply(lambda row: extract_classifications(row, section_cols, ukcat_df, areas_df), axis=1)

elapsed_time = time.time() - start_time
print(f"Cause extraction complete. Total time: {elapsed_time:.2f}s")

Cause extraction complete. Total time: 0.08s


In [98]:
#show every column when displaying dataframes; this is a global display option and stays in effect for later cells
pd.set_option("display.max_columns", None)
user_df.head()

Unnamed: 0,recipient_id,recipient_name,recipient_activities,recipient_objectives,recipient_areas,recipient_causes,recipient_beneficiaries,funder_num,recipient_name_em,recipient_activities_em,recipient_objectives_em,concat_em,extracted_class,user_keywords
0,328729,ASYLUM AID,THE PROVISION OF LEGAL ADVICE AND REPRESENTATI...,2. OBJECTS2.1 THE CHARITY IS ESTABLISHED FOR T...,[Throughout England And Wales],"[Education/training, The Prevention Or Relief ...",[People Of A Particular Ethnic Or Racial Origi...,1124856,"[-0.008021089, 0.01503942, -0.022991765, -0.02...","[-0.022860372, 0.029296849, -0.017596042, -0.0...","[0.016410686, 0.00963383, -0.04015383, -0.0136...","[-0.0008344314, -0.01810797, -0.0055338936, -0...",,"[Asylum seekers and refugees, Migrants, Advice..."


## Funders

I will next build a dataframe for the funder selected by the user, and a separate dataframe to store details of previous grants given by, and recipients of, this funder.

In [99]:
#get funder data from number inputted by user
#copy so later changes to funder_df cannot affect funders_df; the result is empty if no funder has this number
funder_df = funders_df[funders_df["registered_num"] == funder_num].copy()
funder_df.head()

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class,causes,areas,beneficiaries,income_history,expenditure_history,list_entries
257,1124856,ROSA FUND,https://www.rosauk.org,ROSA IS THE FIRST UK-WIDE FUND FOR WOMEN'S INI...,THE OBJECTS OF THE CHARITY ARE TO FURTHER ANY ...,1407453.0,1372296.0,,,,False,False,False,"[0.036068745,0.02428467,-0.026885081,-0.001224...","[0.005698055,0.011768709,-0.015513399,0.004063...","[-0.023482107,-0.02466424,0.0036000705,-0.0259...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[0.00031545621,0.014991585,-0.003386881,0.0022...","[""Uk"",""Wales"",""Girls"",""Women"",""Charity and VCS...",[General Charitable Purposes],[Throughout England And Wales],"[Other Charities Or Voluntary Bodies, Other De...","{2020: 155612.0, 2021: 4478996.0, 2022: 237267...","{2020: 974678.0, 2021: 2118687.0, 2022: 266530...",[]


In [100]:
#get grants for selected funder
#one row per grant, with the recipient details that were merged into grants_df
funder_grants_df = grants_df[grants_df["funder_num"] == funder_num].copy()
funder_grants_df.head(1)

Unnamed: 0,grant_title,grant_desc,amount,year,grant_id,source,grant_title_em,grant_desc_em,grant_concat_em,funder_num,funder_grants_id,funder_name,recipient_id,recipient_grants_id,recipient_name,recipient_activities,recipient_objectives,recipient_name_em,recipient_activities_em,recipient_objectives_em,recipient_concat_em,is_recipient,recipient_areas,recipient_causes,recipient_beneficiaries
30638,C19 R1-HGWA,FUNDING OVER 12 MONTHS TO RESPOND TO THE PRACT...,20000.0,2020,360G-RosaUK-1820-01-169601819,360Giving,"[-0.021811347,-0.014835423,-0.016724072,-0.012...","[-0.03757492,-0.0068165953,-0.025009565,0.0187...","[-0.048174538,-0.02058363,0.010783903,-0.01708...",1124856,42877,ROSA FUND,invalid_100,41687,PROJECT ONE,,,"[0.008081452,0.02795066,0.0020238636,0.0029561...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...",True,[],[],[]


In [101]:
#make dictionary of grants
#to_dict("records") returns a list with one dict per grant, keyed by the column names selected here
funder_grants_list = funder_grants_df[[
    "grant_title", "grant_desc", "amount", "year",
    "grant_title_em", "grant_desc_em",
    "recipient_id", "recipient_name", "recipient_activities", "recipient_objectives",
    "recipient_name_em", "recipient_concat_em",
    "recipient_areas", "recipient_beneficiaries", "recipient_causes"
]].to_dict("records")

----

# Binary Criteria

## Single-Beneficiary Funders

In [102]:
#check if funder has a single beneficiary
#reads the is_potential_sbf flag from the selected funder's row; iloc[0] raises IndexError if funder_df is empty
is_sbf = funder_df["is_potential_sbf"].iloc[0]
print(f"Funder has a single beneficiary: {is_sbf}")

Funder has a single beneficiary: False


## No Unsolicited Applications

In [103]:
#check if funder states no unsolicited applications
is_nua = funder_df["is_nua"].iloc[0]
print(f"Funder states no unsolicited applications: {is_nua}")
#multiplying the boolean by 1.0 turns it into a float: 1.0 when the funder states no unsolicited applications, otherwise 0.0
print(f"NUA score: {is_nua * 1.0}")

Funder states no unsolicited applications: False
NUA score: 0.0


## The List

In [104]:
#check if funder is on the list
def check_is_on_list(funder_df):
    """
    Reports whether the selected funder is on The List and scores it accordingly.

    Reads the first row of funder_df. Returns a tuple of
    (is_on_list flag, set of the funder's list entry types or None, score),
    where the score is 0.0 for a listed funder and 1.0 otherwise.
    """

    listed = funder_df["is_on_list"].iloc[0]

    #not listed - nothing to explain, full score
    if not listed:
        return listed, None, 1.0

    #listed - collapse the entry types to unique values, zero score
    entry_types = set(funder_df["list_entries"].iloc[0])
    return listed, entry_types, 0.0

#run the check for the selected funder and report the flag, the entry types (None when not listed) and the score
is_on_list, list_reasoning, list_score = check_is_on_list(funder_df)
print(f"Funder is on The List: {is_on_list}")
print(f"Type of List entry: {list_reasoning}")
print(f"Score: {list_score}")

Funder is on The List: False
Type of List entry: None
Score: 1.0


## Existing Relationship

In [105]:
#check if funder has ever given a grant to applicant
#every grant row where the selected funder paid the user
relationship = grants_df.loc[
    (grants_df["recipient_id"] == user_num) & (grants_df["funder_num"] == funder_num)
]
num_grants = relationship.shape[0]

#a relationship exists as soon as there is at least one such grant
existing_relationship = num_grants > 0

print(f"Funder and user have existing relationship: {existing_relationship}")
print(f"Funder has given {num_grants} grant(s) to user")

Funder and user have existing relationship: True
Funder has given 1 grant(s) to user


# Classification Criteria

## Areas

Scoring for areas uses hierarchical matching to account for parent-child geographic relationships. The granularity of each area affects its weight - specific locations like local authorities score higher (1.0) than broad regions (0.7). 

I will check three types of matches:
- Exact matches where both funder and user state that they work in the same area
- Hierarchical matches where the funder's area contains the user's (e.g., funder says "Throughout England", user works in "Bristol")
- Hierarchical matches where the user's area contains the funder's specific location (e.g., user works throughout "Africa", funder focuses on "Kenya"). 

Each match will be weighted differently to reflect the strength of the match. The final score will average only the matched areas, ignoring non-matches, so that having some high-quality geographic alignment is valued over penalising for coverage gaps.

In [106]:
def check_areas(funder_list, user_list, areas_df, hierarchies_df):
    """
    Calculates a score based on matches between the funder's and user's stated areas.

    Each user area that resolves to an id is compared with the funder's areas in this order:
    - exact match: granularity weight of the area * 1.0
    - user area within a funder area: granularity weight of the funder area * 0.6
    - funder area within the user area: granularity weight of the user area * 0.4
    - otherwise no match: 0.0

    Args:
        funder_list: area names stated by the funder.
        user_list: area names stated by the user.
        areas_df: areas lookup, passed through to the logic_utils helpers.
        hierarchies_df: area hierarchy data, passed through to check_if_parent.

    Returns:
        (areas_score, reasoning): the mean of the non-zero per-area scores (0.0 when nothing
        matched, or when none of the user's names resolve to an id) and one reasoning string
        per resolved user area.
    """

    #convert names to ids - each name is looked up once and names without an id are dropped
    funder_ids = [area_id for area_id in (get_id_from_name(name, areas_df) for name in funder_list) if area_id is not None]
    user_ids = [area_id for area_id in (get_id_from_name(name, areas_df) for name in user_list) if area_id is not None]

    #avoid zero division
    if len(user_ids) == 0:
        return 0.0, []

    #store ids as set and scores/reasoning as lists
    funder_set = set(funder_ids)
    scores = []
    reasoning = []

    for user_area in user_ids:
        user_area_name = get_name_from_id(user_area, areas_df)

        #check for exact match
        if user_area in funder_set:
            scores.append(get_granularity_weight(user_area, areas_df) * 1.0)
            reasoning.append(f"Exact match: {user_area_name}")
            continue

        #check if user area is within funder area (first funder area that contains it)
        #compare with None rather than truthiness so that a valid id of 0 still counts as a match
        containing_funder_area = next(
            (funder_area for funder_area in funder_ids if check_if_parent(funder_area, user_area, hierarchies_df)),
            None,
        )
        if containing_funder_area is not None:
            parent_name = get_name_from_id(containing_funder_area, areas_df)
            scores.append(get_granularity_weight(containing_funder_area, areas_df) * 0.6)
            reasoning.append(f"Hierarchical match: {user_area_name} (user) within {parent_name} (funder)")
            continue

        #check if funder area is within user area (first funder area that sits inside it)
        contained_funder_area = next(
            (funder_area for funder_area in funder_ids if check_if_parent(user_area, funder_area, hierarchies_df)),
            None,
        )
        if contained_funder_area is not None:
            child_name = get_name_from_id(contained_funder_area, areas_df)
            scores.append(get_granularity_weight(user_area, areas_df) * 0.4)
            reasoning.append(f"Hierarchical match: {child_name} (funder) within {user_area_name} (user)")
            continue

        #no match
        scores.append(0.0)
        reasoning.append(f"No match: {user_area_name}")

    #average only the matched areas so coverage gaps are not penalised
    matched_scores = [s for s in scores if s > 0]
    if len(matched_scores) > 0:
        areas_score = sum(matched_scores) / len(matched_scores)
    else:
        areas_score = 0.0

    return areas_score, reasoning

#get lists
#copies, so the lists stored in the dataframes cannot be altered by the scoring
#NOTE(review): `funder_areas` is also the name bound to the Supabase join table in the data-loading cell; once this cell has run, re-running the funders dataframe cell would pick up this list instead of the table
funder_areas = funder_df["areas"].iloc[0].copy()
user_areas = user_df["recipient_areas"].iloc[0].copy()

#get score and reasoning
areas_score, areas_reasoning = check_areas(funder_areas, user_areas, areas_df, hierarchies_df)
print(f"Areas score: {areas_score:.2f}\n\nReasoning:")
for reason in areas_reasoning:
    print(reason)

Areas score: 0.70

Reasoning:
Exact match: Throughout England And Wales


## Beneficiaries

Scoring for beneficiaries is simpler than for areas. I will exclude the generic "Other Charities Or Voluntary Bodies" as it is likely that almost all funders will fall into this category, adding noise to the scoring. I will use a hierarchical scoring approach but without the granularity weighting, as the higher level categories in this classification are too broad to offer real value to the calculation. 

In [113]:
def check_beneficiaries(funder_list, user_list):
    """
    Calculates a score based on matches between the funder's and user's stated beneficiaries.

    "Other Charities Or Voluntary Bodies" is ignored on both sides. Each remaining user
    beneficiary scores 1.0 when the funder lists the same specific beneficiary, 0.2 when the
    funder lists any broad category ("Other Defined Groups", "The General Public/mankind"),
    and 0.0 otherwise. Broad categories are never exact matches, so a broad category stated
    by the user can score at most 0.2.

    Args:
        funder_list: beneficiary names stated by the funder.
        user_list: beneficiary names stated by the user.

    Returns:
        (beneficiaries_score, reasoning): the mean of the non-zero scores (0.0 when nothing
        matched or the user has no scoreable beneficiaries) and one reasoning string per
        scored user beneficiary.
    """

    #define categories and filter
    high_level_bens = {"Other Defined Groups", "The General Public/mankind"}
    exclude_bens = {"Other Charities Or Voluntary Bodies"}
    funder_bens = [ben for ben in funder_list if ben not in exclude_bens]
    user_bens = [ben for ben in user_list if ben not in exclude_bens]

    #avoid zero division
    if len(user_bens) == 0:
        return 0.0, []

    #categorise funder beneficiaries
    funder_specific = set(ben for ben in funder_bens if ben not in high_level_bens)
    has_high_level = any(ben in high_level_bens for ben in funder_bens)

    scores = []
    reasoning = []
    for user_ben in user_bens:
        if user_ben in funder_specific:
            scores.append(1.0)
            reasoning.append(f"Exact match: {user_ben}")
        elif has_high_level:
            scores.append(0.2)
            reasoning.append(f"Weak match: user states '{user_ben}' and funder supports broad categories")
        else:
            scores.append(0.0)
            reasoning.append(f"No match: {user_ben}")

    #average only the matched beneficiaries - computed once, after every beneficiary has been scored
    matched_scores = [s for s in scores if s > 0]
    if len(matched_scores) > 0:
        beneficiaries_score = sum(matched_scores) / len(matched_scores)
    else:
        beneficiaries_score = 0.0

    return beneficiaries_score, reasoning

#get lists
#copies, so the lists stored in the dataframes cannot be altered by the scoring
#NOTE(review): `funder_beneficiaries` is also the name bound to the Supabase join table in the data-loading cell; once this cell has run, re-running the funders dataframe cell would pick up this list instead of the table
funder_beneficiaries = funder_df["beneficiaries"].iloc[0].copy()
user_beneficiaries = user_df["recipient_beneficiaries"].iloc[0].copy()

#get score and reasoning
beneficiaries_score, beneficiaries_reasoning = check_beneficiaries(funder_beneficiaries, user_beneficiaries)
print(f"Beneficiaries score: {beneficiaries_score:.2f}\n\nReasoning:")
for reason in beneficiaries_reasoning:
    print(reason)

Beneficiaries score: 0.20

Reasoning:
Weak match: user states 'People Of A Particular Ethnic Or Racial Origin' and funder supports broad categories
Weak match: user states 'Other Defined Groups' and funder supports broad categories


## Causes

For causes, I will exclude "Other Charitable Purposes" as it adds noise. However, I will not exclude "General Charitable Purposes" (GCP) as this is used by funders to indicate that they would be willing to consider any causes. I will use it as a fallback similar to how "Throughout England" works for areas. 

The scoring checks for exact matches between the user's and funder's causes first, which score 1.0. If no exact match exists but the funder lists GCP, this scores 0.6 as a weak indicator that the funder might support the cause. Non-matches score 0.0.

In [112]:
def check_causes(funder_list, user_list):
    """
    Calculates a score based on matches between the funder's and user's stated causes.

    "Other Charitable Purposes" is ignored on both sides. A user cause scores 1.0 when the
    funder names it, 0.6 when the funder only lists "General Charitable Purposes", and 0.0
    otherwise. Returns the mean of the non-zero scores together with one reasoning string
    per scored user cause; (0.0, []) when the user has no scoreable causes.
    """
    general_purposes = "General Charitable Purposes"
    ignored = {"Other Charitable Purposes"}

    #strip the ignored cause from both sides
    funder_kept = [item for item in funder_list if item not in ignored]
    user_kept = [item for item in user_list if item not in ignored]

    #nothing to score - also avoids a zero division
    if not user_kept:
        return 0.0, []

    #split the funder's causes into named causes and the catch-all
    named = {item for item in funder_kept if item != general_purposes}
    accepts_general = general_purposes in funder_kept

    def _assess(cause):
        #returns (score, explanation) for a single user cause
        if cause in named:
            return 1.0, f"Exact match: {cause}"
        if accepts_general:
            return 0.6, f"Weak match: user states '{cause}' and funder supports general charitable purposes"
        return 0.0, f"No match: {cause}"

    assessed = [_assess(cause) for cause in user_kept]
    reasoning = [explanation for _, explanation in assessed]

    #average over the matched causes only
    positive = [value for value, _ in assessed if value > 0]
    causes_score = sum(positive) / len(positive) if len(positive) > 0 else 0.0

    return causes_score, reasoning

#get lists
#copies, so the lists stored in the dataframes cannot be altered by the scoring
#NOTE(review): `funder_causes` is also the name bound to the Supabase join table in the data-loading cell; once this cell has run, re-running the funders dataframe cell would pick up this list instead of the table
funder_causes = funder_df["causes"].iloc[0].copy()
user_causes = user_df["recipient_causes"].iloc[0].copy()

#get score and reasoning
causes_score, causes_reasoning = check_causes(funder_causes, user_causes)
print(f"Causes score: {causes_score:.2f}\n\nReasoning:")
for reason in causes_reasoning:
    print(reason)

Causes score: 0.60

Reasoning:
Weak match: user states 'Education/training' and funder supports general charitable purposes
Weak match: user states 'The Prevention Or Relief Of Poverty' and funder supports general charitable purposes


# Semantic Similarity Score

In [109]:
def calculate_similarity_score(funder_embedding, user_embedding):
    """
    Calculates semantic similarity between user and funder using pre-computed embeddings.

    Either embedding may arrive as a JSON string, in which case it is decoded first; any other
    value is passed to the similarity function as it is. Returns the cosine similarity as a
    plain Python number.
    """

    #decode json text where needed, leave everything else untouched
    funder_vector, user_vector = (
        json.loads(embedding) if isinstance(embedding, str) else embedding
        for embedding in (funder_embedding, user_embedding)
    )

    #cosine similarity, unwrapped from the single-element result
    return util.cos_sim(funder_vector, user_vector).item()

#get embeddings
#the concatenated-text embedding for each side; calculate_similarity_score decodes either one if it is stored as a json string
funder_embedding = funder_df["concat_em"].iloc[0]
user_embedding = user_df["concat_em"].iloc[0]

#get score
similarity_score = calculate_similarity_score(funder_embedding, user_embedding)
print(f"Semantic similarity score: {similarity_score:.2f}")

Semantic similarity score: 0.53


# Penalties and Bonuses

## Step x - SBF Penalty

In [110]:
#multiplier for single-beneficiary funders: 0.2 when the funder is flagged, otherwise 1.0 (no effect)
sbf_penalty = 0.2 if is_sbf else 1.0
print(f"Single-beneficiary penalty: *{sbf_penalty}")

Single-beneficiary penalty: *1.0


## Step x - Relationship Bonus

In [111]:
def calculate_relationship_bonus(relationship_df, current_year=None):
    """
    Calculates time since last grant and calculates a bonus.

    Args:
        relationship_df: grants given by the funder to the user; must contain a "year" column.
        current_year: year to measure from. Defaults to the current calendar year; pass a
            fixed value to make the result reproducible.

    Returns:
        (time_lapsed, bonus, last_grant_year). The bonus is a multiplier: 1.5 when the last
        grant was at most 2 years ago, 1.4 up to 3 years, 1.3 up to 5 years, 1.2 up to
        10 years and 1.1 beyond that, plus 0.1 when there are 5 or more grants.
        When there are no grants, or no grant has a usable year, there is no relationship to
        reward and the neutral result (None, 1.0, None) is returned.
    """

    #no grants means no relationship - without this guard the missing year fails every
    #comparison below and the function would hand out the 1.1 bonus
    if len(relationship_df) == 0:
        return None, 1.0, None

    #get time lapsed since last gift
    last_grant_year = relationship_df["year"].max()
    if pd.isna(last_grant_year):
        return None, 1.0, None
    if current_year is None:
        current_year = datetime.now().year
    time_lapsed = current_year - last_grant_year

    #normalise
    if time_lapsed <= 2:
        bonus = 1.5
    elif time_lapsed <= 3:
        bonus = 1.4
    elif time_lapsed <= 5:
        bonus = 1.3
    elif time_lapsed <= 10:
        bonus = 1.2
    else:
        bonus = 1.1

    #add uplift for recurring relationship
    num_grants = len(relationship_df)
    if num_grants >= 5:
        bonus += 0.1

    return time_lapsed, bonus, last_grant_year

#get bonus and reasoning - only when the funder has previously funded the user
if len(relationship) > 0:
    time_lapsed, relationship_bonus, last_grant_year = calculate_relationship_bonus(relationship)
    print(f"Last gift given in {last_grant_year} ({time_lapsed} years ago)")
else:
    #no previous grants: use a neutral multiplier so the final score is unaffected
    time_lapsed, last_grant_year, relationship_bonus = None, None, 1.0
    print("No existing relationship - no bonus applied")

print(f"Relationship bonus: *{relationship_bonus}")

Last gift given in 2013 (12 years ago)
Relationship bonus: *1.1
