# The Scoring Logic - Development

# Setup

In [1]:
import pandas as pd
from pathlib import Path
import os
import json
from datetime import datetime
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")
# from IPython.display import display, HTML
from sentence_transformers import SentenceTransformer
from logic_utils import get_name_from_id, get_id_from_name, get_granularity_weight, check_if_parent, calculate_similarity_score, format_score_test

#get keys from env
#load_dotenv() reads variables from a local .env file into the process environment,
#so the credentials never need to be written into the notebook itself
load_dotenv()
#os.getenv returns None when a variable is not set, so url/key may be None
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

----

# Retrieving Data from Checkpoints

In [2]:
#get checkpoint folder
#relative path: resolved against the kernel's current working directory
checkpoint_folder = Path("./10.1_checkpoints/")

#get checkpoint
#NOTE(review): unpickling can execute arbitrary code - only load checkpoint files from a trusted source
funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")
pairs_df = pd.read_pickle(checkpoint_folder / "pairs_df.pkl")
areas_df = pd.read_pickle(checkpoint_folder / "areas_df.pkl")
hierarchies_df = pd.read_pickle(checkpoint_folder / "hierarchies_df.pkl")

----

# Preparation of User and Funder Data

To test the logic, I will use the 12 funder-recipient pairs that I identified earlier. As with the logic development process, I will use the recipients of these pairs as proxies for user input.

First, the user will input information about their charity (the applicant), then embeddings will be created for the inputted text data. For the purposes of this testing notebook, I will simulate users' keyword input by using extracted classifications from recipients' data, but in the final artefact, the user will be asked to enter their own keywords.

## Creation of Embeddings from User Input

In [3]:
#load the sentence embedding model used for every embedding in this notebook
model = SentenceTransformer("all-roberta-large-v1")
#recipient text columns that stand in for the user's own input
user_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#create one embedding column per text column (named "<col>_em")
for col in user_cols:
    #replace nans with empty string
    texts = pairs_df[col].fillna("").tolist()
    embeddings = model.encode(texts)
    
    #add to df
    #list() stores one embedding per row rather than a single 2d array
    pairs_df[f"{col}_em"] = list(embeddings)

#join the text columns into one space-separated string per row
pairs_df["concat_text"] = pairs_df[user_cols[0]].fillna("")
for col in user_cols[1:]:
    pairs_df["concat_text"] += " " + pairs_df[col].fillna("")

#make lowercase
#only the concatenated text is lowercased; the per-column embeddings above use the original casing
pairs_df["concat_text"] = pairs_df["concat_text"].str.lower()

#create embeddings
texts = pairs_df["concat_text"].tolist()
embeddings = model.encode(texts)
pairs_df["user_concat_em"] = list(embeddings)

#drop concatenated text
pairs_df = pairs_df.drop(columns=["concat_text"])

#change recipient_ to user_
#NOTE(review): after this rename the recipient_ columns no longer exist, so this cell cannot be
#re-run without first reloading pairs_df from the checkpoint (the loop above would raise a KeyError)
pairs_df = pairs_df.rename(columns=lambda col: f"user_{col[len('recipient_'):]}" if col.startswith("recipient_") else col)

In [4]:
#show every column when a dataframe is displayed (a global pandas option, so it also affects later cells)
pd.set_option("display.max_columns", None)
#preview the first pair to check the new user_ columns
pairs_df.head(1)

Unnamed: 0,id,funder_registered_num,user_id,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class,causes,areas,beneficiaries,income_history,expenditure_history,list_entries,user_name,user_activities,user_objectives,user_areas,user_causes,user_beneficiaries,user_extracted_class,user_name_em,user_activities_em,user_objectives_em,user_concat_em
0,1,1124856,328729,ROSA FUND,https://www.rosauk.org,ROSA IS THE FIRST UK-WIDE FUND FOR WOMEN'S INI...,THE OBJECTS OF THE CHARITY ARE TO FURTHER ANY ...,1407453.0,1372296.0,,,,False,False,False,"[0.036068745,0.02428467,-0.026885081,-0.001224...","[0.005698055,0.011768709,-0.015513399,0.004063...","[-0.023482107,-0.02466424,0.0036000705,-0.0259...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[-0.019817753,-0.00571729,0.022262126,-0.03666...","[0.00031545621,0.014991585,-0.003386881,0.0022...","[""UK"",""WALES"",""GIRLS"",""WOMEN"",""CHARITY AND VCS...",[General Charitable Purposes],[Throughout England And Wales],"[Other Charities Or Voluntary Bodies, Other De...","{2020: 155612.0, 2021: 4478996.0, 2022: 237267...","{2020: 974678.0, 2021: 2118687.0, 2022: 266530...",[],ASYLUM AID,THE PROVISION OF LEGAL ADVICE AND REPRESENTATI...,2. OBJECTS2.1 THE CHARITY IS ESTABLISHED FOR T...,[Throughout England And Wales],"[Education/training, The Prevention Or Relief ...",[People Of A Particular Ethnic Or Racial Origi...,"[""UK"",""ASYLUM SEEKERS AND REFUGEES"",""MIGRANTS""...","[-0.008021083, 0.0150393685, -0.02299179, -0.0...","[-0.022860443, 0.029296884, -0.017596053, -0.0...","[0.016410686, 0.00963383, -0.04015383, -0.0136...","[-0.00083443156, -0.01810797, -0.0055338936, -..."


----

In [5]:
#keep a separate copy of the prepared pairs - presumably so pairs_df can be restored without re-running the embedding cell
pairs_backup = pairs_df.copy()

# Binary Criteria (Stated Preferences)

## Step 4: Existing Relationship

In [6]:
def check_existing_relationship(grants_df, funder_num, user_num):
    """
    Looks up every grant that the funder has previously made to the user.

    Returns a tuple of (existing_relationship, num_grants, relationship), where
    relationship is the subset of grants_df rows matching both the funder and the user.
    """
    #one boolean mask per condition, combined row by row
    from_funder = grants_df["funder_num"] == funder_num
    to_user = grants_df["recipient_id"] == user_num
    relationship = grants_df[from_funder & to_user]

    #any matching row means the two have an existing relationship
    num_grants = len(relationship)
    return num_grants > 0, num_grants, relationship

----

# Classification Criteria (Stated Preferences)

## Step 5: Areas

In [7]:
def check_areas(funder_list, user_list, areas_df, hierarchies_df):
    """
    Calculates a score based on matches between the funder's and user's stated areas.

    Every user area that resolves to an id is compared with the funder's areas:
        - exact match: granularity weight of the user's area * 1.0
        - user's area is within a funder area: granularity weight of that funder area * 0.6
        - a funder area is within the user's area: granularity weight of the user's area * 0.4
        - otherwise: 0.0 (listed in the reasoning as "No match")
    The returned score is the mean of the non-zero scores only, so unmatched user areas
    appear in the reasoning but do not lower the score.

    Args:
        funder_list: area names stated by the funder (None/nan is treated as no areas).
        user_list: area names stated by the user (None/nan is treated as no areas).
        areas_df: areas lookup, passed through to the logic_utils helpers.
        hierarchies_df: parent/child lookup, passed through to check_if_parent.

    Returns:
        (score, reasoning): the score and one reasoning string per resolved user area.
        Returns (0.0, []) when none of the user's areas can be resolved.
    """

    def resolve_ids(names):
        #a missing value (None/nan) means no areas were stated
        if names is None or isinstance(names, float):
            return []
        #look each name up once and drop the names that are not in areas_df
        looked_up = (get_id_from_name(name, areas_df) for name in names)
        return [area_id for area_id in looked_up if area_id is not None]

    #convert names to ids
    funder_ids = resolve_ids(funder_list)
    user_ids = resolve_ids(user_list)

    #avoid zero division
    if not user_ids:
        return 0.0, []

    #store ids as set and scores/reasoning as lists
    funder_set = set(funder_ids)
    scores = []
    reasoning = []

    for user_area in user_ids:
        user_area_name = get_name_from_id(user_area, areas_df)

        #check for exact match
        if user_area in funder_set:
            scores.append(get_granularity_weight(user_area, areas_df) * 1.0)
            reasoning.append(f"Exact match: {user_area_name}")
            continue

        #check if user area is within funder area (first funder area that contains it)
        #compare against None rather than truthiness so that an id of 0 still counts as a match
        parent_area = next(
            (funder_area for funder_area in funder_ids if check_if_parent(funder_area, user_area, hierarchies_df)),
            None
        )
        if parent_area is not None:
            parent_name = get_name_from_id(parent_area, areas_df)
            scores.append(get_granularity_weight(parent_area, areas_df) * 0.6)
            reasoning.append(f"Hierarchical match: {user_area_name} (user) within {parent_name} (funder)")
            continue

        #check if funder area is within user area (first funder area contained by it)
        child_area = next(
            (funder_area for funder_area in funder_ids if check_if_parent(user_area, funder_area, hierarchies_df)),
            None
        )
        if child_area is not None:
            child_name = get_name_from_id(child_area, areas_df)
            scores.append(get_granularity_weight(user_area, areas_df) * 0.4)
            reasoning.append(f"Hierarchical match: {child_name} (funder) within {user_area_name} (user)")
            continue

        #no match
        scores.append(0.0)
        reasoning.append(f"No match: {user_area_name}")

    #average the matched areas only
    matched_scores = [s for s in scores if s > 0]
    score = sum(matched_scores) / len(matched_scores) if matched_scores else 0.0

    return max(0.0, score), reasoning

## Step 6: Beneficiaries

Scoring for beneficiaries is simpler than for areas. I will exclude the generic "Other Charities Or Voluntary Bodies" as it is likely that almost all funders will fall into this category, adding noise to the scoring. I will use a hierarchical scoring approach but without the granularity weighting, as the higher-level categories in this classification are too broad to offer real value to the calculation.

In [8]:
def check_beneficiaries(funder_list, user_list):
    """
    Scores how well the user's stated beneficiaries match the funder's stated beneficiaries.

    Each user beneficiary scores 1.0 for an exact match with a specific funder beneficiary,
    0.2 if the funder only offers a broad (high-level) category, and 0.0 otherwise.
    The result is the mean of the non-zero scores, returned with one reasoning string per
    user beneficiary.
    """

    #define categories
    broad_groups = {"Other Defined Groups", "The General Public/mankind"}
    ignored = {"Other Charities Or Voluntary Bodies"}

    #split the funder's beneficiaries into specific groups and a broad-category flag
    funder_specific = set()
    has_high_level = False
    for ben in funder_list:
        if ben in ignored:
            continue
        if ben in broad_groups:
            has_high_level = True
        else:
            funder_specific.add(ben)

    #avoid zero division
    user_bens = [ben for ben in user_list if ben not in ignored]
    if not user_bens:
        return 0.0, []

    scores = []
    reasoning = []
    for user_ben in user_bens:
        if user_ben in funder_specific:
            points, note = 1.0, f"Exact match: {user_ben}"
        elif has_high_level:
            points, note = 0.2, f"Weak match: user states '{user_ben}' and funder supports broad categories"
        else:
            points, note = 0.0, f"No match: {user_ben}"
        scores.append(points)
        reasoning.append(note)

    #average the matched beneficiaries only
    matched_scores = [s for s in scores if s > 0]
    score = sum(matched_scores) / len(matched_scores) if matched_scores else 0.0

    return max(0.0, score), reasoning

## Step 7: Causes

For causes, I will exclude "Other Charitable Purposes" as it adds noise. However, I will not exclude "General Charitable Purposes" (GCP) as this is used by funders to indicate that they would be willing to consider any causes. I will use it as a fallback similar to how "Throughout England" works for areas. 

The scoring checks for exact matches between the user's and funder's causes first, which score 1.0. If no exact match exists but the funder lists GCP, this scores 0.6 as a weak indicator that the funder might support the cause. Non-matches score 0.0.

In [9]:
def check_causes(funder_list, user_list):
    """
    Scores how well the user's stated causes match the funder's stated causes.

    Each user cause scores 1.0 for an exact match with a specific funder cause, 0.6 if the
    funder lists "General Charitable Purposes" instead, and 0.0 otherwise. The result is the
    mean of the non-zero scores, returned with one reasoning string per user cause.
    """
    #define categories and filter
    gcp = "General Charitable Purposes"
    ignored = {"Other Charitable Purposes"}
    funder_causes = [c for c in funder_list if c not in ignored]
    user_causes = [c for c in user_list if c not in ignored]

    #avoid zero division
    if not user_causes:
        return 0.0, []

    #categorise funder causes
    funder_specific = {c for c in funder_causes if c != gcp}
    has_gcp = gcp in funder_causes

    scores = []
    reasoning = []
    for user_cause in user_causes:
        if user_cause in funder_specific:
            points, note = 1.0, f"Exact match: {user_cause}"
        elif has_gcp:
            points, note = 0.6, f"Weak match: user states '{user_cause}' and funder supports general charitable purposes"
        else:
            points, note = 0.0, f"No match: {user_cause}"
        scores.append(points)
        reasoning.append(note)

    #average the matched causes only
    positive = [p for p in scores if p > 0]
    score = sum(positive) / len(positive) if positive else 0.0

    return max(0.0, score), reasoning

-----

# Semantic Similarity (Stated Preferences)

## Step 9: Keywords

In [10]:
def check_keywords(funder_keywords, user_keywords, model, strong_threshold=0.90, top_n=10):
    """
    Calculates semantic similarity between funder (extracted) and user (inputted) keywords.

    Every funder keyword is compared with every user keyword. Pairs with a similarity at or
    above strong_threshold are set aside as strong matches (they drive the keywords bonus),
    and the score is the mean of the top_n remaining pairs.

    Args:
        funder_keywords: list of keywords or a json-encoded list; None, nan or "" mean no keywords.
        user_keywords: same formats as funder_keywords.
        model: object with an encode(text) method that returns an embedding.
        strong_threshold: similarity at or above which a pair counts as a strong match.
        top_n: number of below-threshold pairs used for the score and the reasoning.

    Returns:
        (score, strong_matches, reasoning, gets_bonus) where strong_matches maps
        "<funder keyword> & <user keyword>" to its similarity, reasoning lists the pairs that
        were averaged into the score, and gets_bonus is True when any strong match exists.
    """

    def to_keyword_list(value):
        #parse json (a blank string would make json.loads raise, so treat it as empty)
        if isinstance(value, str):
            value = json.loads(value) if value.strip() else []
        #handle empty/nans (nan is a float, so it needs an explicit check)
        if value is None or isinstance(value, float):
            return []
        return list(value)

    funder_keywords = to_keyword_list(funder_keywords)
    user_keywords = to_keyword_list(user_keywords)

    if not funder_keywords or not user_keywords:
        return 0.0, {}, ["No keywords to compare"], False

    #create embeddings for each keyword (each distinct keyword is encoded once)
    funder_keywords_em = {keyword: model.encode(keyword) for keyword in dict.fromkeys(funder_keywords)}
    user_keywords_em = {keyword: model.encode(keyword) for keyword in dict.fromkeys(user_keywords)}

    #compare every funder keyword to every user keyword
    all_scores = [
        {
            "funder_keyword": funder_kw,
            "user_keyword": user_kw,
            "similarity": calculate_similarity_score(funder_em, user_em)
        }
        for funder_kw, funder_em in funder_keywords_em.items()
        for user_kw, user_em in user_keywords_em.items()
    ]
    all_scores.sort(key=lambda match: match["similarity"], reverse=True)

    #split into strong matches (>= threshold) and the rest (< threshold), keeping the sorted order
    strong_matches = {}
    medium_matches = []
    for match in all_scores:
        if match["similarity"] >= strong_threshold:
            key = f"{match['funder_keyword']} & {match['user_keyword']}"
            strong_matches[key] = match["similarity"]
        elif match["similarity"] < strong_threshold:
            medium_matches.append(match)
    gets_bonus = bool(strong_matches)

    #average the top matches below the threshold
    top_matches = medium_matches[:top_n]
    if top_matches:
        score = sum(match["similarity"] for match in top_matches) / len(top_matches)
    else:
        score = 0.0

    #build reasoning from the same matches that produced the score
    reasoning = [
        f"'{match['funder_keyword']}' & '{match['user_keyword']}': {match['similarity']:.3f}"
        for match in top_matches
    ]

    return max(0.0, score), strong_matches, reasoning, gets_bonus

-----

# Semantic Similarity (Revealed Preferences)

## Step 10: Name Score

In [11]:
def check_name_rp(recipients_embedding_dict, user_embedding, user_name):
    """
    Scores how similar the user's name is to the names of the funder's previous recipients.

    The user's own entry is skipped, the remaining recipients are ranked by similarity and the
    ten best are averaged. Returns (score, reasoning) with one "name: similarity" string per
    recipient used; the score is never below 0.0.
    """

    #compare every recipient name to the user's name and rank, highest similarity first
    ranked = sorted(
        (
            {
                "recipient_name": name,
                "similarity": calculate_similarity_score(embedding, user_embedding)
            }
            for name, embedding in recipients_embedding_dict.items()
            if name != user_name
        ),
        key=lambda entry: entry["similarity"],
        reverse=True
    )

    #calculate average of top 10
    best = ranked[:10]
    score = sum(entry["similarity"] for entry in best) / len(best) if best else 0.0

    #build reasoning from top 10 matches
    reasoning = [f"{entry['recipient_name']}: {entry['similarity']:.3f}" for entry in best]

    return max(0.0, score), reasoning

## Step 11: Grants Score

In [12]:
def check_grants_rp(grants_embedding_dict, user_embedding, user_name):
    """
    Scores how similar the user's text is to the funder's previous grants.

    Grants keyed by the user's own name are skipped, the rest are ranked by similarity and the
    ten best are averaged. Returns (score, reasoning) with one "recipient: similarity" string
    per grant used; the score is never below 0.0.
    """

    #compare every grant to the user's text as (recipient name, similarity) pairs
    scored = [
        (grant_recipient_name, calculate_similarity_score(grant_embedding, user_embedding))
        for grant_recipient_name, grant_embedding in grants_embedding_dict.items()
        if grant_recipient_name != user_name
    ]

    #sort and keep the top 10
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:10]

    #calculate average
    if top:
        score = sum(similarity for _, similarity in top) / len(top)
    else:
        score = 0.0

    #build reasoning from top 10 matches
    reasoning = [f"{name}: {similarity:.3f}" for name, similarity in top]

    return max(0.0, score), reasoning

## Step 12: Recipients Score

In [13]:
def check_recipients_rp(recipients_embedding_dict, user_embedding, user_name):
    """
    Scores how similar the user's text is to the text of the funder's previous recipients.

    The user's own entry is skipped, the remaining recipients are ranked by similarity and the
    ten best are averaged. Returns (score, reasoning) with one "recipient: similarity" string
    per recipient used; the score is never below 0.0.
    """

    #compare every recipient's text to the user's text
    names = []
    similarities = []
    for recipient_name, recipient_embedding in recipients_embedding_dict.items():
        if recipient_name == user_name:
            continue
        names.append(recipient_name)
        similarities.append(calculate_similarity_score(recipient_embedding, user_embedding))

    #rank positions by similarity (highest first) and keep the top 10
    order = sorted(range(len(names)), key=lambda i: similarities[i], reverse=True)[:10]

    #calculate average of top 10
    if order:
        score = sum(similarities[i] for i in order) / len(order)
    else:
        score = 0.0

    #build reasoning from top 10 matches
    reasoning = [f"{names[i]}: {similarities[i]:.3f}" for i in order]

    return max(0.0, score), reasoning

----

# Penalties and Bonuses (Stated Preferences)

## Step 14: Keywords Bonus

In [14]:
def calculate_keywords_bonus(strong_matches, ukcat_df):
    """
    Calculates bonus based on strong keyword matches (semantic similarity of 0.90 or above).

    Each match is weighted by how specific its keyword is in the UKCAT taxonomy (level 1 is the
    broadest, level 3 the most specific). Keys are normally "<funder keyword> & <user keyword>"
    pairs, so both sides are looked up; when both are known tags the broader level is used, as
    a match is only as specific as its broadest keyword. Keywords not found in UKCAT get the
    lowest weight.

    Args:
        strong_matches: dict mapping a match key to its similarity score.
        ukcat_df: dataframe with a "tag" column and a "level" column.

    Returns:
        The bonus multiplier between 1.1 and 1.3, or 1.0 (no bonus) if there are no strong matches.
    """

    #nothing to reward (also avoids zero division)
    if not strong_matches:
        return 1.0

    #weight by specificity of ukcat level
    level_weights = {
        1: 0.3,
        2: 0.7,
        3: 1.0
    }
    unknown_weight = 0.3
    separator = " & "

    #index the taxonomy once (upper-cased tag -> level), keeping the first row of any repeated tag
    tag_levels = {}
    for tag, level in zip(ukcat_df["tag"], ukcat_df["level"]):
        if isinstance(tag, str):
            tag_levels.setdefault(tag.upper(), level)

    def specificity_weight(key):
        #a key that is itself a tag is used directly
        whole = key.upper()
        if whole in tag_levels:
            return level_weights.get(tag_levels[whole], 1.0)

        #otherwise try every " & " as the funder/user boundary (tags may contain " & " themselves)
        #and keep the split that recognises the most keywords
        best_levels = []
        position = key.find(separator)
        while position != -1:
            sides = (key[:position], key[position + len(separator):])
            found = [tag_levels[side.upper()] for side in sides if side.upper() in tag_levels]
            if len(found) > len(best_levels):
                best_levels = found
            position = key.find(separator, position + 1)

        if not best_levels:
            return unknown_weight
        return min(level_weights.get(level, 1.0) for level in best_levels)

    weighted_scores = [score * specificity_weight(key) for key, score in strong_matches.items()]
    avg_weighted = sum(weighted_scores) / len(weighted_scores)

    #calculate bonus
    bonus = 1.1 + (avg_weighted * 0.2)
    bonus = min(max(bonus, 1.1), 1.3)

    return bonus

## Step 15: Existing Relationship Bonus

In [15]:
def calculate_relationship_bonus(relationship_df, current_year=None):
    """
    Calculates time since last grant and calculates a bonus. Only runs if there is a relationship.

    The more recent the last grant, the larger the bonus (1.5 within 2 years, down to 1.1 after
    more than 10 years). A further 0.1 is added when the funder has made 5 or more grants to the user.

    Args:
        relationship_df: the funder's grants to the user; must have a "year" column.
        current_year: year to measure from. Defaults to the current calendar year; pass a fixed
            value to make results reproducible.

    Returns:
        (time_lapsed, bonus, last_grant_year)
    """

    #get time lapsed since last gift
    last_grant_year = relationship_df["year"].max()
    if current_year is None:
        current_year = datetime.now().year
    time_lapsed = current_year - last_grant_year

    #assign bands: (maximum years since last grant, bonus), checked from most to least recent
    #a missing year (nan) fails every comparison and so falls through to the lowest bonus
    recency_bands = ((2, 1.5), (3, 1.4), (5, 1.3), (10, 1.2))
    bonus = next((band_bonus for limit, band_bonus in recency_bands if time_lapsed <= limit), 1.1)

    #add uplift for recurring relationship
    if len(relationship_df) >= 5:
        bonus += 0.1

    return time_lapsed, bonus, last_grant_year

-----

# Penalties and Bonuses (Revealed Preferences)

## Step 16: Areas Bonus

In [16]:
def calculate_areas_bonus_rp(funder_grants_df, user_areas, areas_df, hierarchies_df):
    """
    Calculates a bonus based on how well the user's areas match the funder's recipients' areas.

    Args:
        funder_grants_df: the funder's grants; each "recipient_areas" value is a list of area
            names (values that are not lists are ignored).
        user_areas: area names stated by the user.
        areas_df: areas lookup, passed through to check_areas and the logic_utils helpers.
        hierarchies_df: parent/child lookup, passed through to check_areas.

    Returns:
        (bonus, reasoning): a multiplier between 1.0 and 1.0 + 0.2 * the area match score, and
        up to 10 strings describing the most common low-level recipient areas. The bonus is the
        neutral 1.0 when there is no grants history or no area data.
    """

    #a bonus is a multiplier, so "no evidence" must be 1.0 rather than 0.0
    neutral_bonus = 1.0

    if funder_grants_df.empty:
        return neutral_bonus, ["No grants history available"]

    #get areas from recipients
    all_areas = []
    for areas_list in funder_grants_df["recipient_areas"]:
        if isinstance(areas_list, list):
            all_areas.extend(areas_list)

    if not all_areas:
        return neutral_bonus, ["No area data available"]

    #count how often each area appears (first-seen order, so results are the same on every run)
    occurrences = {}
    for area_name in all_areas:
        occurrences[area_name] = occurrences.get(area_name, 0) + 1
    recipient_areas = list(occurrences)

    #check areas
    match_score, _ = check_areas(recipient_areas, user_areas, areas_df, hierarchies_df)

    #convert to bonus multiplier
    bonus = 1.0 + (match_score * 0.2)

    #get reasoning from top 10 (low level tiers only), looking each distinct area up once
    area_count = {}
    for area_name, count in occurrences.items():
        area_id = get_id_from_name(area_name, areas_df)
        #compare against None so that an id of 0 is not skipped
        if area_id is not None and get_granularity_weight(area_id, areas_df) >= 0.9:
            area_count[area_name] = count

    if not area_count:
        reasoning = ["Only broad geographic areas found"]
    else:
        sorted_areas = sorted(area_count.items(), key=lambda item: item[1], reverse=True)
        total_low_level = sum(area_count.values())

        reasoning = []
        for area_name, count in sorted_areas[:10]:
            percentage = (count / total_low_level) * 100
            reasoning.append(f"{area_name}: {count} grants ({percentage:.1f}%)")

    return bonus, reasoning

## Step 17: Keywords Bonus

In [17]:
def calculate_keywords_bonus_rp(funder_grants_df, user_keywords):
    """
    Calculates a bonus based on exact keyword matches between user and funder's recipients.

    The bonus depends on the share of the user's distinct keywords that appear among the
    recipients' keywords: it rises linearly from 1.0 (no matches) to 1.05 (half matched), stays
    at 1.05 up to 90%, and is 1.1 from 90% upwards, so a higher share never gives a lower bonus.

    Args:
        funder_grants_df: the funder's grants; "recipient_extracted_class" holds each recipient's
            keywords as a list or a json-encoded list (None, nan and "" are skipped).
        user_keywords: the user's keywords, in the same formats.

    Returns:
        (bonus, reasoning): the multiplier and up to 10 "<keyword>: <count> occurrences" strings,
        or a single explanatory string when nothing could be matched.
    """

    def to_keyword_list(value):
        #parse json (a blank string would make json.loads raise, so treat it as empty)
        if isinstance(value, str):
            value = json.loads(value) if value.strip() else []
        #missing values (None/nan) mean no keywords
        if value is None or isinstance(value, float):
            return []
        return list(value)

    if funder_grants_df.empty:
        return 1.0, ["No grants history available"]

    #de-duplicate the user's keywords (order kept) so a repeated keyword does not deflate the match share
    user_keywords = list(dict.fromkeys(to_keyword_list(user_keywords)))

    if not user_keywords:
        return 1.0, ["No user keywords to match"]

    #count every recipient keyword once up front instead of re-scanning the list per user keyword
    recipient_counts = {}
    for recipient_keywords in funder_grants_df["recipient_extracted_class"]:
        for keyword in to_keyword_list(recipient_keywords):
            recipient_counts[keyword] = recipient_counts.get(keyword, 0) + 1

    if not recipient_counts:
        return 1.0, ["No recipient keywords available"]

    #find exact matches and their frequency
    matched_keywords = {
        user_kw: recipient_counts[user_kw]
        for user_kw in user_keywords
        if user_kw in recipient_counts
    }

    #calculate match percentage
    match_percentage = len(matched_keywords) / len(user_keywords)

    #calculate bonus
    if match_percentage >= 0.9:
        bonus = 1.1
    elif match_percentage >= 0.5:
        bonus = 1.05
    else:
        #slope of 0.1 reaches exactly 1.05 at 50%, joining the band above without a drop
        bonus = 1.0 + (match_percentage * 0.1)

    #build reasoning from top 10
    if not matched_keywords:
        reasoning = ["No exact keyword matches found"]
    else:
        sorted_matches = sorted(matched_keywords.items(), key=lambda item: item[1], reverse=True)
        reasoning = [f"{keyword}: {count} occurrences" for keyword, count in sorted_matches[:10]]

    return bonus, reasoning

## Step 18: Low-Variance Penalty

In [18]:
def calculate_lv_penalty(funder_grants_df):
    """
    Penalises funders whose previous grants went to only a small pool of recipients.

    Returns 0.7 when fewer than 30% of the funder's grants went to distinct recipients, and
    1.0 (no penalty) otherwise. Funders with fewer than 10 grants are never penalised.
    """

    #skip funders with low giving history
    total_grants = len(funder_grants_df)
    if total_grants < 10:
        return 1.0

    #find proportion of grants to unique recipients
    distinct_share = funder_grants_df['recipient_name'].nunique() / total_grants

    #calculate penalty
    return 0.7 if distinct_share < 0.3 else 1.0

-----

# The Alignment Score Calculator

In [19]:
def calculate_alignment_score(pairs_df, idx, grants_df, areas_df, hierarchies_df, model):
    """
    Runs all 18 scoring steps for a single funder-user pair and returns every individual result.

    NOTE(review): despite the name, the steps are not combined into one final number here - the
    caller receives the separate scores, bonuses, penalties and reasoning.

    Args:
        pairs_df: dataframe of funder-user pairs; funder columns are unprefixed and user columns
            start with "user_".
        idx: position of the pair within pairs_df (rows are read with .iloc, so this is
            positional, not an index label).
        grants_df: grants history, filtered here by "funder_num".
        areas_df: passed through to check_areas and calculate_areas_bonus_rp.
        hierarchies_df: passed through to check_areas and calculate_areas_bonus_rp.
        model: passed through to check_keywords to embed keywords.

    Returns:
        A flat tuple of 34 values, in the order of the return statement at the end of the function.
    """

    #get funder's data
    funder_num = pairs_df["funder_registered_num"].iloc[idx]
    #.copy() so the funder's subset is independent of grants_df
    funder_grants_df = grants_df[grants_df["funder_num"] == funder_num].copy()
    
    #1 check if funder has a single beneficiary
    is_sbf = pairs_df["is_potential_sbf"].iloc[idx]

    #2 check if funder states no unsolicited applications
    is_nua = pairs_df["is_nua"].iloc[idx]

    #3 check if funder is on the list
    is_on_list = pairs_df["is_on_list"].iloc[idx]
    #set() removes repeated list entries; None when the funder is not on the list
    list_reasoning = set(pairs_df["list_entries"].iloc[idx]) if is_on_list else None

    #4 check if funder has ever given a grant to applicant
    user_num = pairs_df["user_id"].iloc[idx]
    existing_relationship, num_grants, relationship = check_existing_relationship(grants_df, funder_num, user_num)

    #5 get areas score
    #.copy() requires these cells to hold list-like objects; a missing value (nan) would raise here
    funder_areas = pairs_df["areas"].iloc[idx].copy()
    user_areas = pairs_df["user_areas"].iloc[idx].copy()
    areas_score, areas_reasoning = check_areas(funder_areas, user_areas, areas_df, hierarchies_df)

    #6 get beneficiaries score
    funder_beneficiaries = pairs_df["beneficiaries"].iloc[idx].copy()
    user_beneficiaries = pairs_df["user_beneficiaries"].iloc[idx].copy()
    beneficiaries_score, beneficiaries_reasoning = check_beneficiaries(funder_beneficiaries, user_beneficiaries)

    #7 get causes score
    funder_causes = pairs_df["causes"].iloc[idx].copy()
    user_causes = pairs_df["user_causes"].iloc[idx].copy()
    causes_score, causes_reasoning = check_causes(funder_causes, user_causes)

    #8 get text semantic similarity score
    funder_embedding = pairs_df["concat_em"].iloc[idx]
    user_embedding = pairs_df["user_concat_em"].iloc[idx]
    text_similarity_score = calculate_similarity_score(funder_embedding, user_embedding)

    #9 get keyword semantic similarity score
    funder_keywords = pairs_df["extracted_class"].iloc[idx]
    user_keywords = pairs_df["user_extracted_class"].iloc[idx]
    keyword_similarity_score, keyword_strong_matches, keyword_reasoning, keyword_gets_bonus = check_keywords(funder_keywords, user_keywords, model)

    #10 get name (RP) semantic similarity score
    #dict keyed by recipient name: a recipient with several grants appears once (the last row wins)
    recipients_name_all_em = dict(zip(funder_grants_df["recipient_name"], funder_grants_df["recipient_name_em"]))
    user_name_em = pairs_df["user_name_em"].iloc[idx]
    user_name = pairs_df["user_name"].iloc[idx]
    name_score_rp, name_rp_reasoning = check_name_rp(recipients_name_all_em, user_name_em, user_name)

    #11 get grants (RP) semantic similarity score
    #keep only grants that have a title or a description
    non_empty_grants = funder_grants_df[
        (funder_grants_df["grant_title"].notna() & (funder_grants_df["grant_title"] != "")) |
        (funder_grants_df["grant_desc"].notna() & (funder_grants_df["grant_desc"] != ""))
    ]
    #NOTE(review): keyed by recipient name, so only the last grant of each recipient is compared - confirm this is intended
    grants_all_em = dict(zip(non_empty_grants["recipient_name"], non_empty_grants["grant_concat_em"]))
    user_concat_em = pairs_df["user_concat_em"].iloc[idx]
    user_name = pairs_df["user_name"].iloc[idx]
    grants_rp_score, grants_rp_reasoning = check_grants_rp(grants_all_em, user_concat_em, user_name)

    #12 get recipients (RP) semantic similarity score
    recipients_all_em = dict(zip(funder_grants_df["recipient_name"], funder_grants_df["recipient_concat_em"]))
    user_concat_em = pairs_df["user_concat_em"].iloc[idx]
    user_name = pairs_df["user_name"].iloc[idx]
    recipients_rp_score, recipients_rp_reasoning = check_recipients_rp(recipients_all_em, user_concat_em, user_name)

    #13 get sbf penalty
    sbf_penalty = 0.2 if is_sbf else 1.0

    #14 get keywords bonus
    if keyword_strong_matches:
        #NOTE(review): the UKCAT csv is downloaded on every call that has strong matches; this needs
        #network access and could be loaded once outside the function
        ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
        ukcat_df = pd.read_csv(ukcat_url)
        keywords_bonus = calculate_keywords_bonus(keyword_strong_matches, ukcat_df)
    else:
        keywords_bonus = 1.0

    #15 get relationship bonus
    if existing_relationship:
        time_lapsed, relationship_bonus, last_grant_year = calculate_relationship_bonus(relationship)
    else:
        #no relationship: neutral multiplier and no dates to report
        time_lapsed = None
        relationship_bonus = 1.0
        last_grant_year = None

    #16 get areas (RP) bonus
    user_areas = pairs_df["user_areas"].iloc[idx].copy()
    areas_rp_bonus, areas_rp_reasoning = calculate_areas_bonus_rp(funder_grants_df, user_areas, areas_df, hierarchies_df)

    #17 get keywords (RP) bonus
    user_keywords = pairs_df["user_extracted_class"].iloc[idx]
    keywords_rp_bonus, keywords_rp_reasoning = calculate_keywords_bonus_rp(funder_grants_df, user_keywords)

    #18 get low variance penalty
    lv_penalty = calculate_lv_penalty(funder_grants_df)
    
    #results are returned in step order; callers unpack by position
    return (is_sbf, is_nua, is_on_list, list_reasoning, existing_relationship, num_grants, relationship, areas_score, areas_reasoning, 
            beneficiaries_score, beneficiaries_reasoning, causes_score, causes_reasoning, text_similarity_score,
            keyword_similarity_score, keyword_strong_matches, keyword_reasoning, keyword_gets_bonus, name_score_rp, name_rp_reasoning,
            grants_rp_score, grants_rp_reasoning, recipients_rp_score, recipients_rp_reasoning, sbf_penalty, keywords_bonus,
            time_lapsed, relationship_bonus, last_grant_year, areas_rp_bonus, areas_rp_reasoning, keywords_rp_bonus, keywords_rp_reasoning, lv_penalty
    )

In [20]:
#calculate score for all pairs and display
#calculate_alignment_score reads rows with .iloc, so it needs the row's position; iterrows yields
#index labels, which only equal positions while pairs_df has its default 0..n-1 index
for position, (idx, row) in enumerate(pairs_df.iterrows()):
    result = calculate_alignment_score(pairs_df, position, grants_df, areas_df, hierarchies_df, model)
    format_score_test(idx, row, result)