# Data Preprocessing

# Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")
import time
import re
import spacy
from prep_utils import extract_areas, extract_classifications

#make the parent directory importable so the shared project helpers resolve
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase
from data_importer import pipe_to_supabase

#get keys from env (load_dotenv reads a .env file when one is present)
#both values are None when the variable is not set
load_dotenv()
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

In [2]:
#get tables and build dataframes - one call per table, in the order listed
funders_df, grants_df, areas_df = (
    get_table_from_supabase(url, key, table_name)
    for table_name in ("funders", "grants", "areas")
)

#get recipients with filter (explicit batch size and recipient filter switched on)
recipients_df = get_table_from_supabase(
    url,
    key,
    "recipients",
    batch_size=50,
    filter_recipients=True,
)

In [3]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")

#set to True to write a checkpoint of the freshly fetched tables
#(left off by default so a full run never overwrites an existing checkpoint)
SAVE_RAW_CHECKPOINT = False

#create checkpoint - save dfs to pickle
if SAVE_RAW_CHECKPOINT:
    #make sure the folder exists so to_pickle cannot fail on a missing directory
    checkpoint_folder.mkdir(parents=True, exist_ok=True)
    recipients_df.to_pickle(checkpoint_folder / "recipients_df.pkl")
    funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
    grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")

----

# Retrieving Data from Checkpoints (pre-embedding)

In [None]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")

#set to True to restore the pre-embedding dataframes instead of fetching from supabase
LOAD_RAW_CHECKPOINT = False

if LOAD_RAW_CHECKPOINT:
    #only load pickles you created yourself - unpickling can execute arbitrary code
    recipients_df = pd.read_pickle(checkpoint_folder / "recipients_df.pkl")
    funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
    grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")

-----

# Embedding Creation - Single Columns

I will create embeddings for the relevant columns. I have chosen to run each dataframe one at a time to separate these time- and compute-heavy processes.

In [2]:
#load the sentence-transformer used for every embedding in this notebook
#(the embedding outputs recorded below have 1024 columns)
model = SentenceTransformer(model_name_or_path="all-roberta-large-v1")

In [5]:
funders_cols = [
    "name",
    "activities",
    "objectives",
    "objectives_activities",
    "achievements_performance",
    "grant_policy",
]

#create embeddings - one pass per column, timed as a whole
start_time = time.time()
for col in funders_cols:
    #missing values are encoded as empty strings
    texts = funders_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    funders_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for funders_df!")
print(f"Total time: {elapsed_time:.2f}s")

name embedded successfully (996, 1024)
activities embedded successfully (996, 1024)
objectives embedded successfully (996, 1024)
objectives_activities embedded successfully (996, 1024)
achievements_performance embedded successfully (996, 1024)
grant_policy embedded successfully (996, 1024)
All embeddings created for funders_df!
Total time: 138.14s


In [6]:
recipients_cols = [
    "recipient_name",
    "recipient_activities",
    "recipient_objectives",
]

#create embeddings - one pass per column, timed as a whole
start_time = time.time()
for col in recipients_cols:
    #missing values are encoded as empty strings
    texts = recipients_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    recipients_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for recipients_df!")
print(f"Total time: {elapsed_time:.2f}s")

recipient_name embedded successfully (17169, 1024)
recipient_activities embedded successfully (17169, 1024)
recipient_objectives embedded successfully (17169, 1024)
All embeddings created for recipients_df!
Total time: 952.63s


In [7]:
grants_cols = [
    "grant_title",
    "grant_desc",
]

#create embeddings - one pass per column, timed as a whole
start_time = time.time()
for col in grants_cols:
    #missing values are encoded as empty strings
    texts = grants_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    grants_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for grants_df!")
print(f"Total time: {elapsed_time:.2f}s")

grant_title embedded successfully (32816, 1024)
grant_desc embedded successfully (32816, 1024)
All embeddings created for grants_df!
Total time: 759.85s


-----

# Embedding Creation - Concatenated Columns

In [15]:
#concatenate funder columns
funder_text_cols = ["activities", "objectives", "achievements_performance", "grant_policy"]

#join the columns with single spaces (missing values become empty strings), then make lowercase
first_col, *other_cols = funder_text_cols
funders_df["concat_text"] = (
    funders_df[first_col]
    .fillna("")
    .str.cat([funders_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#create embeddings - only the encoding step is timed
start_time = time.time()
texts = funders_df["concat_text"].tolist()
embeddings = model.encode(texts)
funders_df["concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Funders concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Funders concatenated text embedded successfully (996, 1024)
Total time: 55.38s


In [19]:
#concatenate recipient columns
recipient_text_cols = ["recipient_activities", "recipient_objectives"]

#join the columns with single spaces (missing values become empty strings), then make lowercase
first_col, *other_cols = recipient_text_cols
recipients_df["concat_text"] = (
    recipients_df[first_col]
    .fillna("")
    .str.cat([recipients_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#create embeddings - only the encoding step is timed
start_time = time.time()
texts = recipients_df["concat_text"].tolist()
embeddings = model.encode(texts)
recipients_df["recipient_concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Recipients concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Recipients concatenated text embedded successfully (17169, 1024)
Total time: 497.92s


In [22]:
#concatenate grant columns
grant_text_cols = ["grant_title", "grant_desc"]

#join the columns with single spaces (missing values become empty strings), then make lowercase
first_col, *other_cols = grant_text_cols
grants_df["concat_text"] = (
    grants_df[first_col]
    .fillna("")
    .str.cat([grants_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#create embeddings - only the encoding step is timed
start_time = time.time()
texts = grants_df["concat_text"].tolist()
embeddings = model.encode(texts)
grants_df["grant_concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Grants concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Grants concatenated text embedded successfully (32816, 1024)
Total time: 432.19s


In [24]:
#drop concatenated text - it was only needed as input to the encoder
#errors="ignore" keeps the cell re-runnable: a second run no longer raises KeyError once the column is gone
funders_df = funders_df.drop(columns=["concat_text"], errors="ignore")
recipients_df = recipients_df.drop(columns=["concat_text"], errors="ignore")
grants_df = grants_df.drop(columns=["concat_text"], errors="ignore")

In [25]:
#set to True to write a checkpoint of the embedded dataframes
#(left off by default so a full run never overwrites an existing checkpoint)
SAVE_EMBEDDED_CHECKPOINT = False

#create checkpoint - save dfs to pickle
if SAVE_EMBEDDED_CHECKPOINT:
    #make sure the folder exists so to_pickle cannot fail on a missing directory
    checkpoint_folder.mkdir(parents=True, exist_ok=True)
    recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")
    funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
    grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")

----

# Retrieving Data from Checkpoints (post-embedding)

In [3]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")

#restore the embedded dataframes from their pickles (recipients, funders, grants - in that order)
#NOTE(review): this replaces any dataframes of the same name already in memory
#only load pickles you created yourself - unpickling can execute arbitrary code
recipients_df, funders_df, grants_df = (
    pd.read_pickle(checkpoint_folder / file_name)
    for file_name in ("recipients_df_em.pkl", "funders_df_em.pkl", "grants_df_em.pkl")
)

-----

# Boolean Feature Creation - Single-Beneficiary Funders

When extracting grants from PDF accounts, the LLM at times failed to interpret entries properly, extracting purchase descriptions as recipient names (e.g. 'cricket balls for the Year 7 team' from a PTA's accounts) or recording vague summaries as distinct recipients (e.g. 'five various causes'). These errors are particularly prevalent in accounts from single-beneficiary funders where expenditure descriptions differ structurally from multi-recipient grant listings. I will seek to flag funders that are likely to be single-beneficiary, as this is information that can be shared with the end user if they want to check their alignment score with such a funder (e.g. 'ABC Trust seems to only give grants to one recipient, ABC School, so may not be a suitable funder to approach with an application'.)

I will use some typical "clue" words that indicate a single-beneficiary funder: if a funder's name contains 'school', 'friends of', 'PTA', 'church' or 'hospital', or if its activities mention pupils, `is_potential_sbf` will be assigned True.

In [4]:
#check name and activities for giveaway single-beneficiary words
#matching is case-insensitive and missing values count as "no match"
#PTA is matched as a whole word so names such as "CAPTAIN ..." or "ADAPTATION ..." are not flagged,
#and the negative lookahead stops "CHURCHILL" being read as a church
name_check = funders_df["name"].str.contains(
    r"SCHOOL|FRIENDS OF|\bPTA\b|CHURCH(?!ILL)|HOSPITAL",
    case=False,
    na=False
)
#"PUPIL" covers pupil/pupils; the earlier "PUPIL?" made the L optional and so matched any "PUPI"
activities_check = funders_df["activities"].str.contains(
    r"PUPIL",
    case=False,
    na=False
)

#a funder is flagged when either check hits
funders_df["is_potential_sbf"] = name_check | activities_check

print(f"Flagged as potential SBFs: {funders_df['is_potential_sbf'].sum()}")

Flagged as potential SBFs: 121


In [5]:
#view an sbf funder (rows whose registered number matches)
funders_df.loc[funders_df["registered_num"] == "1081835"]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em
122,1081835,FRIENDS OF THAMES DITTON INFANT SCHOOL,https://www.friendsoftdis.co.uk,FUND RAISING ACTIVITIES TO SUPPLY OUR SCHOOL W...,TO ADVANCE THE EDUCATION OF THE PUPILS AT THAM...,31638.0,42665.0,,,,True,False,False,"[0.003010355, 0.016931988, -0.0035435099, -0.0...","[0.0031424793, 0.008433894, -0.0058086696, -0....","[-0.004439995, 0.024636827, -0.002953147, -0.0...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.031251892, 0.0021451209, 0.027800791, -0.0..."


---

# Boolean Feature Creation - 'No Unsolicited Applications'

An incredibly important factor in prospecting is whether funders accept unsolicited applications. This information is frequently absent from formal eligibility criteria and instead appears subtly in trustees' reports or governance statements, often using varied terminology such as 'by invitation only', 'Trustees proactively identify causes to support', or 'we do not encourage unsolicited applications'. I will create a boolean feature to flag funders where their text sections contain such phrases to indicate that they may not be accommodating to unsolicited applications.

In [6]:
#define nua patterns
#NOTE(review): the bare "unsolicited" alternative already matches everything that
#"unusual .{0,20} unsolicited" can match, and it also hits text that welcomes unsolicited applications
nua_pattern = (
    r"unsolicited|by invitation only|proactive in seeking|"
    r"unusual .{0,20} unsolicited|no .{0,20} application process|trustees identify|"
    r"proactively identify|not currently accepting|closed to applications"
)

#check text sections for nua phrases
#vectorised, case-insensitive search per column; missing text is treated as empty and never matches
text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]
is_nua = pd.Series(False, index=funders_df.index)
for col in text_cols:
    is_nua |= funders_df[col].fillna("").astype(str).str.contains(nua_pattern, case=False, regex=True)

#a funder is flagged when any of its text sections contains a phrase
funders_df["is_nua"] = is_nua

print(f"Flagged as NUA: {funders_df['is_nua'].sum()}")

Flagged as NUA: 13


In [7]:
#view an nua funder (rows whose registered number matches)
funders_df.loc[funders_df["registered_num"].eq("299918")]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em
908,299918,C A REDFERN CHARITABLE FOUNDATION,,THE FOUNDATION IS A GRANT MAKING CHARITY THE T...,SUCH EXCLUSIVELY CHARITABLE OBJECTS AND PURPOS...,161278.0,250394.0,THE TRUST IS A GRANT MAKING CHARITY AND THE CA...,"DURING THE YEAR THE TRUST MADE GRANTS OF £205,...",THE TRUSTEES MEET REGULARLY TO DISCUSS THE MAK...,False,False,True,"[-0.015236849, 0.009997091, -0.020667668, -0.0...","[-0.008504445, 0.01241487, -0.0189871, -0.0055...","[-0.025489898, 0.045070395, -0.013425814, -0.0...","[0.0023606042, 0.005608919, -0.014298492, -0.0...","[-0.0056722607, 0.0015541456, 0.01542277, -0.0...","[0.0016173957, -0.00044143357, -0.022090923, 2...","[-0.0042508687, 0.003165915, 0.014033873, -0.0..."


---

# Feature Creation - Extracted Classifications

Crucial information about funders' priorities is frequently buried within the narrative sections of annual accounts rather than captured in the Charity Commission's classification system. The structured data fields provided by the Charity Commission often contain only broad, generic categories that can actually offer little practical value to applicants.

For example, the John Swire 1989 Charitable Trust lists its classifications as:
- **Causes**: General Charitable Purposes
- **Beneficiaries**: Other Charities Or Voluntary Bodies
- **Areas**: Throughout England

However, the Achievements and Performance section in the annual accounts states that the Trustees focused on certain causes in 2024 such as habitat conservation and NEET young people - and crucially, that the Trust 'also has a geographic focus to its grant making, with a particular interest in Kent and the Isle of Wight'. This discrepancy exemplifies why relying solely on structured classification data would produce misleading alignment scores. The formal categories suggest nationwide scope and universal charitable purposes, whilst the accounts reveal targeted geographic and thematic priorities. 

As such, I will attempt to capture any areas of focus that appear in the text sections of funders' data.

In [8]:
#text sections of the funders table that are searched for areas and classifications
section_cols = [
    "activities",
    "objectives",
    "objectives_activities",
    "achievements_performance",
    "grant_policy",
]

## Named Entity Recognition

In [9]:
#load spacy
nlp = spacy.load("en_core_web_lg")

#extract location entities from funders df - extract_areas is called once per row
#NOTE(review): the keyword-matching step further down assigns the same "extracted_class" column;
#confirm whether extract_classifications keeps these NER results or replaces them
start_time = time.time()
funders_df["extracted_class"] = funders_df.apply(
    extract_areas,
    axis=1,
    args=(section_cols, nlp),
)
elapsed_time = time.time() - start_time

print(f"Extraction complete. Total time: {elapsed_time:.2f}s")

Extraction complete. Total time: 27.68s


In [10]:
#preview the first five funders, including the new extracted_class column
funders_df.head(n=5)

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,...,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class
0,1001255,MARY ELIZABETH SIEBEL CHARITY,https://www.siebelcharity.co.uk,TO PROVIDE BENEFITS FOR PEOPLE OVER 60 YEARS O...,FOR THE RELIEF OF SUCH OLD PEOPLE WHO ARE ILL ...,111003.0,127864.0,THE OBJECTS OF THE CHARITY ARE TO PROVIDE BENE...,THE TRUSTEES CONSIDER THAT THE PERFORMANCE OF ...,GRANT APPLICATION FORMS ARE AVAILABLE FROM THE...,...,False,False,"[-0.00885222, 0.002967594, -0.023896331, -0.04...","[-0.01666838, 0.060163833, -0.014278077, 0.000...","[-0.024755005, 0.07678002, -0.03508068, 0.0127...","[-0.018843979, 0.025924828, -0.030358432, -0.0...","[-0.0039577875, 0.013076805, -0.0073595895, -0...","[-0.019841895, 0.0031771406, -0.027174626, -0....","[-0.016057273, 0.03979668, -0.01749037, -0.009...","[Newark, Town Hall, Newark Town Hall]"
1,1001423,ROBERT B MASSEY MEMORIAL TRUST,,1. TO PROVIDE IN INTERESTS OF SOCIAL WELFARE A...,A. TO PROVIDE IN THE AREA OF BENEFIT A PUBLIC ...,169806.0,177241.0,THE ROBERT B MASSEY MEMORIAL TRUST WAS ESTABLI...,THE MANAGEMENT COMMITTEE HAS CONTINUED TO UPKE...,,...,False,False,"[0.023935895, 0.02577662, -0.053395335, -0.014...","[0.00015603875, 0.047771093, -0.05095264, -0.0...","[-0.029350128, 0.0048018326, -0.005340717, -0....","[-0.014953703, 0.027399069, -0.04600057, -0.01...","[-0.0073936884, 0.01763689, -0.022457168, -0.0...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.016407313, 0.015341417, 0.010400977, -0.01...",[The County Of East Yorkshire]
2,1001456,DWEK FAMILY CHARITABLE TRUST,,THE TRUST IS ESTABLISHED FOR THE BENEFIT OF SU...,GENERAL CHARITABLE PURPOSES,34295.0,43777.0,THE TRUST IS ESTABLISHED FOR THE BENEFIT OF SU...,THE TRUSTEES CONTINUE TO SUPPORT MANY WORTHY C...,,...,False,False,"[0.032613873, -0.024674801, -0.030953802, -0.0...","[0.01056072, 0.040060367, 0.0013433765, -0.026...","[-0.012934899, 0.022771241, -0.018087821, -0.0...","[-0.0064151892, 0.040589783, -0.008932285, -0....","[-0.017403448, -0.004005901, -0.020300724, -0....","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.017596856, -0.008768029, -0.013156795, -0....",[]
3,1001761,FREE SCHOOL CHARITY,,MAINTAINS THE FREE SCHOOL AND INSTITUTE FOR US...,1 MAINTENANCE AND REPAIR OF THE PARISH CHURCH ...,54965.0,38000.0,THE CHARITY HAS THE FOLLOWING ACTIVITIES AND O...,THE CHARITY HAS CONTINUED TO CARRY OUT ITS OBJ...,,...,False,False,"[-0.035740297, 0.0031257013, 0.0030229306, -0....","[-0.007762254, 0.01976979, -0.021402722, -0.05...","[-0.019827114, 0.03187082, -0.022778586, -0.04...","[0.016433666, 0.03430659, -0.025083642, -0.043...","[0.024749815, 0.025092356, -0.013807025, -0.02...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.023906833, 0.0008785544, 0.027913362, -0.0...",[]
4,1002295,FRIENDS OF BULGARIA,https://www.friendsofbulgaria.org.uk,FRIENDS OF BULGARIA PROVIDES HUMANITARIAN AID ...,ADVANCEMENT OF EDUCATION IN BULGARIA AND FOR T...,13399.0,19929.0,,,,...,False,False,"[0.014371116, 0.057511616, -0.015682492, -0.00...","[0.007626344, 0.04028357, -0.014303379, -0.044...","[-0.0038710502, 0.055107836, -0.008044696, 0.0...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[0.009647214, 0.05919301, 0.017981695, -0.0154...",[Bulgaria]


## Keyword Matching

In [11]:
#load classifications data straight from the raw csv on github
#NOTE(review): the url points at the main branch, so the file can change between runs - consider pinning a commit
ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
ukcat_df = pd.read_csv(filepath_or_buffer=ukcat_url)

In [14]:
#extract classifications from funders - extract_classifications is called once per row
#NOTE(review): areas_df is only created by the supabase fetch cell and has no checkpoint,
#so that cell must have been run in this session even when resuming from the pickles
#NOTE(review): this assigns the same "extracted_class" column as the NER step - confirm that is intended
start_time = time.time()
funders_df["extracted_class"] = funders_df.apply(
    extract_classifications,
    axis=1,
    args=(section_cols, ukcat_df, areas_df),
)
elapsed_time = time.time() - start_time

print(f"Cause extraction complete. Total time: {elapsed_time:.2f}s")

Cause extraction complete. Total time: 64.91s


In [18]:
#verify extraction using john swire 1989 as an example
#select the matching row and column in one .loc call, then print the first (only expected) value
is_john_swire = funders_df["registered_num"] == "802142"
print(funders_df.loc[is_john_swire, "extracted_class"].iloc[0])

['The United Kingdom', 'Families', 'Young people', 'Grant making', 'Grants to organisations', 'Charity and VCS support', 'Unemployment', 'Education', 'Training', 'Employability training', 'Wildlife', 'Historical conservation and restoration', 'Individual poverty', 'Hampshire', 'Isle Of Wight', 'Kent']


In [19]:
#set to True to write a checkpoint of funders_df with the derived features
#NOTE(review): this writes to the same file name as the post-embedding checkpoint, so that
#checkpoint will then also contain the feature columns - confirm this is intended
SAVE_FUNDERS_CHECKPOINT = False

#create checkpoint - save df to pickle
if SAVE_FUNDERS_CHECKPOINT:
    #make sure the folder exists so to_pickle cannot fail on a missing directory
    checkpoint_folder.mkdir(parents=True, exist_ok=True)
    funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")

----

# Exporting Dataframes back to Supabase

In [60]:
#convert arrays to lists
def _to_plain_list(value):
    """Return value.tolist() for numpy arrays and the value itself for anything else."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    return value

#every column whose name ends in "_em" is converted in place
for df in (funders_df, recipients_df, grants_df):
    embedding_cols = [name for name in df.columns if name.endswith('_em')]
    for col in embedding_cols:
        df[col] = df[col].apply(_to_plain_list)

#dictionary to hold tables and their keys: table name -> (dataframe, unique key column)
tables = {
    "funders": (funders_df, "registered_num"),
    "grants": (grants_df, "grant_id"),
    "recipients": (recipients_df, "recipient_id")
}

#pipe data to supabase, one table at a time in the order declared above
for table_name in tables:
    df, unique_key = tables[table_name]
    pipe_to_supabase(df, table_name, unique_key, url, key, batch_size=50)

Successfully upserted all 996 records to funders
Successfully upserted all 32816 records to grants
Successfully upserted all 17169 records to recipients
