# Data Preprocessing

# Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore")
import time
import re
import spacy
from prep_utils import extract_areas, extract_classifications

#make the parent of the working directory importable so the project modules can be found
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

#project-local helpers
#NOTE(review): extract_areas and extract_classifications are also imported from prep_utils earlier in this cell;
#the import below rebinds both names to the versions in utils - confirm which module is intended
from utils import get_table_from_supabase, extract_areas, extract_classifications
from data_importer import pipe_to_supabase

#read the supabase credentials from the environment after loading dotenv
load_dotenv()
url, key = os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

In [7]:
#pull the funders, grants and areas tables into dataframes (fetched in that order)
funders_df, grants_df, areas_df = (
    get_table_from_supabase(url, key, table_name)
    for table_name in ("funders", "grants", "areas")
)

#recipients are fetched separately with batch_size=50 and the recipient filter switched on
recipients_df = get_table_from_supabase(
    url,
    key,
    "recipients",
    batch_size=50,
    filter_recipients=True,
)

In [8]:
#folder that holds the pickled checkpoints (relative to the working directory)
checkpoint_folder = Path("8.1_checkpoints")

#uncomment to write the pre-embedding checkpoint from the dataframes currently in memory
# recipients_df.to_pickle(checkpoint_folder / "recipients_df.pkl")
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")
# areas_df.to_pickle(checkpoint_folder / "areas_df.pkl")

----

# Retrieving Data from Checkpoints (pre-embedding)

In [2]:
#folder that holds the pickled checkpoints (relative to the working directory)
checkpoint_folder = Path("8.1_checkpoints")

#restore the four pre-embedding dataframes from their pickles
#NOTE(review): this replaces any dataframes fetched from supabase earlier in the session with the pickled copies
recipients_df, funders_df, grants_df, areas_df = (
    pd.read_pickle(checkpoint_folder / filename)
    for filename in ("recipients_df.pkl", "funders_df.pkl", "grants_df.pkl", "areas_df.pkl")
)

-----

# Feature Creation - Extracted Classifications

Crucial information about funders' priorities is frequently buried within the narrative sections of annual accounts rather than captured in the Charity Commission's classification system. The structured data fields provided by the Charity Commission often contain only broad, generic categories that offer little practical value to applicants.

For example, the John Swire 1989 Charitable Trust lists its classifications as:
- **Causes**: General Charitable Purposes
- **Beneficiaries**: Other Charities Or Voluntary Bodies
- **Areas**: Throughout England

However, the Achievements and Performance section in the annual accounts states that the Trustees focused on certain causes in 2024 such as habitat conservation and NEET young people - and crucially, that the Trust 'also has a geographic focus to its grant making, with a particular interest in Kent and the Isle of Wight'. This discrepancy exemplifies why relying solely on structured classification data would produce misleading alignment scores. The formal categories suggest nationwide scope and universal charitable purposes, whilst the accounts reveal targeted geographic and thematic priorities. 

As such, I will attempt to capture any areas of focus that appear in the text sections of funders' data.

Similarly, data extracted from funders' accounts is likely to be incomplete, so extracting classifications data from grants and recipient information will be useful in filling gaps and revealing the funder's actual giving patterns beyond what they explicitly state. This grants-derived classification data can then be compared against both the user's input and the funder's stated criteria to identify alignment based on demonstrated behaviour rather than declared policy.

In [3]:
#text columns that are scanned for classifications in each table
funder_sections = [
    "activities",
    "objectives",
    "objectives_activities",
    "achievements_performance",
    "grant_policy",
]
grant_sections = [
    "grant_title",
    "grant_desc",
]
recipient_sections = [
    "recipient_name",
    "recipient_activities",
    "recipient_objectives",
]

## Named Entity Recognition

In [4]:
#spacy pipeline passed to extract_areas
nlp = spacy.load("en_core_web_lg")

#each entry: (dataframe, text columns to scan, label used in the progress message)
ner_data = [
    (funders_df, funder_sections, "funders"),
    (grants_df, grant_sections, "grants"),
    (recipients_df, recipient_sections, "recipients"),
]


def _extract_areas_timed(df, sections, name):
    """Store extract_areas output for every row in df["extracted_class"] and print the time taken."""
    started = time.time()
    df["extracted_class"] = df.apply(lambda row: extract_areas(row, sections, nlp), axis=1)
    print(f"Extraction complete for {name}. Total time: {time.time() - started:.2f}s")


#NOTE(review): the keyword matching cell further down assigns extract_classifications output to this same
#extracted_class column, replacing what is stored here - confirm whether the two results should be merged
for df, sections, name in ner_data:
    _extract_areas_timed(df, sections, name)

Extraction complete for funders. Total time: 27.79s
Extraction complete for grants. Total time: 127.78s
Extraction complete for recipients. Total time: 183.37s


## Keyword Matching

In [5]:
#read the ukcat classification csv straight from the github repo
#NOTE(review): the url points at the main branch, so the file contents may differ between runs
ukcat_url = "https://raw.githubusercontent.com/lico27/ukcat/main/data/ukcat.csv"
ukcat_df = pd.read_csv(filepath_or_buffer=ukcat_url)

In [9]:
#each entry: (dataframe, text columns to scan, label used in the progress message)
keyword_data = [
    (funders_df, funder_sections, "funders"),
    (grants_df, grant_sections, "grants"),
    (recipients_df, recipient_sections, "recipients"),
]


def _extract_classifications_timed(df, sections, name):
    """Store extract_classifications output for every row in df["extracted_class"] and print the time taken."""
    started = time.time()
    df["extracted_class"] = df.apply(
        lambda row: extract_classifications(row, sections, ukcat_df, areas_df),
        axis=1,
    )
    print(f"Classification extraction complete for {name}. Total time: {time.time() - started:.2f}s")


#run the extraction over each dataframe in turn
for df, sections, name in keyword_data:
    _extract_classifications_timed(df, sections, name)

Classification extraction complete for funders. Total time: 65.88s
Classification extraction complete for grants. Total time: 1038.02s
Classification extraction complete for recipients. Total time: 1020.46s


In [10]:
#spot-check the extraction using john swire 1989 (registered number 802142) as an example
is_john_swire = funders_df["registered_num"] == "802142"
print(funders_df.loc[is_john_swire, "extracted_class"].iloc[0])

['The United Kingdom', 'Families', 'Young people', 'Grant making', 'Grants to organisations', 'Charity and VCS support', 'Unemployment', 'Education', 'Training', 'Employability training', 'Wildlife', 'Historical conservation and restoration', 'Individual poverty', 'Hampshire', 'Isle Of Wight', 'Kent']


In [11]:
def _to_upper(classifications):
    """Return the classifications in upper case; anything that is not a list becomes an empty list."""
    if not isinstance(classifications, list):
        return []
    return [phrase.upper() for phrase in classifications]


def _without_grant_making(classifications):
    """Return the classifications minus "GRANT MAKING"; anything that is not a list becomes an empty list."""
    if not isinstance(classifications, list):
        return []
    return [phrase for phrase in classifications if phrase != "GRANT MAKING"]


#upper-case the extracted classifications in all three dataframes
for df in (funders_df, grants_df, recipients_df):
    df["extracted_class"] = df["extracted_class"].apply(_to_upper)

#"grant making" is only kept for funders
for df in (grants_df, recipients_df):
    df["extracted_class"] = df["extracted_class"].apply(_without_grant_making)

#give the grants and recipients columns table-specific names
#NOTE(review): after this rename the cell cannot be re-run without reloading, as extracted_class no longer exists on these two frames
grants_df = grants_df.rename(columns={"extracted_class": "grant_extracted_class"})
recipients_df = recipients_df.rename(columns={"extracted_class": "recipient_extracted_class"})

# Embedding Creation - Single Columns

I will create embeddings for the relevant columns. I have chosen to run each dataframe one at a time to separate these time- and compute-heavy processes.

In [12]:
#sentence-transformers model used by the embedding cells below
model = SentenceTransformer(model_name_or_path="all-roberta-large-v1")

In [13]:
#funder columns that each get their own embedding
funders_cols = [
    "name",
    "activities",
    "objectives",
    "objectives_activities",
    "achievements_performance",
    "grant_policy",
]

#embed one column at a time and store the vectors in a matching "<column>_em" column
start_time = time.time()
for col in funders_cols:
    #NOTE(review): missing values are encoded as empty strings, so rows without text all share the same vector
    embeddings = model.encode(funders_df[col].fillna("").tolist())
    funders_df[f"{col}_em"] = [vector for vector in embeddings]
    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for funders_df!")
print(f"Total time: {elapsed_time:.2f}s")

name embedded successfully (996, 1024)
activities embedded successfully (996, 1024)
objectives embedded successfully (996, 1024)
objectives_activities embedded successfully (996, 1024)
achievements_performance embedded successfully (996, 1024)
grant_policy embedded successfully (996, 1024)
All embeddings created for funders_df!
Total time: 139.82s


In [14]:
#recipient columns that each get their own embedding
recipients_cols = [
    "recipient_name",
    "recipient_activities",
    "recipient_objectives",
]

#embed one column at a time and store the vectors in a matching "<column>_em" column
start_time = time.time()
for col in recipients_cols:
    #NOTE(review): missing values are encoded as empty strings, so rows without text all share the same vector
    embeddings = model.encode(recipients_df[col].fillna("").tolist())
    recipients_df[f"{col}_em"] = [vector for vector in embeddings]
    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for recipients_df!")
print(f"Total time: {elapsed_time:.2f}s")

recipient_name embedded successfully (17169, 1024)
recipient_activities embedded successfully (17169, 1024)
recipient_objectives embedded successfully (17169, 1024)
All embeddings created for recipients_df!
Total time: 1005.15s


In [15]:
#grant columns that each get their own embedding
grants_cols = [
    "grant_title",
    "grant_desc",
]

#embed one column at a time and store the vectors in a matching "<column>_em" column
start_time = time.time()
for col in grants_cols:
    #NOTE(review): missing values are encoded as empty strings, so rows without text all share the same vector
    embeddings = model.encode(grants_df[col].fillna("").tolist())
    grants_df[f"{col}_em"] = [vector for vector in embeddings]
    print(f"{col} embedded successfully {embeddings.shape}")

elapsed_time = time.time() - start_time
print("All embeddings created for grants_df!")
print(f"Total time: {elapsed_time:.2f}s")

grant_title embedded successfully (32816, 1024)
grant_desc embedded successfully (32816, 1024)
All embeddings created for grants_df!
Total time: 786.82s


-----

# Embedding Creation - Concatenated Columns

In [16]:
#funder text columns that are joined into one string per row
#NOTE(review): objectives_activities is embedded on its own earlier but is not part of this list - confirm that is intended
funder_text_cols = ["activities", "objectives", "achievements_performance", "grant_policy"]

#join the columns with single spaces (missing values become empty strings), then lower-case the result
first_col, *other_cols = funder_text_cols
funders_df["concat_text"] = (
    funders_df[first_col]
    .fillna("")
    .str.cat([funders_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#embed the joined text and store one vector per row
start_time = time.time()
embeddings = model.encode(funders_df["concat_text"].tolist())
funders_df["concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Funders concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Funders concatenated text embedded successfully (996, 1024)
Total time: 79.67s


In [17]:
#recipient text columns that are joined into one string per row
recipient_text_cols = ["recipient_activities", "recipient_objectives"]

#join the columns with single spaces (missing values become empty strings), then lower-case the result
first_col, *other_cols = recipient_text_cols
recipients_df["concat_text"] = (
    recipients_df[first_col]
    .fillna("")
    .str.cat([recipients_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#embed the joined text and store one vector per row
start_time = time.time()
embeddings = model.encode(recipients_df["concat_text"].tolist())
recipients_df["recipient_concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Recipients concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Recipients concatenated text embedded successfully (17169, 1024)
Total time: 600.73s


In [18]:
#grant text columns that are joined into one string per row
grant_text_cols = ["grant_title", "grant_desc"]

#join the columns with single spaces (missing values become empty strings), then lower-case the result
first_col, *other_cols = grant_text_cols
grants_df["concat_text"] = (
    grants_df[first_col]
    .fillna("")
    .str.cat([grants_df[col].fillna("") for col in other_cols], sep=" ")
    .str.lower()
)

#embed the joined text and store one vector per row
start_time = time.time()
embeddings = model.encode(grants_df["concat_text"].tolist())
grants_df["grant_concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Grants concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Grants concatenated text embedded successfully (32816, 1024)
Total time: 470.03s


In [19]:
#the joined text was only needed to build the embeddings, so remove it from each dataframe
funders_df = funders_df.drop("concat_text", axis="columns")
recipients_df = recipients_df.drop("concat_text", axis="columns")
grants_df = grants_df.drop("concat_text", axis="columns")

In [20]:
#create checkpoint - save dfs to pickle
# recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")
# funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
# grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")

----

# Retrieving Data from Checkpoints (post-embedding)

In [21]:
#folder that holds the pickled checkpoints (relative to the working directory)
checkpoint_folder = Path("8.1_checkpoints")

#uncomment to resume from the post-embedding checkpoint instead of recomputing the embeddings
# recipients_df = pd.read_pickle(checkpoint_folder / "recipients_df_em.pkl")
# funders_df = pd.read_pickle(checkpoint_folder / "funders_df_em.pkl")
# grants_df = pd.read_pickle(checkpoint_folder / "grants_df_em.pkl")

-----

# Boolean Feature Creation - Single-Beneficiary Funders

When extracting grants from PDF accounts, the LLM at times failed to interpret entries properly, extracting purchase descriptions as recipient names (e.g. 'cricket balls for the Year 7 team' from a PTA's accounts) or recording vague summaries as distinct recipients (e.g. 'five various causes'). These errors are particularly prevalent in accounts from single-beneficiary funders where expenditure descriptions differ structurally from multi-recipient grant listings. I will seek to flag funders that are likely to be single-beneficiary, as this is information that can be shared with the end user if they want to check their alignment score with such a funder (e.g. 'ABC Trust seems to only give grants to one recipient, ABC School, so may not be a suitable funder to approach with an application'.)

I will use some typical "clue" words that indicate a single-beneficiary funder: if a funder's name contains "school", "friends of", "PTA", "church" or "hospital", or its activities mention "pupil(s)", `is_potential_sbf` will be assigned True.

In [22]:
#check name and activities for giveaway single-beneficiary words
#"PTA" is matched as a whole word so it no longer fires inside longer words (e.g. "CAPTAIN", "ADAPTATION")
name_check = funders_df["name"].str.contains(
    r"SCHOOL|FRIENDS OF|\bPTA\b|CHURCH|HOSPITAL",
    case=False,
    na=False
)
#the previous pattern "PUPIL?" made the final "L" optional rather than allowing a plural;
#match "pupil" or "pupils" as whole words instead
activities_check = funders_df["activities"].str.contains(
    r"\bPUPILS?\b",
    case=False,
    na=False
)

#a funder is flagged when either check matches; missing names/activities count as no match (na=False)
funders_df["is_potential_sbf"] = name_check | activities_check

print(f"Flagged as potential SBFs: {funders_df['is_potential_sbf'].sum()}")

Flagged as potential SBFs: 121


In [23]:
#display the row for one funder that was flagged as a potential sbf
is_example_sbf = funders_df["registered_num"] == "1081835"
funders_df.loc[is_example_sbf]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,...,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class
122,1081835,FRIENDS OF THAMES DITTON INFANT SCHOOL,https://www.friendsoftdis.co.uk,FUND RAISING ACTIVITIES TO SUPPLY OUR SCHOOL W...,TO ADVANCE THE EDUCATION OF THE PUPILS AT THAM...,31638.0,42665.0,,,,...,False,False,"[0.003010355, 0.016931988, -0.0035435099, -0.0...","[0.0031424793, 0.008433894, -0.0058086696, -0....","[-0.004439995, 0.024636827, -0.002953147, -0.0...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.031251892, 0.0021451209, 0.027800791, -0.0...","[FUNDRAISING, NURSERY, EDUCATION, SCHOOLS]"


---

# Boolean Feature Creation - 'No Unsolicited Applications'

An incredibly important factor in prospecting is whether funders accept unsolicited applications. This information is frequently absent from formal eligibility criteria and instead appears subtly in trustees' reports or governance statements, often using varied terminology such as 'by invitation only', 'Trustees proactively identify causes to support', or 'we do not encourage unsolicited applications'. I will create a boolean feature to flag funders where their text sections contain such phrases to indicate that they may not be accommodating to unsolicited applications.

In [24]:
#phrases that suggest a funder does not accept unsolicited applications (one alternative per line)
nua_pattern = (
    r"unsolicited|"
    r"by invitation only|"
    r"proactive in seeking|"
    r"unusual .{0,20} unsolicited|"
    r"no .{0,20} application process|"
    r"trustees identify|"
    r"proactively identify|"
    r"not currently accepting|"
    r"closed to applications"
)
nua_regex = re.compile(nua_pattern, re.IGNORECASE)

#funder text sections to search
text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]


def _mentions_nua(row):
    """Return True if any truthy text section in the row contains one of the nua phrases."""
    for col in text_cols:
        value = row[col]
        #falsy values (e.g. None or an empty string) are skipped; everything else is searched as a string
        if value and nua_regex.search(str(value)):
            return True
    return False


funders_df["is_nua"] = funders_df[text_cols].apply(_mentions_nua, axis=1)

print(f"Flagged as NUA: {funders_df['is_nua'].sum()}")

Flagged as NUA: 13


In [25]:
#display the row for one funder that was flagged as nua
is_example_nua = funders_df["registered_num"] == "299918"
funders_df[is_example_nua]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,...,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,extracted_class
908,299918,C A REDFERN CHARITABLE FOUNDATION,,THE FOUNDATION IS A GRANT MAKING CHARITY THE T...,SUCH EXCLUSIVELY CHARITABLE OBJECTS AND PURPOS...,161278.0,250394.0,THE TRUST IS A GRANT MAKING CHARITY AND THE CA...,"DURING THE YEAR THE TRUST MADE GRANTS OF £205,...",THE TRUSTEES MEET REGULARLY TO DISCUSS THE MAK...,...,False,True,"[-0.015236849, 0.009997091, -0.020667668, -0.0...","[-0.008504445, 0.01241487, -0.0189871, -0.0055...","[-0.025489898, 0.045070395, -0.013425814, -0.0...","[0.0023606042, 0.005608919, -0.014298492, -0.0...","[-0.0056722607, 0.0015541456, 0.01542277, -0.0...","[0.0016173957, -0.00044143357, -0.022090923, 2...","[-0.0042508687, 0.003165915, 0.014033873, -0.0...","[GRANT MAKING, GRANTS TO ORGANISATIONS, POLICY..."


In [26]:
#create checkpoint - save dfs to pickle
#make sure the folder exists first: to_pickle fails if the target directory is missing,
#which would otherwise only surface after the long extraction and embedding cells have run
checkpoint_folder.mkdir(parents=True, exist_ok=True)

funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")
recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")

----

# Exporting Dataframes back to Supabase

In [31]:
def _array_to_list(value):
    """Return value.tolist() for numpy arrays; any other value is passed through unchanged."""
    return value.tolist() if isinstance(value, np.ndarray) else value


#turn the numpy arrays in every embedding column (suffix "_em") into plain lists before the upload
for df in (funders_df, grants_df, recipients_df):
    embedding_cols = [col for col in df.columns if col.endswith("_em")]
    for col in embedding_cols:
        df[col] = df[col].apply(_array_to_list)

#table name -> (dataframe, unique key column passed to pipe_to_supabase)
tables = {
    "funders": (funders_df, "registered_num"),
    "grants": (grants_df, "grant_id"),
    "recipients": (recipients_df, "recipient_id"),
}

#send each dataframe to its table with batch_size=5
for table_name, table_spec in tables.items():
    df, unique_key = table_spec
    pipe_to_supabase(df, table_name, unique_key, url, key, batch_size=5)

Successfully upserted all 17169 records to recipients
