# Data Preprocessing

# Setup

In [44]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
colours = sns.color_palette("Set2")
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")
import time
import re

#put the parent directory at the front of the import path (only once) so the
#project-local modules imported just below can be resolved
project_root = os.path.abspath('..')
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase
from data_importer import pipe_to_supabase

#read the supabase connection settings from the environment
#(load_dotenv() runs first so values from a .env file are available)
load_dotenv()
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

In [2]:
#fetch the funders and grants tables as dataframes
funders_df = get_table_from_supabase(url, key, "funders")
grants_df = get_table_from_supabase(url, key, "grants")

#recipients are fetched in batches of 50 with the recipient filter switched on
recipient_fetch_options = {"batch_size": 50, "filter_recipients": True}
recipients_df = get_table_from_supabase(url, key, "recipients", **recipient_fetch_options)

In [3]:
#get checkpoint folder (relative to the notebook's working directory)
checkpoint_folder = Path("./8.1_checkpoints/")

#create checkpoint - save dfs to pickle
#the writes are commented out; uncomment them to (re)write the raw, pre-embedding checkpoints
# recipients_df.to_pickle(checkpoint_folder / "recipients_df.pkl")
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")

----

# Retrieving Data from Checkpoints (pre-embedding)

In [None]:
#get checkpoint folder (relative to the notebook's working directory)
checkpoint_folder = Path("./8.1_checkpoints/")

#the reads are commented out; uncomment them to restore the raw, pre-embedding
#dataframes from pickle instead of fetching them from supabase
# recipients_df = pd.read_pickle(checkpoint_folder / "recipients_df.pkl")
# funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
# grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")

-----

# Embedding Creation - Single Columns

I will create embeddings for the relevant columns. I have chosen to run each dataframe one at a time to separate these time- and compute-heavy processes.

In [2]:
#sentence-transformers model used by the embedding cells below
EMBEDDING_MODEL_NAME = "all-roberta-large-v1"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
#text columns of funders_df that each get their own embedding column
funders_cols = ["name", "activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]

#encode one column at a time and time the whole run
start_time = time.time()
for col in funders_cols:
    #missing values are encoded as the empty string
    #NOTE(review): every row with missing text therefore gets the same vector - confirm downstream scoring allows for this
    column_vectors = model.encode(funders_df[col].fillna("").tolist())

    #one vector per row, stored in a new "<col>_em" column
    funders_df[f"{col}_em"] = list(column_vectors)

    print(f"{col} embedded successfully {column_vectors.shape}")

elapsed_time = time.time() - start_time
print(f"All embeddings created for funders_df!")
print(f"Total time: {elapsed_time:.2f}s")

name embedded successfully (996, 1024)
activities embedded successfully (996, 1024)
objectives embedded successfully (996, 1024)
objectives_activities embedded successfully (996, 1024)
achievements_performance embedded successfully (996, 1024)
grant_policy embedded successfully (996, 1024)
All embeddings created for funders_df!
Total time: 138.14s


In [6]:
#text columns of recipients_df that each get their own embedding column
recipients_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#encode one column at a time and time the whole run
start_time = time.time()
for col in recipients_cols:
    #missing values are encoded as the empty string
    column_vectors = model.encode(recipients_df[col].fillna("").tolist())

    #one vector per row, stored in a new "<col>_em" column
    recipients_df[f"{col}_em"] = list(column_vectors)

    print(f"{col} embedded successfully {column_vectors.shape}")

elapsed_time = time.time() - start_time
print(f"All embeddings created for recipients_df!")
print(f"Total time: {elapsed_time:.2f}s")

recipient_name embedded successfully (17169, 1024)
recipient_activities embedded successfully (17169, 1024)
recipient_objectives embedded successfully (17169, 1024)
All embeddings created for recipients_df!
Total time: 952.63s


In [7]:
#text columns of grants_df that each get their own embedding column
grants_cols = ["grant_title", "grant_desc"]

#encode one column at a time and time the whole run
start_time = time.time()
for col in grants_cols:
    #missing values are encoded as the empty string
    column_vectors = model.encode(grants_df[col].fillna("").tolist())

    #one vector per row, stored in a new "<col>_em" column
    grants_df[f"{col}_em"] = list(column_vectors)

    print(f"{col} embedded successfully {column_vectors.shape}")

elapsed_time = time.time() - start_time
print(f"All embeddings created for grants_df!")
print(f"Total time: {elapsed_time:.2f}s")

grant_title embedded successfully (32816, 1024)
grant_desc embedded successfully (32816, 1024)
All embeddings created for grants_df!
Total time: 759.85s


-----

# Embedding Creation - Concatenated Columns

In [15]:
#funder text fields that are merged into one passage per funder
funder_text_cols = ["activities", "objectives", "achievements_performance", "grant_policy"]

#join the fields with single spaces (missing values count as empty strings), then lowercase
first_col, *other_cols = funder_text_cols
funders_df["concat_text"] = (
    funders_df[first_col]
    .fillna("")
    .str.cat([funders_df[name].fillna("") for name in other_cols], sep=" ")
    .str.lower()
)

#embed the merged passage and time it
#NOTE(review): long passages may be truncated at the model's maximum sequence length - confirm this is acceptable
start_time = time.time()
concat_vectors = model.encode(funders_df["concat_text"].tolist())
funders_df["concat_em"] = list(concat_vectors)
elapsed_time = time.time() - start_time

print(f"Funders concatenated text embedded successfully {concat_vectors.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Funders concatenated text embedded successfully (996, 1024)
Total time: 55.38s


In [19]:
#recipient text fields that are merged into one passage per recipient
recipient_text_cols = ["recipient_activities", "recipient_objectives"]

#join the fields with single spaces (missing values count as empty strings), then lowercase
first_col, *other_cols = recipient_text_cols
recipients_df["concat_text"] = (
    recipients_df[first_col]
    .fillna("")
    .str.cat([recipients_df[name].fillna("") for name in other_cols], sep=" ")
    .str.lower()
)

#embed the merged passage and time it
#NOTE(review): long passages may be truncated at the model's maximum sequence length - confirm this is acceptable
start_time = time.time()
concat_vectors = model.encode(recipients_df["concat_text"].tolist())
recipients_df["recipient_concat_em"] = list(concat_vectors)
elapsed_time = time.time() - start_time

print(f"Recipients concatenated text embedded successfully {concat_vectors.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Recipients concatenated text embedded successfully (17169, 1024)
Total time: 497.92s


In [22]:
#grant text fields that are merged into one passage per grant
grant_text_cols = ["grant_title", "grant_desc"]

#join the fields with single spaces (missing values count as empty strings), then lowercase
first_col, *other_cols = grant_text_cols
grants_df["concat_text"] = (
    grants_df[first_col]
    .fillna("")
    .str.cat([grants_df[name].fillna("") for name in other_cols], sep=" ")
    .str.lower()
)

#embed the merged passage and time it
#NOTE(review): long passages may be truncated at the model's maximum sequence length - confirm this is acceptable
start_time = time.time()
concat_vectors = model.encode(grants_df["concat_text"].tolist())
grants_df["grant_concat_em"] = list(concat_vectors)
elapsed_time = time.time() - start_time

print(f"Grants concatenated text embedded successfully {concat_vectors.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Grants concatenated text embedded successfully (32816, 1024)
Total time: 432.19s


In [24]:
#drop concatenated text now that it has been embedded
#errors="ignore" keeps this cell re-runnable: without it a second run raises
#KeyError because "concat_text" has already been removed
funders_df = funders_df.drop(columns=["concat_text"], errors="ignore")
recipients_df = recipients_df.drop(columns=["concat_text"], errors="ignore")
grants_df = grants_df.drop(columns=["concat_text"], errors="ignore")

In [25]:
#create checkpoint - save dfs (now including the embedding columns) to pickle
#the writes are commented out; uncomment them to (re)write the *_em.pkl checkpoint files
# recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")
# funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
# grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")

----

# Retrieving Data from Checkpoints (post-embedding)

In [3]:
#folder holding the pickled checkpoints
checkpoint_folder = Path("./8.1_checkpoints/")

#restore the embedded dataframes from pickle
#(pickle files can execute code when loaded - only read checkpoints you created yourself)
recipients_df = pd.read_pickle(checkpoint_folder.joinpath("recipients_df_em.pkl"))
funders_df = pd.read_pickle(checkpoint_folder.joinpath("funders_df_em.pkl"))
grants_df = pd.read_pickle(checkpoint_folder.joinpath("grants_df_em.pkl"))

-----

# Boolean Feature Creation - Single-Beneficiary Funders

When extracting grants from PDF accounts, the LLM at times failed to interpret entries properly, extracting purchase descriptions as recipient names (e.g. 'cricket balls for the Year 7 team' from a PTA's accounts) or recording vague summaries as distinct recipients (e.g. 'five various causes'). These errors are particularly prevalent in accounts from single-beneficiary funders where expenditure descriptions differ structurally from multi-recipient grant listings. I will seek to flag funders that are likely to be single-beneficiary, as this is information that can be shared with the end user if they want to check their alignment score with such a funder (e.g. 'ABC Trust seems to only give grants to one recipient, ABC School, so may not be a suitable funder to approach with an application'.)

I will use some typical "clue" words that indicate a single-beneficiary funder: if a funder's name contains 'school', 'friends of', 'PTA', 'church' or 'hospital', or its activities mention 'pupil(s)', `is_potential_sbf` will be set to `True`.

In [61]:
#check name and activities for giveaway single-beneficiary words
#PTA is wrapped in word boundaries so it cannot match inside longer words
#(e.g. CAPTAIN, ADAPTATION); the other clue words may still match as substrings
name_check = funders_df["name"].str.contains(
    r"SCHOOL|FRIENDS OF|\bPTA\b|CHURCH|HOSPITAL",
    case=False,
    na=False
)
#match PUPIL or PUPILS as a whole word (the trailing S is the optional part)
activities_check = funders_df["activities"].str.contains(
    r"\bPUPILS?\b",
    case=False,
    na=False
)

#a funder is flagged when either check matches; missing text never matches (na=False)
funders_df["is_potential_sbf"] = name_check | activities_check

print(f"Flagged as potential SBFs: {funders_df['is_potential_sbf'].sum()}")

Flagged as potential SBFs: 121


In [32]:
#show one funder that was flagged as a potential sbf
is_example_funder = funders_df["registered_num"] == "1081835"
funders_df[is_example_funder]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em
122,1081835,FRIENDS OF THAMES DITTON INFANT SCHOOL,https://www.friendsoftdis.co.uk,FUND RAISING ACTIVITIES TO SUPPLY OUR SCHOOL W...,TO ADVANCE THE EDUCATION OF THE PUPILS AT THAM...,31638.0,42665.0,,,,True,False,False,"[0.003010355, 0.016931988, -0.0035435099, -0.0...","[0.0031424793, 0.008433894, -0.0058086696, -0....","[-0.004439995, 0.024636827, -0.002953147, -0.0...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.019817753, -0.00571729, 0.022262126, -0.03...","[-0.031251892, 0.0021451209, 0.027800791, -0.0..."


---

# Boolean Feature Creation - 'No Unsolicited Applications'

An incredibly important factor in prospecting is whether funders accept unsolicited applications. This information is frequently absent from formal eligibility criteria and instead appears subtly in trustees' reports or governance statements, often using varied terminology such as 'by invitation only', 'Trustees proactively identify causes to support', or 'we do not encourage unsolicited applications'. I will create a boolean feature to flag funders whose text sections contain such phrases, indicating that they may not accept unsolicited applications.

In [52]:
#define nua patterns
#note: "unusual .{0,20} unsolicited" never adds a match, because any text it
#matches already contains the bare "unsolicited" alternative
#NOTE(review): bare "unsolicited" also matches funders that say they welcome
#unsolicited applications - confirm the flagged funders manually
nua_pattern = (
    r"unsolicited|by invitation only|proactive in seeking|"
    r"unusual .{0,20} unsolicited|no .{0,20} application process|trustees identify|"
    r"proactively identify|not currently accepting|closed to applications"
)

#check text sections for nua phrases
#vectorised per column instead of a python-level regex search per row;
#astype(str) turns missing values into text that cannot match the pattern,
#and na=False covers any value that is still missing afterwards
text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]
funders_df["is_nua"] = (
    funders_df[text_cols]
    .astype(str)
    .apply(lambda col: col.str.contains(nua_pattern, case=False, na=False, regex=True))
    .any(axis=1)
)

print(f"Flagged as NUA: {funders_df['is_nua'].sum()}")

Flagged as NUA: 13


In [53]:
#show one funder that was flagged as nua
is_example_funder = funders_df["registered_num"] == "299918"
funders_df.loc[is_example_funder]

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,is_potential_sbf,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em
908,299918,C A REDFERN CHARITABLE FOUNDATION,,THE FOUNDATION IS A GRANT MAKING CHARITY THE T...,SUCH EXCLUSIVELY CHARITABLE OBJECTS AND PURPOS...,161278.0,250394.0,THE TRUST IS A GRANT MAKING CHARITY AND THE CA...,"DURING THE YEAR THE TRUST MADE GRANTS OF £205,...",THE TRUSTEES MEET REGULARLY TO DISCUSS THE MAK...,False,False,True,"[-0.015236848965287209, 0.009997091256082058, ...","[-0.008504444733262062, 0.012414869852364063, ...","[-0.025489898398518562, 0.04507039487361908, -...","[0.0023606042377650738, 0.0056089190766215324,...","[-0.005672260653227568, 0.0015541455941274762,...","[0.0016173956682905555, -0.0004414335708133876...","[-0.0042508686892688274, 0.003165914909914136,..."


---

# Feature Creation - Extracted Classifications

In [None]:
#add an extracted_class column set to None for every funder
#(presumably a placeholder to be populated later - TODO confirm)
funders_df["extracted_class"] = None

----

# Exporting Dataframes back to Supabase

In [60]:
#convert arrays to lists
def _as_plain_list(value):
    """Return ``value.tolist()`` for numpy arrays; pass any other value through unchanged."""
    return value.tolist() if isinstance(value, np.ndarray) else value

#every column whose name ends in "_em" is converted in place, one dataframe at a time
for frame in (funders_df, recipients_df, grants_df):
    em_cols = [name for name in frame.columns if name.endswith('_em')]
    for em_col in em_cols:
        frame[em_col] = frame[em_col].apply(_as_plain_list)

#dictionary to hold tables and their keys
tables = {
    "funders": (funders_df, "registered_num"),
    "grants": (grants_df, "grant_id"),
    "recipients": (recipients_df, "recipient_id")
}

#pipe data to supabase, one table at a time in the order listed above
for table_name, (table_df, unique_key) in tables.items():
    pipe_to_supabase(table_df, table_name, unique_key, url, key, batch_size=50)

Successfully upserted all 996 records to funders
Successfully upserted all 32816 records to grants
Successfully upserted all 17169 records to recipients
