# Data Preprocessing

# Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
colours = sns.color_palette("Set2")
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")
import time

#make the parent directory importable so the local utils module can be found
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

#load variables from a .env file, then read the supabase credentials from the environment
load_dotenv()
url, key = os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

In [2]:
#get tables and build dataframes
#url and key are the supabase credentials read from the environment in the setup cell
funders_df = get_table_from_supabase(url, key, "funders")
grants_df = get_table_from_supabase(url, key, "grants")

#get recipients with filter
#NOTE(review): what filter_recipients=True removes and how batch_size=50 is applied are defined in utils.get_table_from_supabase - confirm there
recipients_df = get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True)

In [3]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")

#create checkpoint - save dfs to pickle
#set to True to (re)write the raw-data checkpoints; off by default so re-running the notebook never overwrites them by accident
save_raw_checkpoints = False
if save_raw_checkpoints:
    #make sure the folder exists before writing into it
    checkpoint_folder.mkdir(parents=True, exist_ok=True)
    recipients_df.to_pickle(checkpoint_folder / "recipients_df.pkl")
    funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
    grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")

----

# Retrieving Data from Checkpoints

In [None]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")

#set to True to restore the raw dataframes from the pickled checkpoints instead of querying supabase
#off by default so the dataframes already in memory are not replaced
#NOTE: pickle can execute arbitrary code on load - only read checkpoint files you created yourself
load_raw_checkpoints = False
if load_raw_checkpoints:
    recipients_df = pd.read_pickle(checkpoint_folder / "recipients_df.pkl")
    funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
    grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")

-----

# Embedding Creation - Single Columns

I will create embeddings for the relevant columns. I have chosen to run each dataframe one at a time to separate these time- and compute-heavy processes.

In [2]:
#load the sentence-transformers model once; every embedding cell below reuses it via model.encode
model = SentenceTransformer("all-roberta-large-v1")

In [5]:
#funder text columns that each get their own embedding column
funders_cols = [
    "name",
    "activities",
    "objectives",
    "objectives_activities",
    "achievements_performance",
    "grant_policy",
]

#embed one column at a time and time the whole run
start_time = time.time()
for col in funders_cols:
    #missing values become empty strings so the encoder only ever receives text
    texts = funders_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    funders_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")
elapsed_time = time.time() - start_time

print("All embeddings created for funders_df!")
print(f"Total time: {elapsed_time:.2f}s")

name embedded successfully (996, 1024)
activities embedded successfully (996, 1024)
objectives embedded successfully (996, 1024)
objectives_activities embedded successfully (996, 1024)
achievements_performance embedded successfully (996, 1024)
grant_policy embedded successfully (996, 1024)
All embeddings created for funders_df!
Total time: 138.14s


In [6]:
#recipient text columns that each get their own embedding column
recipients_cols = [
    "recipient_name",
    "recipient_activities",
    "recipient_objectives",
]

#embed one column at a time and time the whole run
start_time = time.time()
for col in recipients_cols:
    #missing values become empty strings so the encoder only ever receives text
    texts = recipients_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    recipients_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")
elapsed_time = time.time() - start_time

print("All embeddings created for recipients_df!")
print(f"Total time: {elapsed_time:.2f}s")

recipient_name embedded successfully (17169, 1024)
recipient_activities embedded successfully (17169, 1024)
recipient_objectives embedded successfully (17169, 1024)
All embeddings created for recipients_df!
Total time: 952.63s


In [7]:
#grant text columns that each get their own embedding column
grants_cols = [
    "grant_title",
    "grant_desc",
]

#embed one column at a time and time the whole run
start_time = time.time()
for col in grants_cols:
    #missing values become empty strings so the encoder only ever receives text
    texts = grants_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #store one vector per row in a new "<col>_em" column
    grants_df[f"{col}_em"] = [vector for vector in embeddings]

    print(f"{col} embedded successfully {embeddings.shape}")
elapsed_time = time.time() - start_time

print("All embeddings created for grants_df!")
print(f"Total time: {elapsed_time:.2f}s")

grant_title embedded successfully (32816, 1024)
grant_desc embedded successfully (32816, 1024)
All embeddings created for grants_df!
Total time: 759.85s


-----

# Embedding Creation - Concatenated Columns

In [15]:
#columns joined into a single text per funder
funder_text_cols = ["activities", "objectives", "achievements_performance", "grant_policy"]

#join the columns with single spaces (missing values count as empty strings), then make lowercase
first_col, *other_cols = funder_text_cols
funders_df["concat_text"] = (
    funders_df[first_col]
    .fillna("")
    .str.cat([funders_df[other].fillna("") for other in other_cols], sep=" ")
    .str.lower()
)

#embed the combined text and time the encoding step
start_time = time.time()
texts = funders_df["concat_text"].tolist()
embeddings = model.encode(texts)
funders_df["concat_em"] = [vector for vector in embeddings]
elapsed_time = time.time() - start_time

print(f"Funders concatenated text embedded successfully {embeddings.shape}")
print(f"Total time: {elapsed_time:.2f}s")

Funders concatenated text embedded successfully (996, 1024)
Total time: 55.38s


In [16]:
#preview the first row to check that the *_em columns and concat_text were added
funders_df.head(1)

Unnamed: 0,registered_num,name,website,activities,objectives,income_latest,expenditure_latest,objectives_activities,achievements_performance,grant_policy,...,is_on_list,is_nua,name_em,activities_em,objectives_em,objectives_activities_em,achievements_performance_em,grant_policy_em,concat_em,concat_text
0,1001255,MARY ELIZABETH SIEBEL CHARITY,https://www.siebelcharity.co.uk,TO PROVIDE BENEFITS FOR PEOPLE OVER 60 YEARS O...,FOR THE RELIEF OF SUCH OLD PEOPLE WHO ARE ILL ...,111003.0,127864.0,THE OBJECTS OF THE CHARITY ARE TO PROVIDE BENE...,THE TRUSTEES CONSIDER THAT THE PERFORMANCE OF ...,GRANT APPLICATION FORMS ARE AVAILABLE FROM THE...,...,False,False,"[-0.00885222, 0.002967594, -0.023896331, -0.04...","[-0.01666838, 0.060163833, -0.014278077, 0.000...","[-0.024755005, 0.07678002, -0.03508068, 0.0127...","[-0.018843979, 0.025924828, -0.030358432, -0.0...","[-0.0039577875, 0.013076805, -0.0073595895, -0...","[-0.019841895, 0.0031771406, -0.027174626, -0....","[-0.016057273, 0.03979668, -0.01749037, -0.009...",to provide benefits for people over 60 years o...


In [10]:
#create checkpoint - save dfs to pickle
#set to True to (re)write the embedding checkpoints; off by default so re-running the notebook never overwrites them by accident
save_embedding_checkpoints = False
if save_embedding_checkpoints:
    #make sure the folder exists before writing into it
    checkpoint_folder.mkdir(parents=True, exist_ok=True)
    recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")
    funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
    grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")

In [3]:
#folder holding the pickled checkpoints
checkpoint_folder = Path("./8.1_checkpoints/")

#restore the dataframes, including their embedding columns, from the saved checkpoints
#(pickle can execute code on load, so only read files created by this notebook)
recipients_df = pd.read_pickle(checkpoint_folder.joinpath("recipients_df_em.pkl"))
funders_df = pd.read_pickle(checkpoint_folder.joinpath("funders_df_em.pkl"))
grants_df = pd.read_pickle(checkpoint_folder.joinpath("grants_df_em.pkl"))

-----

# Boolean Feature Creation - Single-Beneficiary Funders

When extracting grants from PDF accounts, the LLM at times failed to interpret entries properly, extracting purchase descriptions as recipient names (e.g. 'cricket balls for the Year 7 team' from a PTA's accounts) or recording vague summaries as distinct recipients (e.g. 'five various causes'). These errors are particularly prevalent in accounts from single-beneficiary funders where expenditure descriptions differ structurally from multi-recipient grant listings. I will seek to flag funders that are likely to be single-beneficiary, as this is information that can be shared with the end user if they want to check their alignment score with such a funder (e.g. 'ABC Trust seems to only give grants to one recipient, ABC School, so may not be a suitable funder to approach with an application'.)

I will use some typical "clue" words that indicate a single-beneficiary funder: if a funder's name contains 'school', 'friends of', 'PTA', 'church' or 'hospital', or its activities mention 'pupil(s)', `is_potential_sbf` will be assigned True.

In [None]:
#check name and activities for giveaway single-beneficiary words
#PTA is matched as a whole word only; as a bare substring it would also flag names such as "CAPTAIN ..." or "ADAPTATION ..."
name_check = funders_df["name"].str.contains(
    r'SCHOOL|FRIENDS OF|\bPTA\b|CHURCH|HOSPITAL',
    case=False,
    na=False
)
#match "pupil"/"pupils" as whole words (a trailing "L?" would make the L optional and match any "pupi")
activities_check = funders_df["activities"].str.contains(
    r'\bPUPILS?\b',
    case=False,
    na=False
)

#a funder is flagged when either its name or its activities contains a clue word; missing text counts as no match
funders_df["is_potential_sbf"] = name_check | activities_check

print(f"Flagged as potential SBFs: {funders_df['is_potential_sbf'].sum()}")

# Boolean Feature Creation - 'No Unsolicited Applications'