# Exploratory Data Analysis

## Setup

In [1]:
import pandas as pd
import janitor
import os
import sys
from dotenv import load_dotenv
import time
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#keep DeprecationWarning messages out of the cell output
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

#put the parent directory at the front of sys.path, once
#(the local helper imports that follow presumably rely on this - confirm)
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from eda_utils import add_gbp_columns, explode_lists
from stats_builder import make_summary_df, calculate_stats, make_calculated_df, format_stats, format_df
from plots_builder import make_bar_chart
from utils import get_table_from_supabase

#get keys from env: load_dotenv() runs first, then both settings are looked up
#(either lookup yields None when the variable is not set)
load_dotenv()
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

## Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve all records, in order to start building my analysis dataframes. I will create one dataframe for funder information, and another for grants and recipients information.

In [2]:
def _require_recipient_id(df, table_name):
    """Return `df` unchanged, or raise a KeyError naming the table that lacks `recipient_id`.

    A bare KeyError: 'recipient_id' does not say which of the five recipient tables
    was at fault; this check reports the table, its row count and its columns.
    """
    if "recipient_id" not in df.columns:
        raise KeyError(
            f"'recipient_id' column missing from table '{table_name}' "
            f"({len(df)} rows, columns: {list(df.columns)})"
        )
    return df


def _load_recipient_join_table(table_name, valid_ids):
    """Fetch a recipient join table and keep only rows whose recipient_id is in `valid_ids`."""
    join_df = _require_recipient_id(get_table_from_supabase(url, key, table_name), table_name)
    return join_df[join_df["recipient_id"].isin(valid_ids)]


#get tables and build dataframes
tables = ["funders", "causes", "areas", "beneficiaries", "grants",
          "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants",
          "financials", "funder_financials"]

#bind every table to an explicit name (same order as `tables`) instead of writing into
#globals(), so the cell itself shows which dataframes it creates
(funders, causes, areas, beneficiaries, grants,
 funder_causes, funder_areas, funder_beneficiaries, funder_grants,
 financials, funder_financials) = [get_table_from_supabase(url, key, table) for table in tables]

#get recipients with filter
recipients = _require_recipient_id(
    get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True),
    "recipients",
)
all_recipient_ids = set(recipients["recipient_id"].unique())

#get and filter recipient join tables (unpacked in the same order as the list)
recipient_join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
recipient_grants, recipient_areas, recipient_beneficiaries, recipient_causes = [
    _load_recipient_join_table(table, all_recipient_ids) for table in recipient_join_tables
]

print(f"Loaded {len(funders)} funders, {len(recipients)} recipients, {len(grants)} grants")
print(f"Filtered recipient join tables to {len(all_recipient_ids)} valid recipients")

KeyError: 'recipient_id'

### The Funders Dataframe

#### Main Table

In [None]:
#work on a copy of the raw funders table so re-running this cell starts from scratch
funders_df = funders.copy()

#define table relationships for funders
funder_rels = [
    dict(join_table=funder_causes, lookup_table=causes, key="cause_id",
         value_col="cause_name", result_col="causes"),
    dict(join_table=funder_areas, lookup_table=areas, key="area_id",
         value_col="area_name", result_col="areas"),
    dict(join_table=funder_beneficiaries, lookup_table=beneficiaries, key="ben_id",
         value_col="ben_name", result_col="beneficiaries"),
]

#group and merge: each relationship becomes one list-valued column per funder
for rel in funder_rels:
    named = rel["join_table"].merge(rel["lookup_table"], on=rel["key"])
    per_funder = named.groupby("registered_num")[rel["value_col"]].apply(list).reset_index()
    per_funder.columns = ["registered_num", rel["result_col"]]
    funders_df = funders_df.merge(per_funder, on="registered_num", how="left")

#add grant statistics columns
grants_stats = funder_grants.merge(grants, on="grant_id")
grants_agg = (
    grants_stats
    .groupby("registered_num")
    .agg(
        num_grants=("grant_id", "count"),
        total_given=("amount", "sum"),
        avg_grant=("amount", "mean"),
        median_grant=("amount", "median"),
    )
    .reset_index()
)

funders_df = funders_df.merge(grants_agg, on="registered_num", how="left")
#nullable integer dtype: funders without grants have no count after the left merge
funders_df["num_grants"] = funders_df["num_grants"].astype("Int64")

#replace nan values with empty lists
for list_col in ("causes", "areas", "beneficiaries"):
    funders_df[list_col] = funders_df[list_col].apply(
        lambda names: names if isinstance(names, list) else []
    )

#round to 2 decimal places
funders_df = funders_df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

#format financial columns (only those present in the funders table)
float_cols = ["income_latest", "expenditure_latest", "total_given", "avg_grant", "median_grant"]
for money_col in float_cols:
    if money_col not in funders_df.columns:
        continue
    funders_df[f"{money_col}_gbp"] = funders_df[money_col].apply(add_gbp_columns)

#### Financial History Table

In [None]:
def _history_by_funder(records, result_col):
    """Collapse financial records into one {financials_year: financials_value} dict per funder.

    records: frame with registered_num, financials_year and financials_value columns.
    result_col: name for the column holding the per-funder dicts.

    Returns a two-column frame (registered_num, result_col) with one row per funder
    found in `records`. Building it from the groups directly means an empty `records`
    still yields the two expected columns, and avoids DataFrameGroupBy.apply over
    whole groups (which pandas has deprecated where it touches the grouping column).
    """
    history = {
        registered_num: dict(zip(rows["financials_year"], rows["financials_value"]))
        for registered_num, rows in records.groupby("registered_num")
    }
    return pd.DataFrame({
        #keep the key's dtype so the later merge on registered_num lines up
        "registered_num": pd.Series(list(history.keys()), dtype=records["registered_num"].dtype),
        result_col: pd.Series(list(history.values()), dtype=object),
    })


#get full financial records and separate into income and expenditure
financial_history = funder_financials.merge(financials, on="financials_id")
income_history = financial_history[financial_history["financials_type"] == "income"]
expenditure_history = financial_history[financial_history["financials_type"] == "expenditure"]

#make financials dicts
income_by_funder = _history_by_funder(income_history, "income_history")
expenditure_by_funder = _history_by_funder(expenditure_history, "expenditure_history")

#drop history columns left by an earlier run of this cell; otherwise the merges below
#would produce _x/_y suffixed columns and the fill step would fail
funders_df = funders_df.drop(columns=["income_history", "expenditure_history"], errors="ignore")

#merge with funders and replace nans
funders_df = funders_df.merge(income_by_funder, on="registered_num", how="left")
funders_df = funders_df.merge(expenditure_by_funder, on="registered_num", how="left")
for history_col in ("income_history", "expenditure_history"):
    funders_df[history_col] = funders_df[history_col].apply(lambda x: x if isinstance(x, dict) else {})

In [None]:
#extend column view, sort and preview funders
pd.set_option("display.max_columns", 100)
#rank on the numeric total_given rather than total_given_gbp: the _gbp column is whatever
#add_gbp_columns returned for display (NOTE(review): presumably a formatted string - confirm),
#and sorting that would order funders as text instead of by the amount given
funders_df = funders_df.sort_values("total_given", ascending=False)
funders_df.head()

### The Grants Dataframe

#### Main Table

In [None]:
def _names_per_recipient(join_table, lookup_table, key, value_col, result_col):
    """Return one row per recipient_id with the list of looked-up names in `result_col`."""
    grouped = (
        join_table
        .merge(lookup_table, on=key)
        .groupby("recipient_id")[value_col]
        .apply(list)
        .reset_index()
    )
    grouped.columns = ["recipient_id", result_col]
    return grouped


#start from a copy of the raw grants, then add funder info followed by recipient info
grants_df = (
    grants.copy()
    .merge(funder_grants, on="grant_id")
    .merge(funders[["registered_num", "name"]], on="registered_num")
    .rename(columns={"name": "funder_name", "registered_num": "funder_num"})
    .merge(recipient_grants, on="grant_id")
    .merge(recipients[["recipient_id", "recipient_name", "recipient_activities"]],
           on="recipient_id",
           how="left")
)

#build the per-recipient lists of areas, causes and beneficiaries
recip_areas_grouped = _names_per_recipient(
    recipient_areas, areas, "area_id", "area_name", "recipient_areas")
recip_causes_grouped = _names_per_recipient(
    recipient_causes, causes, "cause_id", "cause_name", "recipient_causes")
recip_beneficiaries_grouped = _names_per_recipient(
    recipient_beneficiaries, beneficiaries, "ben_id", "ben_name", "recipient_beneficiaries")

#attach them to the grants (left joins keep grants whose recipient has no entries)
for recipient_lists in (recip_areas_grouped, recip_causes_grouped, recip_beneficiaries_grouped):
    grants_df = grants_df.merge(recipient_lists, on="recipient_id", how="left")

#replace nan values with empty lists
for list_col in ("recipient_areas", "recipient_causes", "recipient_beneficiaries"):
    if list_col in grants_df.columns:
        grants_df[list_col] = grants_df[list_col].apply(
            lambda names: names if isinstance(names, list) else []
        )

#round to 2 decimal places
grants_df = grants_df.round(2)

#format financial columns
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)

In [None]:
#sort and preview grants
#NOTE(review): recipient_areas holds lists, so this orders rows by comparing those lists
#(descending) - confirm that is the intended ordering for the preview
grants_df = grants_df.sort_values(by="recipient_areas", ascending=False)
grants_df.head(n=5)

## Summary Statistics

In [None]:
#build df of summary statistics: tabulate, fill Value row by row, then apply format_df
summary_data = make_summary_df(funders_df, grants_df)
summary_df = (
    pd.DataFrame(summary_data)
    .assign(Value=lambda frame: frame.apply(format_stats, axis=1))
    .pipe(format_df)
)

## Calculated Statistics

In [None]:
#get calculated stats and build df: tabulate, fill Value row by row, then apply format_df
stats = calculate_stats(funders_df, grants_df)
calculated_data = make_calculated_df(stats)
calculated_df = (
    pd.DataFrame(calculated_data)
    .assign(Value=lambda frame: frame.apply(format_stats, axis=1))
    .pipe(format_df)
)

## Data Quality

### Missingness

#### 05/11
**`funders_df`:**
There are two funders in the database with empty `activities`. I have checked the Charity Commission website and it does appear that these funders have simply not declared any activities. They are both relatively new having submitted only one set of accounts. If the accounts are accessible to my script, the `activities_objectives` column will be populated and this issue will be redundant. One of these funders has a website so, if I am able to achieve my stretch target of scraping websites, this may be a further source of information (although at the time of writing, the website does not exist).

**`grants_df`:**
There are grants missing a title and description, which is to be expected as they are pulled from accounts. There are grants missing recipient names, which is fine as the database hasn't been built properly yet. There doesn't appear to be any missing data of concern.

**Accounts problem:**
My database_builder_pdfs scripts have a serious limitation as I am unable to scrape the Charity Commission website for accounts where the page contains JavaScript. The older pages, which are basic HTML, are accessible but the newer ones are not, and I am unable to tell which charities have been updated to the new system until the script attempts to scrape them and fails. I will calculate the proportion of accounts that are unavailable and will consider whether those funders should be excluded from the database.

In [None]:
#column dtypes and non-null counts for funders_df, used to check missingness
funders_df.info()

In [None]:
#column dtypes and non-null counts for grants_df, used to check missingness
grants_df.info()

### Word Counts

I will check the lengths of the shortest and longest text entries, to ensure that they have been imported correctly and are not too short or long for unexpected reasons. Many funders provide very short explanations of their activities/objectives etc., such as simply "grant-giving" which is just one word - so this would not be abnormal. I will confirm that particularly long entries are not corrupted or the result of multiple documents being combined accidentally.

In [None]:
#check word counts for text columns in funders df
#NOTE(review): the Missingness notes call this column `activities_objectives`; confirm that
#`objectives_activities` is the name actually present in funders_df
funders_text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]

#create columns
for text_col in funders_text_cols:
    funders_df[f"word_count_{text_col}"] = funders_df[text_col].str.split().str.len()

#show the shortest and longest entry for each text column
funder_id_cols = ["registered_num", "name"]
for text_col in funders_text_cols:
    print(f"{text_col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{text_col}"
    counts = funders_df[word_count_col].dropna()

    #nothing to compare when every entry is missing
    if counts.empty:
        print("No data available\n")
        continue

    #get minimums and maximums for each text column
    examples = funders_df.loc[[counts.idxmin(), counts.idxmax()],
                              funder_id_cols + [word_count_col, text_col]]
    examples.index = ["Minimum", "Maximum"]

    display(examples)

In [None]:
#check word counts for text columns in grants df
grants_text_cols = ["grant_title", "grant_desc", "recipient_activities"]

#create columns
for text_col in grants_text_cols:
    grants_df[f"word_count_{text_col}"] = grants_df[text_col].str.split().str.len()

#show the shortest and longest entry for each text column
grant_id_cols = ["funder_num", "funder_name", "recipient_id", "recipient_name"]
for text_col in grants_text_cols:
    print(f"{text_col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{text_col}"
    counts = grants_df[word_count_col].dropna()

    #nothing to compare when every entry is missing
    if counts.empty:
        print("No data available\n")
        continue

    #get minimums and maximums for each text column
    examples = grants_df.loc[[counts.idxmin(), counts.idxmax()],
                             grant_id_cols + [word_count_col, text_col]]
    examples.index = ["Minimum", "Maximum"]

    display(examples)

Having manually checked the minimum and maximum values against their Charity Commission records, I am confident that there are no entries that are concerningly long. There do appear to be unusual values when it comes to looking at the minimum lengths (for example, a grant entitled simply 'ALPHA'), which I will scrutinise in more detail and then clean.

### Recipient Names

The `recipient_name` variable has been built from two sources. The first is the [Charity Commission's Public Extract](https://register-of-charities.charitycommission.gov.uk/en/register/full-register-download), which is updated daily and includes registered charities' names as logged with the Charity Commission. These names are generally clean and standardised, though charities occasionally use trading names that differ from their official registered names.

The second source is `recipient_name` as extracted from funders' accounts by the Claude API. This is significantly less reliable and is a noted limitation of this project. The reliability is affected in part by the reliance on a large language model and its fuzzy interpretations of messy text, and in part by the fact that funders' accounts are not held to any standard of enforcement in terms of the correct spelling, uniformity, or indeed accuracy of recipients' names. Examples that have been noted during the course of this project include inconsistency in pluralisation (e.g. *hospices* vs *hospice*); missing words (e.g. *British Red Cross Society* vs *British Red Cross*); and errors in punctuation (e.g. *soldiers'* vs *soldier's*).

The LLM has also at times failed to interpret account entries properly, extracting purchase descriptions as recipient names (e.g. 'cricket balls for the Year 7 team' from a PTA's accounts) or recording vague summaries as distinct recipients (e.g. 'five various causes'). These errors are particularly prevalent in accounts from single-beneficiary funders where expenditure descriptions differ structurally from multi-recipient grant listings.

Manual spot-checking is therefore essential to identify and address these inconsistencies.

## Exploration of Giving Patterns and Funder Characteristics

### Categorical Variables

One of the key objectives of prospie is to help fundraisers navigate the trusts landscape, which is confusing largely because the information that is available is often mismatched. Funders may indicate a particular cause or area of activity, but these do not always align with their actual giving habits. I will therefore compare the classifications that funders state (their identified causes, beneficiaries, and areas of activity) with those of the recipients who are awarded their grants.

In [None]:
#display plots to compare funders' and recipients' classifications
fig, axes = plt.subplots(3, 2, figsize=(18, 15))

#one grid row per classification: (funders column, title label, bar colour)
classification_rows = [
    ("causes", "Causes", "#2E86AB"),
    ("areas", "Areas", "#A23B72"),
    ("beneficiaries", "Beneficiaries", "#F18F01"),
]

#recipients go in the left-hand panel, funders in the right-hand panel
for row, (funder_col, label, colour) in enumerate(classification_rows):
    make_bar_chart(grants_df, f"recipient_{funder_col}", f"Recipients: Most Popular {label}",
                   color=colour, ax=axes[row, 0])
    make_bar_chart(funders_df, funder_col, f"Funders: Most Popular {label}",
                   color=colour, ax=axes[row, 1])

plt.tight_layout()
plt.subplots_adjust(hspace=0.4)
plt.show()

It can be observed that funders often state an interest in general causes, areas and beneficiaries - many do not specify particular interests and state that they will consider applications from any area of the sector. Further analysis will be useful, particularly following the creation of embeddings, to understand the practical reality of their funding priorities, which may reveal implicit preferences or local biases not reflected in their published criteria.

### Relationships between Categories