# Exploratory Data Analysis

## Setup

In [1]:
import pandas as pd
import janitor
import os
import sys
from dotenv import load_dotenv
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#import modules etc
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

import importlib.util
spec = importlib.util.spec_from_file_location("eda_utils", os.path.join(os.getcwd(), "utils.py"))
eda_utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(eda_utils)

add_gbp_columns = eda_utils.add_gbp_columns
explode_lists = eda_utils.explode_lists
format_stats = eda_utils.format_stats
make_summary_df = eda_utils.make_summary_df

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

## Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve all records, in order to start building my analysis dataframes. I will create one dataframe for funder information, and another for grants and recipients information.

In [2]:
#get tables and build dataframes
tables = ["funders", "causes", "areas", "beneficiaries", "grants", "recipients",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "recipient_grants", "recipient_areas"]

for table in tables:
    if table == "recipients":
        globals()[table] = get_table_from_supabase(url, key, table, batch_size=50, filter_recipients=True)
    else:
        globals()[table] = get_table_from_supabase(url, key, table)

print(f"Loaded {len(funders)} funders, {len(recipients)} recipients, {len(grants)} grants")

Loaded 499 funders, 295 recipients, 12637 grants


### The Funders Dataframe

In [3]:
funders_df = funders.copy()

#define table relationships for funders
funder_rels = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#group and merge
for rel in funder_rels:
    grouped = rel["join_table"].merge(rel["lookup_table"], on=rel["key"])
    grouped = grouped.groupby("registered_num")[rel["value_col"]].apply(list).reset_index()
    grouped.columns = ["registered_num", rel["result_col"]]
    funders_df = funders_df.merge(grouped, on="registered_num", how="left")

#add grant statistics columns 
grants_stats = funder_grants.merge(grants, on="grant_id")
grants_agg = grants_stats.groupby("registered_num").agg({
    "grant_id": "count",
    "amount": ["sum", "mean", "median"]
}).reset_index()
grants_agg.columns = ["registered_num", "num_grants", "total_given", "avg_grant", "median_grant"]

funders_df = funders_df.merge(grants_agg, on="registered_num", how="left")
funders_df["num_grants"] = funders_df["num_grants"].astype("Int64")

#replace nan values with empty lists
funders_df["causes"] = funders_df["causes"].apply(lambda x: x if isinstance(x, list) else [])
funders_df["areas"] = funders_df["areas"].apply(lambda x: x if isinstance(x, list) else [])
funders_df["beneficiaries"] = funders_df["beneficiaries"].apply(lambda x: x if isinstance(x, list) else [])

#round to 2 decimal places
funders_df = funders_df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

#format financial columns
float_cols = ["income", "expenditure", "total_given", "avg_grant", "median_grant"]
for col in float_cols:
    if col in funders_df.columns:
        funders_df[col + "_gbp"] = funders_df[col].apply(add_gbp_columns)

In [4]:
#sort and preview funders
funders_df = funders_df.sort_values("total_given_gbp", ascending=False)
funders_df.head()

Unnamed: 0,registered_num,name,website,activities,objectives,income,expenditure,causes,areas,beneficiaries,num_grants,total_given,avg_grant,median_grant,income_gbp,expenditure_gbp,total_given_gbp,avg_grant_gbp,median_grant_gbp
265,200051,Esmee Fairbairn Foundation,https://www.esmeefairbairn.org.uk,We are one of the largest independent foundati...,To further such charitable purpose or purposes...,9483000.0,56416000.0,"[General Charitable Purposes, Education/traini...","[Wigan, Oxfordshire, Greater Manchester]","[Children/young People, Elderly/old People, Pe...",6967,717939197.42,103048.54,54000.0,"£9,483,000.00","£56,416,000.00","£717,939,197.42","£103,048.54","£54,000.00"
173,294629,The Fenton Arts Trust,https://www.fentonartstrust.org.uk,Awards grants to individuals and organisations...,(a) the advancement of public education more p...,180898.0,178450.0,[Arts/culture/heritage/science],"[Warrington, Nottingham City, Oxfordshire, Lei...",[Other Defined Groups],175,663582.0,3791.9,4000.0,"£180,898.00","£178,450.00","£663,582.00","£3,791.90","£4,000.00"
489,274100,The Clothworkers' Foundation,https://clothworkersfoundation.org.uk,To make grants to uk frontline charities to su...,For the advancement of such charitable purpose...,12329000.0,12607000.0,[General Charitable Purposes],[],[Other Charities Or Voluntary Bodies],3237,66349023.6,20497.07,10000.0,"£12,329,000.00","£12,607,000.00","£66,349,023.60","£20,497.07","£10,000.00"
488,263207,John Ellerman Foundation,https://www.ellerman.org.uk,John ellerman foundation is a general grantmak...,1) the object of the charity is for such chari...,3431000.0,6239000.0,[General Charitable Purposes],[],[Other Charities Or Voluntary Bodies],764,65215922.9,85361.16,90000.0,"£3,431,000.00","£6,239,000.00","£65,215,922.90","£85,361.16","£90,000.00"
468,283813,The London Marathon Charitable Trust Limited,https://www.londonmarathonfoundation.org,"London marathon foundation, the operating name...",2.1 the charity's objects are restricted speci...,62715273.0,57810948.0,"[Disability, Amateur Sport, Human Rights/relig...",[],"[Children/young People, Elderly/old People, Pe...",775,62403416.0,80520.54,30000.0,"£62,715,273.00","£57,810,948.00","£62,403,416.00","£80,520.54","£30,000.00"


### The Grants Dataframe

In [5]:
grants_df = grants.copy()

#add funder info
grants_df = grants_df.merge(funder_grants, on="grant_id")
grants_df = grants_df.merge(funders[["registered_num", "name"]], on="registered_num")
grants_df = grants_df.rename(columns={"name": "funder_name"})
grants_df = grants_df.rename(columns={"registered_num": "funder_num"})

#add recipient info  
grants_df = grants_df.merge(recipient_grants, on="grant_id")
grants_df = grants_df.merge(recipients[["recipient_id", "recipient_name", "recipient_activities"]], 
                        on="recipient_id", 
                        how="left")

#add recipient areas
recip_areas_grouped = recipient_areas.merge(areas, on="area_id")
recip_areas_grouped = recip_areas_grouped.groupby("recipient_id")["area_name"].apply(list).reset_index()
recip_areas_grouped.columns = ["recipient_id", "recipient_areas"]
grants_df = grants_df.merge(recip_areas_grouped, on="recipient_id", how="left")

#replace nan values with empty lists
if "recipient_areas" in grants_df.columns:
    grants_df["recipient_areas"] = grants_df["recipient_areas"].apply(lambda x: x if isinstance(x, list) else [])

#round to 2 decimal places
grants_df = grants_df.round(2)

#format financial columns
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)

In [6]:
#extend column view, sort and preview grants
pd.set_option("display.max_columns", 100)
grants_df = grants_df.sort_values("amount", ascending=False)
grants_df.head()

Unnamed: 0,grant_title,grant_desc,amount,year,grant_id,funder_num,funder_grants_id,funder_name,recipient_id,recipient_grants_id,recipient_name,recipient_activities,recipient_areas,amount_gbp
648,Grant To Museums Association,Towards delegated grant-making for the esmée f...,4178557.0,2019,360G-EFF-19-0186,200051,4891,Esmee Fairbairn Foundation,360G-EFF-001b000003VLsX5,3820,,,[],"£4,178,557.00"
790,Grant To Museums Association,Towards delegated grant-making towards the con...,3941914.0,2016,360G-EFF-15-2657,200051,6740,Esmee Fairbairn Foundation,360G-EFF-001b000003VLsX5,5669,,,[],"£3,941,914.00"
3457,Grant To Museums Association,Towards delegated grant-making towards the con...,3253420.0,2013,360G-EFF-13-0948,200051,9368,Esmee Fairbairn Foundation,360G-EFF-001b000003VLsX5,8297,,,[],"£3,253,420.00"
6836,Grant To Museums Association,Towards delegated grant-making to run the esme...,2720550.0,2024,360G-EFF-23-1758,200051,6454,Esmee Fairbairn Foundation,360G-EFF-001b000003VLsX5,5383,,,[],"£2,720,550.00"
6018,Grant To Museums Association,Towards delegated grant-making to develop a ne...,2683794.0,2022,360G-EFF-22-0530,200051,5624,Esmee Fairbairn Foundation,360G-EFF-001b000003VLsX5,4553,,,[],"£2,683,794.00"


## Summary Statistics

In [7]:
#build df of summary statistics
summary_data = make_summary_df(funders_df, grants_df)
summary_df = pd.DataFrame(summary_data)
summary_df["Value"] = summary_df.apply(format_stats, axis=1)
display(summary_df.style
    .set_properties(**{"text-align": "left"})
    .set_table_styles([
        {"selector": "th", "props": [("font-weight", "bold"), ("text-align", "left"), ("border-bottom", "1px solid")]},
        {"selector": "td", "props": [("padding", "6px")]}
    ])
    .hide(axis="index")
)

Metric,Value
Total funders,499
Total recipients,7272
Total grants,12561
Total grant value,"£936,274,840.92"
Mean grants per funder,1799
Most grants given by a funder,6967
Fewest grants given by a funder,5
Mean recipients per funder,1141.4
Mean areas per funder,4.5
Mean funder income,"£925,052.52"


## Calculated Statistics