# Exploratory Data Analysis

## Setup

In [1]:
import pandas as pd
import janitor
import os
from dotenv import load_dotenv
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from utils import get_table_from_supabase, add_gbp_columns

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

## Retrieving Data from Supabase

I will connect to Supabase and retrieve all records, in order to start building my analysis dataframe.

In [2]:
#get tables and build dataframes
tables = ["funders", "causes", "areas", "beneficiaries", "grants", "recipients",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "recipient_grants", "recipient_areas"]

for table in tables:
    if table == "recipients":
        globals()[table] = get_table_from_supabase(url, key, table, batch_size=100, filter_recipients=True)
    else:
        globals()[table] = get_table_from_supabase(url, key, table)

df = funders.copy()

#define table relationships
relationships = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#group and merge
for rel in relationships:
    grouped = rel["join_table"].merge(rel["lookup_table"], on=rel["key"])
    grouped = grouped.groupby("registered_num")[rel["value_col"]].apply(list).reset_index()
    grouped.columns = ["registered_num", rel["result_col"]]
    df = df.merge(grouped, on="registered_num", how="left")

## Adding Computed Columns

I will add some computed columns to the dataframe to help me with my analysis. I will also apply some simple transformations to the data to ensure that they are in a suitable format.

In [3]:
#add grant statistics columns 
grants_stats = funder_grants.merge(grants, on="grant_id")
grants_agg = grants_stats.groupby("registered_num").agg({
    "grant_id": "count",
    "amount": ["sum", "mean", "median"]
}).reset_index()
grants_agg.columns = ["registered_num", "num_grants", "total_given", "avg_grant", "median_grant"]

df = df.merge(grants_agg, on="registered_num", how="left")
df["num_grants"] = df["num_grants"].astype("Int64")

In [4]:
#replace nan values with empty lists
df["causes"] = df["causes"].apply(lambda x: x if isinstance(x, list) else [])
df["areas"] = df["areas"].apply(lambda x: x if isinstance(x, list) else [])
df["beneficiaries"] = df["beneficiaries"].apply(lambda x: x if isinstance(x, list) else [])

#round to 2 decimal places and remove scientific notation
df = df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

#format financial columns
float_cols = ["income", "expenditure", "total_given", "avg_grant", "median_grant"]
for col in float_cols:
    df[col + "_gbp"] = df[col].apply(add_gbp_columns)
df2 = df.drop(columns=float_cols)

In [None]:
#sort by number of grants
df2 = df2.sort_values(by="num_grants", ascending=False)

In [7]:
df2.head()

Unnamed: 0,registered_num,name,website,activities,objectives,causes,areas,beneficiaries,num_grants,income_gbp,expenditure_gbp,total_given_gbp,avg_grant_gbp,median_grant_gbp
265,200051,Esmee Fairbairn Foundation,https://www.esmeefairbairn.org.uk,We are one of the largest independent foundati...,To further such charitable purpose or purposes...,"[General Charitable Purposes, Education/traini...","[Wigan, Oxfordshire, Greater Manchester]","[Children/young People, Elderly/old People, Pe...",6967,"£9,483,000.00","£56,416,000.00","£717,939,197.42","£103,048.54","£54,000.00"
489,274100,The Clothworkers' Foundation,https://clothworkersfoundation.org.uk,To make grants to uk frontline charities to su...,For the advancement of such charitable purpose...,[General Charitable Purposes],[],[Other Charities Or Voluntary Bodies],3237,"£12,329,000.00","£12,607,000.00","£66,349,023.60","£20,497.07","£10,000.00"
468,283813,The London Marathon Charitable Trust Limited,https://www.londonmarathonfoundation.org,"London marathon foundation, the operating name...",2.1 the charity's objects are restricted speci...,"[Disability, Amateur Sport, Human Rights/relig...",[],"[Children/young People, Elderly/old People, Pe...",775,"£62,715,273.00","£57,810,948.00","£62,403,416.00","£80,520.54","£30,000.00"
488,263207,John Ellerman Foundation,https://www.ellerman.org.uk,John ellerman foundation is a general grantmak...,1) the object of the charity is for such chari...,[General Charitable Purposes],[],[Other Charities Or Voluntary Bodies],764,"£3,431,000.00","£6,239,000.00","£65,215,922.90","£85,361.16","£90,000.00"
52,1093844,THE JOSEPH RANK TRUST,https://www.ranktrust.org,THE TRUST'S OBJECTS AND PRINCIPAL ACTIVITIES A...,1) TO ADVANCE THE CHRISTIAN FAITH;\r2) TO FURT...,"[General Charitable Purposes, Religious Activi...",[Throughout England And Wales],"[Children/young People, Elderly/old People, Pe...",669,"£2,936,000.00","£2,775,000.00","£21,082,150.00","£31,512.93","£30,000.00"


## Basic Analysis

In [36]:
print(f"""
============================================================================
                            SUMMARY STATISTICS
============================================================================
      
Total funders: {len(df2)}
Total recipients: {len(recipients)}
Total grants: {len(grants)}

Mean grants per funder: {df2["num_grants"].mean():.0f}
Mean recipients per funder: 
Mean grants per recipient: 

Mean grant size: £{grants["amount"].mean():.2f}
Median grant size: £{grants["amount"].median():.2f}
Smallest grant: £{grants.loc[grants['amount'] > 0, 'amount'].min():,.2f}
Largest grant: £{grants["amount"].max():.2f}

      """)


                            SUMMARY STATISTICS
      
Total funders: 499
Total recipients: 665
Total grants: 12637

Mean grants per funder: 1799
Mean recipients per funder:
Mean grants per recipient: 

Mean grant size: £73904.90
Median grant size: £30000.00
Smallest grant: £300.00
Largest grant: £4178557.00

      
