# Exploratory Data Analysis

## Setup

In [3]:
import pandas as pd
import janitor
import os
from dotenv import load_dotenv
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from utils import get_table_from_supabase

# Read the Supabase credentials from the environment.
# load_dotenv() is presumably what makes values from a local .env file visible
# here (see python-dotenv); either value is None when its variable is unset.
load_dotenv()
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

## Retrieving data from Supabase

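I will connect to Supabase and retrieve the records from each table (recipients are restricted to rows where `is_recipient` is true), then compile the funder data into a wide dataframe in which each funder's causes, areas and beneficiaries are held as lists. I will also apply some simple transformations so that the data are in a suitable format.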

In [4]:
# Names of the Supabase tables to download. Each result is bound to a
# module-level variable with the same name as its table (e.g. `funders`),
# because the rest of the notebook refers to the tables by those names.
tables = [
    "funders", "causes", "areas", "beneficiaries", "grants", "recipients",
    "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants",
    "recipient_grants", "recipient_areas",
]

for table in tables:
    # Only the recipients table is fetched with the filter_recipients flag
    # (only get recipients where is_recipient = True); every other table is
    # fetched with the helper's default arguments.
    fetch_options = {"filter_recipients": True} if table == "recipients" else {}
    globals()[table] = get_table_from_supabase(url, key, table, **fetch_options)

# Work on a copy so the downloaded funders table itself is not modified.
df = funders.copy()

# One entry per funder-to-lookup link:
#   join_table   - table that is grouped by "registered_num" after the merge
#   lookup_table - table supplying the readable name for each id
#   key          - id column shared by the join and lookup tables
#   value_col    - name column collected from the lookup table
#   result_col   - name of the list column added to df
relationships = [
    dict(
        join_table=funder_causes,
        lookup_table=causes,
        key="cause_id",
        value_col="cause_name",
        result_col="causes",
    ),
    dict(
        join_table=funder_areas,
        lookup_table=areas,
        key="area_id",
        value_col="area_name",
        result_col="areas",
    ),
    dict(
        join_table=funder_beneficiaries,
        lookup_table=beneficiaries,
        key="ben_id",
        value_col="ben_name",
        result_col="beneficiaries",
    ),
]

# For each relationship: attach names to the join rows, collect them into one
# list per registered_num, and left-merge that list column onto df so that
# rows without a match are kept.
for rel in relationships:
    named = rel["join_table"].merge(rel["lookup_table"], on=rel["key"])
    per_funder = named.groupby("registered_num")[rel["value_col"]].apply(list)
    # Naming the series before reset_index yields exactly the two columns
    # ["registered_num", <result_col>].
    grouped = per_funder.rename(rel["result_col"]).reset_index()
    df = df.merge(grouped, on="registered_num", how="left")

# Rows that found no match in the left merges hold NaN in the list columns.
# Replace every non-list value with a fresh empty list so that each of these
# columns contains lists throughout.
def _as_list(value):
    """Return value unchanged if it is a list, otherwise a new empty list."""
    if isinstance(value, list):
        return value
    return []


for list_col in ("causes", "areas", "beneficiaries"):
    df[list_col] = df[list_col].apply(_as_list)

# Show floats in fixed-point notation with 2 decimals rather than scientific
# notation, and round the stored values to 2 decimal places as well.
pd.options.display.float_format = "{:.2f}".format
df = df.round(decimals=2)