In [None]:
# import etl libraries
import requests
import pandas as pd, numpy as np
from bs4 import BeautifulSoup

In [None]:
# fetch data
# Reads the country-development-finance CSV into a DataFrame; the path is
# relative, so it resolves against the notebook's working directory.
iata_data = pd.read_csv("data/country-development-finance/csv/iata_data.csv")

In [None]:
# Preview the first rows of the raw extract.
iata_data.head()

In [None]:
# create time dimension table
#
# One row per (year, quarter) for every year from MIN_YEAR through MAX_YEAR,
# both bounds included. The fact data is filtered with an inclusive upper
# bound (Calendar Year <= MAX_YEAR), so MAX_YEAR itself must exist here or
# those rows would come out of the time merge with a null time_id.

MIN_YEAR = 1970
MAX_YEAR = 2024

# range() excludes its stop value, hence the + 1 to keep MAX_YEAR.
years = list(range(MIN_YEAR, MAX_YEAR + 1))
quarters = ["Q1", "Q2", "Q3", "Q4"]

dim_time = pd.DataFrame(
    [(y, q) for y in years for q in quarters],
    columns=["year", "quarter"],
)
# surrogate key: 1-based position in (year, quarter) order
dim_time["time_id"] = dim_time.index + 1

dim_time.head()

In [None]:
# keep only the rows whose calendar year lies between MIN_YEAR and MAX_YEAR
# (both bounds included)

at_or_after_min = iata_data["Calendar Year"] >= MIN_YEAR
at_or_before_max = iata_data["Calendar Year"] <= MAX_YEAR

# .copy() so that columns added later do not write into a slice of iata_data
iata_data_filtered = iata_data.loc[at_or_after_min & at_or_before_max].copy()

iata_data_filtered.head()

In [None]:
# list the distinct recipient labels present after the year filter
recipient_labels = iata_data_filtered["Recipient Country or Region"]
countries = recipient_labels.unique()

# wrapped in a Series so the notebook renders it as a numbered list
country_names = pd.Series(countries)
country_names

In [None]:
# convert recipient labels to simpler country names
# keys are the raw "<code> - <name>" labels, values the short display names
country_map = {
    "AF - Afghanistan": "Afghanistan",
    "BD - Bangladesh": "Bangladesh",
    "BO - Bolivia (Plurinational State of)": "Bolivia",
    "CO - Colombia": "Colombia",
    "EG - Egypt": "Egypt",
    "ET - Ethiopia": "Ethiopia",
    "GB - United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom",
    "GE - Georgia": "Georgia",
    "GT - Guatemala": "Guatemala",
    "HN - Honduras": "Honduras",
    "HT - Haiti": "Haiti",
    "ID - Indonesia": "Indonesia",
    "JO - Jordan": "Jordan",
    "KE - Kenya": "Kenya",
    "KH - Cambodia": "Cambodia",
    "LB - Lebanon": "Lebanon",
    "MD - Moldova (the Republic of)": "Moldova",
    "MG - Madagascar": "Madagascar",
    "NG - Nigeria": "Nigeria",
    "NI - Nicaragua": "Nicaragua",
    "NP - Nepal": "Nepal",
    "PH - Philippines (the)": "Philippines",
    "PK - Pakistan": "Pakistan",
    "RW - Rwanda": "Rwanda",
    "SL - Sierra Leone": "Sierra Leone",
    "SN - Senegal": "Senegal",
    "TZ - Tanzania, the United Republic of": "Tanzania",
    "UA - Ukraine": "Ukraine",
    "UG - Uganda": "Uganda",
    "US - United States of America (the)": "United States",
    "VN - Viet Nam": "Vietnam",
    "YE - Yemen": "Yemen"
}

recipient_label = iata_data_filtered["Recipient Country or Region"]

# map countries to simpler form; labels missing from country_map become NaN
iata_data_filtered["country"] = recipient_label.map(country_map)

# get iso alpha id: the text in front of the first " - " separator
label_parts = recipient_label.str.split(" - ", expand=True)
iata_data_filtered["iso_alpha2"] = label_parts[0]

In [None]:
# Inspect the distinct codes produced by the split on " - ".
iata_data_filtered["iso_alpha2"].unique()

In [None]:
# Inspect the mapped names; a NaN here means a recipient label is missing from country_map.
iata_data_filtered["country"].unique()

In [None]:
# dim_country table: one row per distinct (country, iso_alpha2) pair
country_attributes = ["country", "iso_alpha2"]

dim_country = (
    iata_data_filtered[country_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk (1-based row position) and put it in front of the attributes
dim_country = dim_country.assign(country_id=dim_country.index + 1)[
    ["country_id", *country_attributes]
]

dim_country.head()


In [None]:
# extract organizations, types, roles via pivot longer
#
# Both melts list their columns in the same Reporting / Provider / Receiver
# order, so row i of one result describes the same activity and role as row i
# of the other.
organization_name_columns = [
    "Reporting Organisation",
    "Provider Organisation",
    "Receiver Organisation"
]
organization_type_columns = [
    "Reporting Organisation Type",
    "Provider Organisation Type",
    "Receiver Organisation Type"
]

organizations_long = pd.melt(
    iata_data,
    id_vars=["IATI Identifier"],
    value_vars=organization_name_columns,
    var_name="organization_role",
    value_name="organization_name",
)

organization_types_long = pd.melt(
    iata_data,
    id_vars=["IATI Identifier"],
    value_vars=organization_type_columns,
    var_name="role_type",
    value_name="organization_type",
)

In [None]:
# Preview the melted organisation names.
organizations_long.head()

In [None]:
# Preview the melted organisation types.
organization_types_long.head()

In [None]:
# Put each organisation name next to its type (the two melted frames are
# aligned on their row index), drop rows without a name, and shorten the role
# to its first word, e.g. "Reporting Organisation" -> "Reporting".
organizations_combined = (
    pd.concat(
        [organizations_long, organization_types_long["organization_type"]],
        axis=1,
    )
    .dropna(subset=["organization_name"])
    .assign(
        organization_role=lambda frame: frame["organization_role"].str.split().str[0]
    )
)

organizations_combined.head()

In [None]:
# deduplicate rows: one row per distinct (name, type) pair, whatever the role

name_and_type = organizations_combined.loc[:, ["organization_name", "organization_type"]]
organizations_deduped = name_and_type.drop_duplicates().reset_index(drop=True)

organizations_deduped.head()

In [None]:
# extract ids from org name and type, store in separate columns
#
# The patterns expect organization_name to carry an identifier in square
# brackets and organization_type to look like "<digits> - <label>".

# first bracketed token, whatever characters it contains
organizations_deduped["organization_iati_id"] = (
    organizations_deduped["organization_name"]
    .str.extract(r"\[(.*?)\]", expand=False)
)

# Strip the bracketed identifier with the same "anything in brackets" rule
# used to extract it. Stripping only digit-only brackets would leave every
# non-numeric identifier inside the clean name.
organizations_deduped["organization_name_clean"] = (
    organizations_deduped["organization_name"]
    .str.replace(r"\s*\[.*?\]", "", regex=True)
    .str.strip()
)

# leading digits of the type, e.g. "10" from "10 - <label>"
organizations_deduped["organization_type_code"] = (
    organizations_deduped["organization_type"]
    .str.extract(r"^(\d+)", expand=False)
)

# type text with the leading "<digits> - " removed
organizations_deduped["organization_type_name"] = (
    organizations_deduped["organization_type"]
    .str.replace(r"^\d+\s*-\s*", "", regex=True)
)

organizations_deduped.head()

In [None]:
# build dim organization
# the raw organization_name is kept because the fact table joins on it
organization_attributes = [
    "organization_name_clean",
    "organization_iati_id",
    "organization_type_code",
    "organization_type_name",
    "organization_name"
]

dim_organization = organizations_deduped[organization_attributes].reset_index(drop=True)

# add pk (1-based row position) and rearrange so it is the first column
dim_organization = dim_organization.assign(
    organization_id=dim_organization.index + 1
)[["organization_id", *organization_attributes]]

dim_organization.head()

In [None]:
# Show the dimension rows whose raw organisation name is the literal "No data".
dim_organization[dim_organization["organization_name"] == "No data"]


In [None]:
# get sector fields for dim sector: distinct (category, sector) pairs with
# neither value missing

sector_staging = (
    iata_data[["Sector Category", "Sector"]]
    .dropna()
    .drop_duplicates()
)

sector_staging.head()

In [None]:
# split sector category into code + name
raw_sector_category = sector_staging["Sector Category"]

# leading digits are the code
sector_staging["sector_category_code"] = raw_sector_category.str.extract(
    r"^(\d+)", expand=False
)

# the name is what remains once the leading "<digits> - " is removed
sector_staging["sector_category"] = raw_sector_category.str.replace(
    r"^\d+\s*-\s*", "", regex=True
)


In [None]:
# split sector into code + name
raw_sector = sector_staging["Sector"]

# leading digits are the code
sector_staging["sector_code"] = raw_sector.str.extract(r"^(\d+)", expand=False)

# the name is what remains once the leading "<digits> - " is removed
sector_staging["sector_name"] = raw_sector.str.replace(
    r"^\d+\s*-\s*", "", regex=True
)

sector_staging.head()

In [None]:
# set up sector dimension
sector_attributes = [
    "sector_code",
    "sector_name",
    "sector_category_code",
    "sector_category"
]

dim_sector = (
    sector_staging[sector_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add primary key (1-based row position) and reorder so it comes first
dim_sector = dim_sector.assign(sector_id=dim_sector.index + 1)[
    ["sector_id", *sector_attributes]
]

dim_sector.head()

In [None]:
# distinct, non-null "Aid Type" labels used to build the aid type dimension
aid_staging = (
    iata_data[["Aid Type"]]
    .dropna()
    .drop_duplicates()
)
aid_staging.head()

In [None]:
# split aid type into code and name
#
# The patterns expect labels shaped like "<code> - <name>". A label without
# that separator keeps its whole text as the name and yields its first word
# as the code.

aid_staging["aid_type_code"] = aid_staging["Aid Type"].str.extract(r"^(\S+)", expand=False)
aid_staging["aid_type_name"] = aid_staging["Aid Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)


# set the pseudo-code "No" to null so the column holds real codes only.
# The mapping form is deliberate: pandas releases before 1.4 read
# replace("No", None) as "no replacement value given" and forward-fill the
# previous row's code instead of writing a null.
aid_staging["aid_type_code"] = aid_staging["aid_type_code"].replace({"No": None})
aid_staging.head()

In [None]:
# build aid dimension
aid_type_attributes = ["aid_type_code", "aid_type_name"]

dim_aid_type = (
    aid_staging[aid_type_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk (1-based row position) as the first column
dim_aid_type = dim_aid_type.assign(aid_type_id=dim_aid_type.index + 1)[
    ["aid_type_id", *aid_type_attributes]
]

dim_aid_type.head()


In [None]:
# flow type staging: the distinct raw "Flow Type" labels (a null label, if
# present, is kept as one row)
flow_staging = iata_data[["Flow Type"]].drop_duplicates()

flow_staging.head()

In [None]:
# split flow type into code and name
#
# The patterns expect labels shaped like "<code> - <name>". A label without
# that separator keeps its whole text as the name and yields its first word
# as the code.
flow_staging["flow_type_code"] = flow_staging["Flow Type"].str.extract(r"^(\S+)", expand=False)

flow_staging["flow_type_name"] = flow_staging["Flow Type"].str.replace(
    r"^\S+\s*-\s*", "", regex=True
)

# set the pseudo-code "No" to null so the column holds real codes only.
# The mapping form is deliberate: pandas releases before 1.4 read
# replace("No", None) as "no replacement value given" and forward-fill the
# previous row's code instead of writing a null.
flow_staging["flow_type_code"] = flow_staging["flow_type_code"].replace({"No": None})

flow_staging.head()

In [None]:
# build flow dimension
flow_type_attributes = ["flow_type_code", "flow_type_name"]

dim_flow_type = (
    flow_staging[flow_type_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk (1-based row position) as the first column
dim_flow_type = dim_flow_type.assign(flow_type_id=dim_flow_type.index + 1)[
    ["flow_type_id", *flow_type_attributes]
]

dim_flow_type.head()


In [None]:
# transaction type staging: the distinct raw "Transaction Type" labels
transaction_staging = iata_data[["Transaction Type"]].drop_duplicates()

transaction_staging.head()

In [None]:
# split transaction type into code (first whitespace-free token) and name
# (the label with a leading "<code> - " removed)
raw_transaction_type = transaction_staging["Transaction Type"]

transaction_staging["transaction_type_code"] = raw_transaction_type.str.extract(
    r"^(\S+)", expand=False
)
transaction_staging["transaction_type_name"] = raw_transaction_type.str.replace(
    r"^\S+\s*-\s*", "", regex=True
)

transaction_staging.head()

In [None]:
# build transaction dimension
transaction_type_attributes = ["transaction_type_code", "transaction_type_name"]

dim_transaction_type = (
    transaction_staging[transaction_type_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# pk (1-based row position), reordered to the first column
dim_transaction_type = dim_transaction_type.assign(
    transaction_type_id=dim_transaction_type.index + 1
)[["transaction_type_id", *transaction_type_attributes]]

dim_transaction_type.head()



In [None]:
# finance type staging: the distinct raw "Finance Type" labels

finance_staging = iata_data[["Finance Type"]].drop_duplicates()

finance_staging.head()

In [None]:
# extract code and name from finance type
#
# The patterns expect labels shaped like "<code> - <name>". A label without
# that separator keeps its whole text as the name and yields its first word
# as the code.

finance_staging["finance_type_code"] = finance_staging["Finance Type"].str.extract(r"^(\S+)", expand=False)
finance_staging["finance_type_name"] = finance_staging["Finance Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)

# set the pseudo-code "No" to null for consistent null handling.
# The mapping form is deliberate: pandas releases before 1.4 read
# replace("No", None) as "no replacement value given" and forward-fill the
# previous row's code instead of writing a null.
finance_staging["finance_type_code"] = finance_staging["finance_type_code"].replace({"No": None})


finance_staging.head()

In [None]:
# build finance dimension
finance_type_attributes = ["finance_type_code", "finance_type_name"]

dim_finance_type = (
    finance_staging[finance_type_attributes]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk (1-based row position) as the first column
dim_finance_type = dim_finance_type.assign(
    finance_type_id=dim_finance_type.index + 1
)[["finance_type_id", *finance_type_attributes]]

dim_finance_type.head()

In [None]:
# Read the world-development-indicators CSV (relative path, resolved against
# the working directory) and preview it.
world_indicator_data = pd.read_csv("data/world-development-indicators/data2.csv")

world_indicator_data.head()

In [None]:
# long pivot the years: every column that is not an identifier below is
# treated as a year column and becomes one (year, value) row

indicator_id_columns = ["Country Name", "Country Code", "Series Name", "Series Code"]

indicator_long = pd.melt(
    world_indicator_data,
    id_vars=indicator_id_columns,
    var_name="year",
    value_name="value",
)

indicator_long.head()

In [None]:
# the year labels contain more than the year itself, so keep only the first
# run of four digits and store it as an integer

year_digits = indicator_long["year"].str.extract(r"(\d{4})", expand=False)
indicator_long["year"] = year_digits.astype(int)

indicator_long.head(1)

In [None]:
# look at distinct series
# These are the labels that the indicator_map keys have to match exactly.

indicator_long["Series Name"].unique()

In [None]:
# map each series label to a short snake_case metric name; the metric names
# become column names once the table is pivoted

indicator_map = {
    "Population, total": "population",
    "Population density (people per sq. km of land area)": "population_density",
    "GDP per capita (current US$)": "gdp_per_capita",
    "Net ODA received per capita (current US$)": "net_oda_per_capita",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "Mortality rate, under-5 (per 1,000 live births)": "under5_mortality",
    "Primary completion rate, total (% of relevant age group)": "primary_completion_rate",
    "Lower secondary completion rate, total (% of relevant age group)": "lower_secondary_completion_rate",
    "School enrollment, primary (% net)": "school_enrollment_primary"
}

# series labels that are not keys of indicator_map get NaN as their metric
metric_names = indicator_long["Series Name"].map(indicator_map)
indicator_long["metric"] = metric_names

indicator_long.head()

In [None]:
# there are rows with no data at all (verified using R); they have no series
# name that maps to a metric, so drop every row without a metric

indicator_long = indicator_long[indicator_long["metric"].notna()].copy()

# keep only the years that exist in dim_time. A year outside the dimension
# (before its first or after its last year) would come out of the later
# left merge on dim_time with a null time_id.
indicator_long = indicator_long[indicator_long["year"].isin(dim_time["year"])].copy()

# missing values are written as ".." rather than left empty.
# The mapping form is deliberate: pandas releases before 1.4 read
# replace("..", None) as "no replacement value given" and forward-fill the
# previous row's value instead of writing a null.
indicator_long["value"] = indicator_long["value"].replace({"..": None})

# convert numeric; anything else that is not a number becomes NaN
indicator_long["value"] = pd.to_numeric(indicator_long["value"], errors="coerce")



In [None]:
# pivot again to convert metrics to separate columns: one row per
# (country, year), one column per metric

country_year_keys = ["Country Name", "Country Code", "year"]

indicator_wide = pd.pivot_table(
    indicator_long,
    index=country_year_keys,
    columns="metric",
    values="value",
)
indicator_pivoted = indicator_wide.reset_index()

indicator_pivoted.head()

In [None]:
# verify country names
# These are the labels that the country_mapping keys have to match exactly.
indicator_pivoted["Country Name"].unique()

In [None]:
# map the indicator country names to the short names used in dim_country

# names that are already in the short form
unchanged_names = [
    "Afghanistan",
    "Bangladesh",
    "Bolivia",
    "Cambodia",
    "Colombia",
    "Ethiopia",
    "Georgia",
    "Guatemala",
    "Haiti",
    "Honduras",
    "Indonesia",
    "Jordan",
    "Kenya",
    "Lebanon",
    "Madagascar",
    "Moldova",
    "Nepal",
    "Nicaragua",
    "Nigeria",
    "Pakistan",
    "Philippines",
    "Rwanda",
    "Senegal",
    "Sierra Leone",
    "Tanzania",
    "Uganda",
    "Ukraine",
    "United Kingdom",
    "United States",
]

country_mapping = {name: name for name in unchanged_names}

# names that need rewriting
country_mapping.update({
    "Egypt, Arab Rep.": "Egypt",
    "Viet Nam": "Vietnam",
    "Yemen, Rep.": "Yemen",
})

# names that are not keys of country_mapping become NaN
clean_names = indicator_pivoted["Country Name"].map(country_mapping)
indicator_pivoted["country_clean"] = clean_names

indicator_pivoted.head()

In [None]:
# Inspect the mapped names; a NaN here means a country name is missing from country_mapping.
indicator_pivoted["country_clean"].unique()

In [None]:
# add quarter for joining
# The indicator rows carry only a year, while dim_time has one row per
# (year, quarter); every indicator row is given "Q1" so that the later merge
# on both columns can find a time_id.
indicator_pivoted["quarter"] = "Q1"

In [None]:
# attach country_id by matching the cleaned indicator country name to
# dim_country; the left join keeps indicator rows that have no match
country_keys = dim_country[["country_id", "country"]]

fact_indicator = pd.merge(
    indicator_pivoted,
    country_keys,
    how="left",
    left_on="country_clean",
    right_on="country",
)

fact_indicator.head()

In [None]:
# attach time_id by matching (year, quarter) against dim_time; the left join
# keeps rows whose (year, quarter) is not in the dimension
time_keys = ["year", "quarter"]

fact_indicator = pd.merge(fact_indicator, dim_time, how="left", on=time_keys)
fact_indicator.head()


In [None]:
# clean table: keep the two foreign keys followed by the metric columns and
# drop everything else
indicator_foreign_keys = ["country_id", "time_id"]
indicator_metrics = [
    "population",
    "population_density",
    "gdp_per_capita",
    "net_oda_per_capita",
    "life_expectancy",
    "under5_mortality",
    "primary_completion_rate",
    "lower_secondary_completion_rate",
    "school_enrollment_primary"
]

fact_indicator = fact_indicator[indicator_foreign_keys + indicator_metrics]

fact_indicator.head()


In [None]:
# check columns
# Preview the filtered frame, which now includes the derived country and
# iso_alpha2 columns, before building the fact table from it.
iata_data_filtered.head()

In [None]:
# start the fact table from the filtered data, giving the columns it uses
# snake_case names; source columns not listed here keep their original names
fact_column_names = {
    "IATI Identifier": "iati_id",
    "Title": "aid_title",
    "Value (USD)": "value_usd",
    "Value (EUR)": "value_euro",
    "Value (Local currrency)": "value_local",
    "Humanitarian": "humanitarian",
    "Calendar Year": "year",
    "Calendar Quarter": "quarter",
}

fact_iata = iata_data_filtered.rename(columns=fact_column_names)

fact_iata.head()


In [None]:
# add country id to fact table by merging
fact_iata = fact_iata.merge(
    dim_country[["country_id", "country"]],
    on="country",
    how="left"
)

In [None]:
# Preview the fact table after country_id was attached.
fact_iata.head()

In [None]:
# add time id to fact table by merging
fact_iata = fact_iata.merge(
    dim_time,
    left_on=["year", "quarter"],
    right_on=["year", "quarter"],
    how="left"
)

In [None]:
# resolve the reporting organisation to its dim_organization key
#
# dim_organization.organization_type_name holds the type with its leading
# "<digits> - " removed, while the fact column still carries the raw
# "<digits> - <label>" text. Strip the prefix on the fact side first: joining
# the raw text against the stripped label cannot match those rows and would
# leave reporting_org_id null.
reporting_type_label = fact_iata["Reporting Organisation Type"].str.replace(
    r"^\d+\s*-\s*", "", regex=True
)

fact_iata = (
    fact_iata
    .assign(_org_type_label=reporting_type_label)
    .merge(
        dim_organization[[
            "organization_id",
            "organization_name",
            "organization_type_name"
        ]],
        left_on=["Reporting Organisation", "_org_type_label"],
        right_on=["organization_name", "organization_type_name"],
        how="left"
    )
    .rename(columns={"organization_id": "reporting_org_id"})
    # drop the helper key and the dimension's join columns again
    .drop(columns=["_org_type_label", "organization_name", "organization_type_name"])
)


In [None]:
# Preview the fact table after reporting_org_id was attached.
fact_iata.head()

In [None]:
# resolve the provider organisation to its dim_organization key
#
# dim_organization.organization_type_name holds the type with its leading
# "<digits> - " removed, while the fact column still carries the raw
# "<digits> - <label>" text. Strip the prefix on the fact side first: joining
# the raw text against the stripped label cannot match those rows and would
# leave provider_org_id null.
provider_type_label = fact_iata["Provider Organisation Type"].str.replace(
    r"^\d+\s*-\s*", "", regex=True
)

fact_iata = (
    fact_iata
    .assign(_org_type_label=provider_type_label)
    .merge(
        dim_organization[[
            "organization_id",
            "organization_name",
            "organization_type_name"
        ]],
        left_on=["Provider Organisation", "_org_type_label"],
        right_on=["organization_name", "organization_type_name"],
        how="left"
    )
    .rename(columns={"organization_id": "provider_org_id"})
    # drop the helper key and the dimension's join columns again
    .drop(columns=["_org_type_label", "organization_name", "organization_type_name"])
)


In [None]:
# resolve the receiver organisation to its dim_organization key
#
# dim_organization.organization_type_name holds the type with its leading
# "<digits> - " removed, while the fact column still carries the raw
# "<digits> - <label>" text. Strip the prefix on the fact side first: joining
# the raw text against the stripped label cannot match those rows and would
# leave receiving_org_id null.
receiver_type_label = fact_iata["Receiver Organisation Type"].str.replace(
    r"^\d+\s*-\s*", "", regex=True
)

fact_iata = (
    fact_iata
    .assign(_org_type_label=receiver_type_label)
    .merge(
        dim_organization[[
            "organization_id",
            "organization_name",
            "organization_type_name"
        ]],
        left_on=["Receiver Organisation", "_org_type_label"],
        right_on=["organization_name", "organization_type_name"],
        how="left"
    )
    .rename(columns={"organization_id": "receiving_org_id"})
    # drop the helper key and the dimension's join columns again
    .drop(columns=["_org_type_label", "organization_name", "organization_type_name"])
)


In [None]:
# Preview the fact table after the provider and receiver organisation ids were attached.
fact_iata.head()

In [None]:
# Preview the organisation dimension that the three organisation merges join against.
dim_organization.head()