In [1]:
# import etl libraries
import requests
import pandas as pd, numpy as np
from bs4 import BeautifulSoup

In [2]:
# fetch data
iata_data = pd.read_csv("data/country-development-finance/csv/iata_data.csv")

In [3]:
iata_data.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Sector Category,Sector,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency)
0,41120-100879,ROAP/AFG04/22/Secure Communal HLP rights in Af...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-100879,499924.15,432684.914315,38821610.0
1,41120-100879,ROAP/AFG04/22/Secure Communal HLP rights in Af...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UNOCHA-New York,No data,...,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-100879,499940.75,432699.281634,38822900.0
2,41120-102631,ROAP/AFG01/22/Adaptive responses: Evidence bas...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102631,314966.64,272603.981305,24458730.0
3,41120-102631,ROAP/AFG01/22/Adaptive responses: Evidence bas...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UNHCR-Afghanistan,No data,...,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102631,314966.64,272603.981305,24458730.0
4,41120-102645,ROAP/AFG05/22/People-friendly Streets in Afgha...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,430 - Other Multisector,43030 - Urban development and management,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102645,494995.06,428418.781374,38438840.0


In [4]:
# create time dimension table

MIN_YEAR = 1970
MAX_YEAR = 2024

years = list(range(MIN_YEAR, MAX_YEAR + 1))
quarters = ["Q1", "Q2", "Q3", "Q4"]

dim_time = pd.DataFrame([(y, q) for y in years for q in quarters],
                        columns=["year", "quarter"])
dim_time["time_id"] = dim_time.index + 1

dim_time.head()

Unnamed: 0,year,quarter,time_id
0,1970,Q1,1
1,1970,Q2,2
2,1970,Q3,3
3,1970,Q4,4
4,1971,Q1,5


In [5]:
# filter data by year range listed in dim_time

iata_data_filtered = iata_data[
    (iata_data["Calendar Year"] >= MIN_YEAR) &
    (iata_data["Calendar Year"] <= MAX_YEAR)
].copy()

iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Sector Category,Sector,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency)
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,410 - General Environment Protection,41010 - Environmental policy and administrativ...,0,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,"310 - Agriculture, Forestry, Fishing",31191 - Agricultural services,0,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0


In [6]:
# filter out rows with no data in key fields
cols_to_null = [
    "Aid Type",
    "Finance Type",
    "Flow Type",
    "Reporting Organisation Type",
    "Provider Organisation Type",
    "Receiver Organisation Type",
    "Reporting Organisation",
    "Provider Organisation",
    "Receiver Organisation",
    "Sector Category",
    "Sector",
    "Humanitarian",
    "Calendar Quarter",
    "Calendar Year and Quarter",
    "URL"
]


for col in cols_to_null:
    iata_data_filtered[col] = (
        iata_data_filtered[col]
        .replace(["No data", "None", ""], None)
    )

iata_data_filtered.head()


Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Sector Category,Sector,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency)
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120 - Health,12263 - Tuberculosis control,0,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120 - Health,12263 - Tuberculosis control,0,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120 - Health,12263 - Tuberculosis control,0,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,410 - General Environment Protection,41010 - Environmental policy and administrativ...,0,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,"310 - Agriculture, Forestry, Fishing",31191 - Agricultural services,0,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0


In [7]:
# see countries column
countries = iata_data_filtered["Recipient Country or Region"].unique()
country_names = pd.Series(countries)

country_names

0                                      AF - Afghanistan
1                                       BD - Bangladesh
2                 BO - Bolivia (Plurinational State of)
3                                         CO - Colombia
4                                            EG - Egypt
5                                         ET - Ethiopia
6     GB - United Kingdom of Great Britain and North...
7                                          GE - Georgia
8                                        GT - Guatemala
9                                         HN - Honduras
10                                           HT - Haiti
11                                       ID - Indonesia
12                                          JO - Jordan
13                                           KE - Kenya
14                                        KH - Cambodia
15                                         LB - Lebanon
16                       MD - Moldova (the Republic of)
17                                      MG - Mad

In [8]:
# convert countries to simplier country names
country_map = {
    "AF - Afghanistan": "Afghanistan",
    "BD - Bangladesh": "Bangladesh",
    "BO - Bolivia (Plurinational State of)": "Bolivia",
    "CO - Colombia": "Colombia",
    "EG - Egypt": "Egypt",
    "ET - Ethiopia": "Ethiopia",
    "GB - United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom",
    "GE - Georgia": "Georgia",
    "GT - Guatemala": "Guatemala",
    "HN - Honduras": "Honduras",
    "HT - Haiti": "Haiti",
    "ID - Indonesia": "Indonesia",
    "JO - Jordan": "Jordan",
    "KE - Kenya": "Kenya",
    "KH - Cambodia": "Cambodia",
    "LB - Lebanon": "Lebanon",
    "MD - Moldova (the Republic of)": "Moldova",
    "MG - Madagascar": "Madagascar",
    "NG - Nigeria": "Nigeria",
    "NI - Nicaragua": "Nicaragua",
    "NP - Nepal": "Nepal",
    "PH - Philippines (the)": "Philippines",
    "PK - Pakistan": "Pakistan",
    "RW - Rwanda": "Rwanda",
    "SL - Sierra Leone": "Sierra Leone",
    "SN - Senegal": "Senegal",
    "TZ - Tanzania, the United Republic of": "Tanzania",
    "UA - Ukraine": "Ukraine",
    "UG - Uganda": "Uganda",
    "US - United States of America (the)": "United States",
    "VN - Viet Nam": "Vietnam",
    "YE - Yemen": "Yemen"
}


# get iso alpha id
iata_data_filtered.loc[:, "country"] = (
    iata_data_filtered["Recipient Country or Region"]
    .map(country_map)
)

# map countries to simpler form
iata_data_filtered.loc[:, "iso_alpha2"] = (
    iata_data_filtered["Recipient Country or Region"]
    .str.split(" - ", expand=True)[0]
)

In [9]:
# dim_country table
dim_country = (
    iata_data_filtered[["country", "iso_alpha2"]]
        .drop_duplicates()
        .reset_index(drop=True)
)

# add pk
dim_country["country_id"] = dim_country.index + 1


# reorder columns
dim_country = dim_country[["country_id", "country", "iso_alpha2"]]

dim_country = dim_country.rename(columns={"country": "country_name"})

dim_country.head()


Unnamed: 0,country_id,country_name,iso_alpha2
0,1,Afghanistan,AF
1,2,Bangladesh,BD
2,3,Bolivia,BO
3,4,Colombia,CO
4,5,Egypt,EG


In [10]:
# clean organization names
iata_data_filtered["reporting_org_name_clean"] = (
    iata_data_filtered["Reporting Organisation"]
    .str.replace(r"\s*\[\d+\]", "", regex=True)
)

iata_data_filtered["provider_org_name_clean"] = (
    iata_data_filtered["Provider Organisation"]
    .str.replace(r"\s*\[\d+\]", "", regex=True)
)

iata_data_filtered["receiver_org_name_clean"] = (
    iata_data_filtered["Receiver Organisation"]
    .str.replace(r"\s*\[\d+\]", "", regex=True)
)

# get org name id

# extract org iati id
iata_data_filtered["reporting_org_iati_id"] = (
    iata_data_filtered["Reporting Organisation"]
    .str.extract(r"\[(.*?)\]"))

iata_data_filtered["provider_org_iati_id"] = (
    iata_data_filtered["Provider Organisation"]
    .str.extract(r"\[(.*?)\]"))

iata_data_filtered["receiver_org_iati_id"] = (
    iata_data_filtered["Receiver Organisation"]
    .str.extract(r"\[(.*?)\]"))


# extract organization types
iata_data_filtered["reporting_org_type_name"] = (
    iata_data_filtered["Reporting Organisation Type"]
    .str.replace(r"^\d+\s*-\s*", "", regex=True)
)

iata_data_filtered["provider_org_type_name"] = (
    iata_data_filtered["Provider Organisation Type"]
    .str.replace(r"^\d+\s*-\s*", "", regex=True)
)

iata_data_filtered["receiver_org_type_name"] = (
    iata_data_filtered["Receiver Organisation Type"]
    .str.replace(r"^\d+\s*-\s*", "", regex=True)
)

# extract org type code
iata_data_filtered["reporting_org_type_code"] = (
    iata_data_filtered["Reporting Organisation Type"]
    .str.extract(r"^(\d+)")
)

iata_data_filtered["provider_org_type_code"] = (
    iata_data_filtered["Provider Organisation Type"]
    .str.extract(r"^(\d+)")
)

iata_data_filtered["receiver_org_type_code"] = (
    iata_data_filtered["Receiver Organisation Type"]
    .str.extract(r"^(\d+)")
)

iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,receiver_org_name_clean,reporting_org_iati_id,provider_org_iati_id,receiver_org_iati_id,reporting_org_type_name,provider_org_type_name,receiver_org_type_name,reporting_org_type_code,provider_org_type_code,receiver_org_type_code
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,United Nations Office for Project Services (UN...,41AAA,41AAA,41AAA,Multilateral,,Multilateral,40,,40
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,United Nations Office for Project Services (UN...,41AAA,41AAA,41AAA,Multilateral,,Multilateral,40,,40
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,United Nations Office for Project Services (UN...,41AAA,41AAA,41AAA,Multilateral,,Multilateral,40,,40
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,United Nations Office for Project Services (UN...,41AAA,41AAA,41AAA,Multilateral,,Multilateral,40,,40
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,United Nations Office for Project Services (UN...,41AAA,41AAA,41AAA,Multilateral,,Multilateral,40,,40


In [11]:
# add row number for each organization type (needed for later pivoting and merging)
def add_rownum(df, col):
    return df.groupby("IATI Identifier")[col].cumcount()

iata_data_filtered["rep_rn"]  = add_rownum(iata_data_filtered, "Reporting Organisation")
iata_data_filtered["prov_rn"] = add_rownum(iata_data_filtered, "Provider Organisation")
iata_data_filtered["recv_rn"] = add_rownum(iata_data_filtered, "Receiver Organisation")

In [12]:
# pivot to long format for organization names and types, then merge together

# orgnanization names long
organization_long = pd.concat([
    iata_data_filtered[[
        "IATI Identifier",
        "reporting_org_name_clean",
        "reporting_org_iati_id",
        "reporting_org_type_name",
        "reporting_org_type_code",
        "rep_rn"
    ]].rename(columns={
        "reporting_org_name_clean": "organization_name_clean",
        "reporting_org_iati_id": "organization_iati_id",
        "reporting_org_type_name": "organization_type_name",
        "reporting_org_type_code": "organization_type_code",
        "rep_rn": "rn"
    }).assign(role="Reporting"),

# organization types long
    iata_data_filtered[[
        "IATI Identifier",
        "provider_org_name_clean",
        "provider_org_iati_id",
        "provider_org_type_name",
        "provider_org_type_code",
        "prov_rn"
    ]].rename(columns={
        "provider_org_name_clean": "organization_name_clean",
        "provider_org_iati_id": "organization_iati_id",
        "provider_org_type_name": "organization_type_name",
        "provider_org_type_code": "organization_type_code",
        "prov_rn": "rn"
    }).assign(role="Provider"),

    iata_data_filtered[[
        "IATI Identifier",
        "receiver_org_name_clean",
        "receiver_org_iati_id",
        "receiver_org_type_name",
        "receiver_org_type_code",
        "recv_rn"
    ]].rename(columns={
        "receiver_org_name_clean": "organization_name_clean",
        "receiver_org_iati_id": "organization_iati_id",
        "receiver_org_type_name": "organization_type_name",
        "receiver_org_type_code": "organization_type_code",
        "recv_rn": "rn"
    }).assign(role="Receiver")
])

organization_long.head()

Unnamed: 0,IATI Identifier,organization_name_clean,organization_iati_id,organization_type_name,organization_type_code,rn,role
36,41AAA-11295-001,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,0,Reporting
37,41AAA-11295-014,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,0,Reporting
38,41AAA-11295-032,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,0,Reporting
39,41AAA-11960-007,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,0,Reporting
40,41AAA-20431-001,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,0,Reporting


In [13]:
# build dim organization
dim_organization = (
    organization_long[
        [
            "organization_name_clean",
            "organization_iati_id",
            "organization_type_name",
            "organization_type_code"
        ]
    ]
    .dropna( subset=["organization_name_clean", "organization_type_name"], how="all" )
    .drop_duplicates()
    .reset_index(drop=True)
)

# rename column
dim_organization = dim_organization.rename(
    columns={"organization_name_clean": "organization_name"}
)

# add pk
dim_organization["organization_id"] = dim_organization.index + 1

dim_organization.head()

Unnamed: 0,organization_name,organization_iati_id,organization_type_name,organization_type_code,organization_id
0,United Nations Office for Project Services (UN...,41AAA,Multilateral,40,1
1,"The Global Fund to Fight AIDS, Tuberculosis an...",47045,Public Private Partnership,30,2
2,"Gavi, the vaccine alliance",47122,Multilateral,40,3
3,Swedish Committee for Afghanistan [AF-MOE-118],AF-MOE-118,International NGO,21,4
4,Afghanistan Public Policy Research Organizatio...,AF-MOE-1212,"Academic, Training and Research",80,5


In [14]:
iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,receiver_org_iati_id,reporting_org_type_name,provider_org_type_name,receiver_org_type_name,reporting_org_type_code,provider_org_type_code,receiver_org_type_code,rep_rn,prov_rn,recv_rn
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41AAA,Multilateral,,Multilateral,40,,40,0,0,0
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41AAA,Multilateral,,Multilateral,40,,40,0,0,0
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41AAA,Multilateral,,Multilateral,40,,40,0,0,0
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41AAA,Multilateral,,Multilateral,40,,40,0,0,0
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41AAA,Multilateral,,Multilateral,40,,40,0,0,0


In [15]:
# split sector category into code + name
iata_data_filtered["sector_category_code"] = (
    iata_data_filtered["Sector Category"].str.extract(r"^(\d+)")
)

iata_data_filtered["sector_category"] = (
    iata_data_filtered["Sector Category"].str.replace(r"^\d+\s*-\s*", "", regex=True)
)

# split sector into code + name
iata_data_filtered["sector_code"] = (
    iata_data_filtered["Sector"].str.extract(r"^(\d+)")
)

iata_data_filtered["sector_name"] = (
    iata_data_filtered["Sector"].str.replace(r"^\d+\s*-\s*", "", regex=True)
)

iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,reporting_org_type_code,provider_org_type_code,receiver_org_type_code,rep_rn,prov_rn,recv_rn,sector_category_code,sector_category,sector_code,sector_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,,40,0,0,0,120,Health,12263,Tuberculosis control
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,,40,0,0,0,120,Health,12263,Tuberculosis control
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,,40,0,0,0,120,Health,12263,Tuberculosis control
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,,40,0,0,0,410,General Environment Protection,41010,Environmental policy and administrative manage...
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,,40,0,0,0,310,"Agriculture, Forestry, Fishing",31191,Agricultural services


In [16]:
# get sector fields for dim sector

sector_staging = iata_data_filtered[["sector_code", "sector_name", "sector_category_code", "sector_category"]].copy()
sector_staging = sector_staging.dropna().drop_duplicates()

sector_staging.head()

Unnamed: 0,sector_code,sector_name,sector_category_code,sector_category
36,12263,Tuberculosis control,120,Health
39,41010,Environmental policy and administrative manage...,410,General Environment Protection
40,31191,Agricultural services,310,"Agriculture, Forestry, Fishing"
41,14021,Water supply - large systems,140,Water Supply & Sanitation
42,72050,Relief co-ordination and support services,720,Emergency Response


In [17]:
# set up sector dimension

dim_sector = sector_staging[
    [
        "sector_code",
        "sector_name",
        "sector_category_code",
        "sector_category"
    ]
].dropna().drop_duplicates().reset_index(drop=True)

# add primary key
dim_sector["sector_id"] = dim_sector.index + 1

# reorder cols
dim_sector = dim_sector[
    [
        "sector_id",
        "sector_code",
        "sector_name",
        "sector_category_code",
        "sector_category"
    ]
]

dim_sector.head()

Unnamed: 0,sector_id,sector_code,sector_name,sector_category_code,sector_category
0,1,12263,Tuberculosis control,120,Health
1,2,41010,Environmental policy and administrative manage...,410,General Environment Protection
2,3,31191,Agricultural services,310,"Agriculture, Forestry, Fishing"
3,4,14021,Water supply - large systems,140,Water Supply & Sanitation
4,5,72050,Relief co-ordination and support services,720,Emergency Response


In [18]:

# split aid type into code and name
iata_data_filtered["aid_type_code"] = iata_data_filtered["Aid Type"].str.extract(r"^(\S+)")
iata_data_filtered["aid_type_name"] = iata_data_filtered["Aid Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)


# set type code to none for int
iata_data_filtered["aid_type_code"] = iata_data_filtered["aid_type_code"].replace("No", None)
iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,receiver_org_type_code,rep_rn,prov_rn,recv_rn,sector_category_code,sector_category,sector_code,sector_name,aid_type_code,aid_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,0,0,0,120,Health,12263,Tuberculosis control,,
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,0,0,0,120,Health,12263,Tuberculosis control,,
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,0,0,0,120,Health,12263,Tuberculosis control,,
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,0,0,0,410,General Environment Protection,41010,Environmental policy and administrative manage...,,
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,40,0,0,0,310,"Agriculture, Forestry, Fishing",31191,Agricultural services,,


In [19]:
# build aid dimension

dim_aid_type = (
    iata_data_filtered[["aid_type_code", "aid_type_name"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk
dim_aid_type["aid_type_id"] = dim_aid_type.index + 1

dim_aid_type = dim_aid_type[
    ["aid_type_id", "aid_type_name", "aid_type_code"]
]

dim_aid_type.head()


Unnamed: 0,aid_type_id,aid_type_name,aid_type_code
0,1,Project-type interventions,C01
1,2,Contributions to specific-purpose programmes a...,B03
2,3,Scholarships/training in donor country,E01
3,4,Basket funds/pooled funding,B04
4,5,"Core support to NGOs, other private bodies, PP...",B01


In [20]:
# split flow type into code and name
iata_data_filtered["flow_type_code"] = iata_data_filtered["Flow Type"].str.extract(r"^(\S+)")

iata_data_filtered["flow_type_name"] = iata_data_filtered["Flow Type"].str.replace(
    r"^\S+\s*-\s*", "", regex=True
)

# set type code to none as well to support int
iata_data_filtered["flow_type_code"] = iata_data_filtered["flow_type_code"].replace("No", None)

iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,prov_rn,recv_rn,sector_category_code,sector_category,sector_code,sector_name,aid_type_code,aid_type_name,flow_type_code,flow_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,0,0,120,Health,12263,Tuberculosis control,,,,
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,0,0,120,Health,12263,Tuberculosis control,,,,
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,0,0,120,Health,12263,Tuberculosis control,,,,
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,0,0,410,General Environment Protection,41010,Environmental policy and administrative manage...,,,,
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,0,0,310,"Agriculture, Forestry, Fishing",31191,Agricultural services,,,,


In [21]:
# build flow dimension

dim_flow_type = (
    iata_data_filtered[["flow_type_code", "flow_type_name"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk
dim_flow_type["flow_type_id"] = dim_flow_type.index + 1

dim_flow_type = dim_flow_type[
    ["flow_type_id", "flow_type_name", "flow_type_code"]
]


dim_flow_type.head()


Unnamed: 0,flow_type_id,flow_type_name,flow_type_code
0,1,ODA,10
1,2,Non-export credit OOF,21
2,3,Private Development Finance,30
3,4,Other flows,50
4,5,Private Foreign Direct Investment,36


In [22]:
iata_data_filtered["transaction_type_code"] = iata_data_filtered["Transaction Type"].str.extract(r"^(\S+)")
iata_data_filtered["transaction_type_name"] = iata_data_filtered["Transaction Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)


iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,sector_category_code,sector_category,sector_code,sector_name,aid_type_code,aid_type_name,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120,Health,12263,Tuberculosis control,,,,,4,Expenditure
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120,Health,12263,Tuberculosis control,,,,,4,Expenditure
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,120,Health,12263,Tuberculosis control,,,,,4,Expenditure
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,410,General Environment Protection,41010,Environmental policy and administrative manage...,,,,,4,Expenditure
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,310,"Agriculture, Forestry, Fishing",31191,Agricultural services,,,,,4,Expenditure


In [23]:
# build transaction dimension
dim_transaction_type = (
    iata_data_filtered[["transaction_type_code", "transaction_type_name"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
# pk
dim_transaction_type["transaction_type_id"] = dim_transaction_type.index + 1

# reorder cols
dim_transaction_type = dim_transaction_type[
    ["transaction_type_id", "transaction_type_code", "transaction_type_name"]
]

dim_transaction_type.head()

Unnamed: 0,transaction_type_id,transaction_type_code,transaction_type_name
0,1,4,Expenditure
1,2,2,Outgoing Commitment
2,3,3,Disbursement
3,4,1,Incoming Funds
4,5,budget,Budget


In [24]:
# extract code and name from finance type

iata_data_filtered["finance_type_code"] = iata_data_filtered["Finance Type"].str.extract(r"^(\S+)")
iata_data_filtered["finance_type_name"] = iata_data_filtered["Finance Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)

# set type code to no data as well for better null consistency
iata_data_filtered["finance_type_code"] = iata_data_filtered["finance_type_code"].replace("No", None)


iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,sector_code,sector_name,aid_type_code,aid_type_name,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41010,Environmental policy and administrative manage...,,,,,4,Expenditure,,
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,31191,Agricultural services,,,,,4,Expenditure,,


In [25]:
# build finance dimension

dim_finance_type = (
    iata_data_filtered[["finance_type_code", "finance_type_name"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_finance_type["finance_type_id"] = dim_finance_type.index + 1


dim_finance_type = dim_finance_type[
    ["finance_type_id", "finance_type_name", "finance_type_code"]
]

dim_finance_type.head()

Unnamed: 0,finance_type_id,finance_type_name,finance_type_code
0,1,Standard grant,110
1,2,Capital subscription on encashment basis,311
2,3,Standard loan,421
3,4,Acquisition of equity not part of joint ventur...,511
4,5,Debt forgiveness: Other,618


In [26]:
world_indicator_data = pd.read_csv("data/world-development-indicators/data2.csv")

world_indicator_data.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1970 [YR1970],1971 [YR1971],1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,United States,USA,"Population, total",SP.POP.TOTL,205052000.0,207661000.0,209896000.0,211909000.0,213854000.0,215973000.0,...,319257560.0,321815121.0,324353340.0,326608609.0,328529577.0,330226227.0,331577720.0,332099760.0,334017321.0,336806231.0
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,22.3881314035655,22.6729890729952,22.9170124118896,23.1367971909474,23.3491575462716,23.5805156917379,...,34.9013776562134,35.1809713558577,35.4584505795077,35.7049975840182,35.9149986553586,36.1004771837305,36.248222996211,36.3052926398919,36.5149212564854,36.8198061311277
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,5234.2966662115,5609.38259952519,6094.01798986165,6726.35895596695,7225.69135952566,7801.45666356443,...,54973.4207515712,56572.9188996063,57638.1018367192,59635.0984397965,62499.8744390068,64746.4506778863,63515.9491807833,70205.050916026,76657.2488844403,81032.262117545
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,70.8073170731707,71.1073170731707,71.1560975609756,71.3560975609756,71.9560975609756,72.6048780487805,...,78.8414634146341,78.690243902439,78.5390243902439,78.5390243902439,78.6390243902439,78.7878048780488,76.9804878048781,76.3292682926829,77.4341463414634,78.3853658536585
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,23.2,22.4,21.5,20.6,19.7,18.8,...,6.8,6.8,6.7,6.6,6.5,6.5,6.5,6.5,6.5,6.5


In [27]:
# long pivot the years

indicator_long = world_indicator_data.melt(
    id_vars=["Country Name", "Country Code", "Series Name", "Series Code"],
    var_name="year",
    value_name="value"
)

indicator_long.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value
0,United States,USA,"Population, total",SP.POP.TOTL,1970 [YR1970],205052000.0
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,1970 [YR1970],22.3881314035655
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,1970 [YR1970],5234.2966662115
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1970 [YR1970],70.8073170731707
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,1970 [YR1970],23.2


In [28]:
# year has weird values so convert

indicator_long["year"] = indicator_long["year"].str.extract(r"(\d{4})").astype(int)

indicator_long.head(1)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value
0,United States,USA,"Population, total",SP.POP.TOTL,1970,205052000


In [29]:
# look at distinct series

indicator_long["Series Name"].unique()

array(['Population, total',
       'Population density (people per sq. km of land area)',
       'GDP per capita (current US$)',
       'Life expectancy at birth, total (years)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Primary completion rate, total (% of relevant age group)',
       'School enrollment, primary (% net)',
       'Lower secondary completion rate, total (% of relevant age group)',
       'Net ODA received per capita (current US$)', nan], dtype=object)

In [30]:
# make a map to convert the names to readable format

indicator_map = {
    "Population, total": "population",
    "Population density (people per sq. km of land area)": "population_density",
    "GDP per capita (current US$)": "gdp_per_capita",
    "Net ODA received per capita (current US$)": "net_oda_per_capita",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "Mortality rate, under-5 (per 1,000 live births)": "under_5_mortality",
    "Primary completion rate, total (% of relevant age group)": "primary_completion_rate",
    "Lower secondary completion rate, total (% of relevant age group)": "lower_secondary_completion_rate",
    "School enrollment, primary (% net)": "school_enrollment_primary"
}


indicator_long["metric"] = indicator_long["Series Name"].map(indicator_map)

indicator_long.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value,metric
0,United States,USA,"Population, total",SP.POP.TOTL,1970,205052000.0,population
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,1970,22.3881314035655,population_density
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,1970,5234.2966662115,gdp_per_capita
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1970,70.8073170731707,life_expectancy
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,1970,23.2,under_5_mortality


In [31]:
# theres weird rows where theres no data at all so filter it out (verified using R)

indicator_long = indicator_long[indicator_long["metric"].notna()].copy()
indicator_long = indicator_long[indicator_long["year"] < 2024].copy()

# theres "..." entries as opposed to n/a entries
indicator_long["value"] = indicator_long["value"].replace("..", None)

# convert numeric
indicator_long["value"] = pd.to_numeric(indicator_long["value"], errors="coerce")



In [32]:
# pivot again to convert metrics to seperate columns

indicator_pivoted = indicator_long.pivot_table(
    index=["Country Name", "Country Code", "year"],
    columns="metric",
    values="value"
).reset_index()

indicator_pivoted.head()

metric,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under_5_mortality
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6


In [33]:
# verify country names
indicator_pivoted["Country Name"].unique()

array(['Afghanistan', 'Bangladesh', 'Bolivia', 'Cambodia', 'Colombia',
       'Egypt, Arab Rep.', 'Ethiopia', 'Georgia', 'Guatemala', 'Haiti',
       'Honduras', 'Indonesia', 'Jordan', 'Kenya', 'Lebanon',
       'Madagascar', 'Moldova', 'Nepal', 'Nicaragua', 'Nigeria',
       'Pakistan', 'Philippines', 'Rwanda', 'Senegal', 'Sierra Leone',
       'Tanzania', 'Uganda', 'Ukraine', 'United Kingdom', 'United States',
       'Viet Nam', 'Yemen, Rep.'], dtype=object)

In [34]:
# map country names to

country_mapping = {
    "Afghanistan": "Afghanistan",
    "Bangladesh": "Bangladesh",
    "Bolivia": "Bolivia",
    "Cambodia": "Cambodia",
    "Colombia": "Colombia",
    "Egypt, Arab Rep.": "Egypt",
    "Ethiopia": "Ethiopia",
    "Georgia": "Georgia",
    "Guatemala": "Guatemala",
    "Haiti": "Haiti",
    "Honduras": "Honduras",
    "Indonesia": "Indonesia",
    "Jordan": "Jordan",
    "Kenya": "Kenya",
    "Lebanon": "Lebanon",
    "Madagascar": "Madagascar",
    "Moldova": "Moldova",
    "Nepal": "Nepal",
    "Nicaragua": "Nicaragua",
    "Nigeria": "Nigeria",
    "Pakistan": "Pakistan",
    "Philippines": "Philippines",
    "Rwanda": "Rwanda",
    "Senegal": "Senegal",
    "Sierra Leone": "Sierra Leone",
    "Tanzania": "Tanzania",
    "Uganda": "Uganda",
    "Ukraine": "Ukraine",
    "United Kingdom": "United Kingdom",
    "United States": "United States",
    "Viet Nam": "Vietnam",
    "Yemen, Rep.": "Yemen"
}

indicator_pivoted["country_clean"] = indicator_pivoted["Country Name"].map(country_mapping)

indicator_pivoted.head()

metric,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under_5_mortality,country_clean
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan


In [35]:
indicator_pivoted["country_clean"].unique()

array(['Afghanistan', 'Bangladesh', 'Bolivia', 'Cambodia', 'Colombia',
       'Egypt', 'Ethiopia', 'Georgia', 'Guatemala', 'Haiti', 'Honduras',
       'Indonesia', 'Jordan', 'Kenya', 'Lebanon', 'Madagascar', 'Moldova',
       'Nepal', 'Nicaragua', 'Nigeria', 'Pakistan', 'Philippines',
       'Rwanda', 'Senegal', 'Sierra Leone', 'Tanzania', 'Uganda',
       'Ukraine', 'United Kingdom', 'United States', 'Vietnam', 'Yemen'],
      dtype=object)

In [36]:
# add quarter for joining
indicator_pivoted["quarter"] = "Q1"

In [37]:
fact_indicator = indicator_pivoted.merge(
    dim_country[["country_id", "country_name"]],
    left_on="country_clean",
    right_on="country_name",
    how="left"
)

fact_indicator.head()

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under_5_mortality,country_clean,quarter,country_id,country_name
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan,Q1,1,Afghanistan
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan,Q1,1,Afghanistan
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan,Q1,1,Afghanistan
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan,Q1,1,Afghanistan
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan,Q1,1,Afghanistan


In [38]:
fact_indicator = fact_indicator.merge(
    dim_time,
    on=["year", "quarter"],
    how="left"
)
fact_indicator.head()


Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under_5_mortality,country_clean,quarter,country_id,country_name,time_id
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan,Q1,1,Afghanistan,1
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan,Q1,1,Afghanistan,5
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan,Q1,1,Afghanistan,9
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan,Q1,1,Afghanistan,13
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan,Q1,1,Afghanistan,17


In [39]:
# check columns
iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,sector_code,sector_name,aid_type_code,aid_type_name,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41010,Environmental policy and administrative manage...,,,,,4,Expenditure,,
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,31191,Agricultural services,,,,,4,Expenditure,,


In [40]:
iata_data_filtered["provider_org_name_clean"].head()

36    United Nations Office for Project Services (UN...
37    United Nations Office for Project Services (UN...
38    United Nations Office for Project Services (UN...
39    United Nations Office for Project Services (UN...
40    United Nations Office for Project Services (UN...
Name: provider_org_name_clean, dtype: object

In [41]:
fact_iata = iata_data_filtered.rename(columns={
    "IATI Identifier": "iati_id",
    "Title": "aid_title",
    "Value (USD)": "value_usd",
    "Value (EUR)": "value_euro",
    "Value (Local currrency)": "value_local",
    "Humanitarian": "humanitarian",
    "Calendar Year": "year",
    "Calendar Quarter": "quarter",
})

fact_iata.head()


Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,sector_code,sector_name,aid_type_code,aid_type_name,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,12263,Tuberculosis control,,,,,4,Expenditure,,
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,41010,Environmental policy and administrative manage...,,,,,4,Expenditure,,
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,31191,Agricultural services,,,,,4,Expenditure,,


In [42]:
# add country id to fact table by merging
fact_iata = fact_iata.merge(
    dim_country[["country_id", "country_name"]],
    left_on="country",
    right_on="country_name",
    how="left"
)

In [43]:
fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,aid_type_code,aid_type_name,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name,country_id,country_name
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,,,4,Expenditure,,,1,Afghanistan
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,,,4,Expenditure,,,1,Afghanistan
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,,,4,Expenditure,,,1,Afghanistan
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,,,4,Expenditure,,,1,Afghanistan
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,,,4,Expenditure,,,1,Afghanistan


In [44]:
# add time id to fact table by merging
fact_iata = fact_iata.merge(
    dim_time,
    left_on=["year", "quarter"],
    right_on=["year", "quarter"],
    how="left"
)

In [45]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["reporting_org_name_clean", "reporting_org_type_name"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "reporting_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [46]:
fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,flow_type_code,flow_type_name,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name,country_id,country_name,time_id,reporting_org_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,4,Expenditure,,,1,Afghanistan,185,1
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,4,Expenditure,,,1,Afghanistan,193,1
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,4,Expenditure,,,1,Afghanistan,206,1
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,4,Expenditure,,,1,Afghanistan,183,1
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,,,4,Expenditure,,,1,Afghanistan,189,1


In [47]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["provider_org_name_clean", "provider_org_type_name"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "provider_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [48]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["receiver_org_name_clean", "receiver_org_type_name"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "receiving_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [49]:
fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,transaction_type_code,transaction_type_name,finance_type_code,finance_type_name,country_id,country_name,time_id,reporting_org_id,provider_org_id,receiving_org_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,4,Expenditure,,,1,Afghanistan,185,1,1201.0,1.0
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,4,Expenditure,,,1,Afghanistan,193,1,1201.0,1.0
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,4,Expenditure,,,1,Afghanistan,206,1,1201.0,1.0
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,4,Expenditure,,,1,Afghanistan,183,1,1201.0,1.0
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,4,Expenditure,,,1,Afghanistan,189,1,1201.0,1.0


In [50]:

fact_iata = fact_iata.merge(
    dim_aid_type[["aid_type_code", "aid_type_id"]],
    on="aid_type_code",
    how="left"
)

fact_iata = fact_iata.merge(
    dim_finance_type[["finance_type_code", "finance_type_id"]],
    on="finance_type_code",
    how="left"
)


fact_iata = fact_iata.merge(
    dim_flow_type[["flow_type_name", "flow_type_id"]],
    on="flow_type_name",
    how="left"
)


fact_iata = fact_iata.merge(
    dim_transaction_type[["transaction_type_code", "transaction_type_id"]],
    on="transaction_type_code",
    how="left"
)


fact_iata = fact_iata.merge(
    dim_sector[["sector_code", "sector_id"]],
    on="sector_code",
    how="left"
)

fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,country_name,time_id,reporting_org_id,provider_org_id,receiving_org_id,aid_type_id,finance_type_id,flow_type_id,transaction_type_id,sector_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,185,1,1201.0,1.0,,,,1,1.0
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,193,1,1201.0,1.0,,,,1,1.0
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,206,1,1201.0,1.0,,,,1,1.0
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,183,1,1201.0,1.0,,,,1,2.0
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,189,1,1201.0,1.0,,,,1,3.0


In [51]:
fk_cols = [
    "country_id",
    "time_id",
    "reporting_org_id",
    "provider_org_id",
    "receiving_org_id",
    "aid_type_id",
    "finance_type_id",
    "flow_type_id",
    "transaction_type_id",
    "sector_id"
]

for col in fk_cols:
    fact_iata[col] = fact_iata[col].astype("Int64")

fact_iata.head()


Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,country_name,time_id,reporting_org_id,provider_org_id,receiving_org_id,aid_type_id,finance_type_id,flow_type_id,transaction_type_id,sector_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,185,1,1201,1,,,,1,1
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,193,1,1201,1,,,,1,1
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,206,1,1201,1,,,,1,1
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,183,1,1201,1,,,,1,2
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,,,,United Nations Office for Project Services (UN...,,...,Afghanistan,189,1,1201,1,,,,1,3


In [52]:
# select and reorder columns for fact table

fact_iata = fact_iata[
   [
        "iati_id",
        "aid_title",
        "reporting_org_id",
        "provider_org_id",
        "receiving_org_id",
        "sector_id",
        "time_id",
        "country_id",
        "aid_type_id",
        "finance_type_id",
        "flow_type_id",
        "transaction_type_id",
        "value_usd",
        "humanitarian"
    ]
]

fact_indicator = fact_indicator[
    [
        "country_id",
        "time_id",
        "population",
        "gdp_per_capita",
        "net_oda_per_capita",
        "life_expectancy",
        "under_5_mortality",
        "primary_completion_rate",
        "lower_secondary_completion_rate",
        "population_density"
    ]
]



In [53]:
# export data into csv
import os
os.makedirs("data", exist_ok=True)

directory = "data/etl_output"

tables = {
    "dim_country": dim_country,
    "dim_time": dim_time,
    "dim_sector": dim_sector,
    "dim_finance_type": dim_finance_type,
    "dim_flow_type": dim_flow_type,
    "dim_transaction_type": dim_transaction_type,
    "dim_aid_type": dim_aid_type,
    "dim_organization": dim_organization,
    "fact_aid_transaction": fact_iata,
    "fact_country_context": fact_indicator
}

# save each table to CSV inside your directory
for name, df in tables.items():
    df.to_csv(f"{directory}/{name}.csv", index=False)
