In [1]:
# import etl libraries
import requests
import pandas as pd, numpy as np
from bs4 import BeautifulSoup

In [2]:
# fetch data
iata_data = pd.read_csv("data/country-development-finance/csv/iata_data.csv")

In [3]:
iata_data.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Sector Category,Sector,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency)
0,41120-100879,ROAP/AFG04/22/Secure Communal HLP rights in Af...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-100879,499924.15,432684.914315,38821610.0
1,41120-100879,ROAP/AFG04/22/Secure Communal HLP rights in Af...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UNOCHA-New York,No data,...,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-100879,499940.75,432699.281634,38822900.0
2,41120-102631,ROAP/AFG01/22/Adaptive responses: Evidence bas...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102631,314966.64,272603.981305,24458730.0
3,41120-102631,ROAP/AFG01/22/Adaptive responses: Evidence bas...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UNHCR-Afghanistan,No data,...,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102631,314966.64,272603.981305,24458730.0
4,41120-102645,ROAP/AFG05/22/People-friendly Streets in Afgha...,UN - United Nations,UN-Habitat [41120],40 - Multilateral,No data,No data,No data,UN-Habitat [41120],40 - Multilateral,...,430 - Other Multisector,43030 - Urban development and management,0,2025,Q4,2025 Q4,https://d-portal.org/q.html?aid=41120-102645,494995.06,428418.781374,38438840.0


In [4]:
# create time dimension table

MIN_YEAR = 1970
MAX_YEAR = 2024

years = list(range(MIN_YEAR, MAX_YEAR))
quarters = ["Q1", "Q2", "Q3", "Q4"]

dim_time = pd.DataFrame([(y, q) for y in years for q in quarters],
                        columns=["year", "quarter"])
dim_time["time_id"] = dim_time.index + 1

dim_time.head()

Unnamed: 0,year,quarter,time_id
0,1970,Q1,1
1,1970,Q2,2
2,1970,Q3,3
3,1970,Q4,4
4,1971,Q1,5


In [5]:
# filter data by year range listed in dim_time

iata_data_filtered = iata_data[
    (iata_data["Calendar Year"] >= MIN_YEAR) &
    (iata_data["Calendar Year"] <= MAX_YEAR)
].copy()

iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Sector Category,Sector,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency)
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,120 - Health,12263 - Tuberculosis control,0,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,410 - General Environment Protection,41010 - Environmental policy and administrativ...,0,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,"310 - Agriculture, Forestry, Fishing",31191 - Agricultural services,0,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0


In [6]:
# see countries column
countries = iata_data_filtered["Recipient Country or Region"].unique()
country_names = pd.Series(countries)

country_names

0                                      AF - Afghanistan
1                                       BD - Bangladesh
2                 BO - Bolivia (Plurinational State of)
3                                         CO - Colombia
4                                            EG - Egypt
5                                         ET - Ethiopia
6     GB - United Kingdom of Great Britain and North...
7                                          GE - Georgia
8                                        GT - Guatemala
9                                         HN - Honduras
10                                           HT - Haiti
11                                       ID - Indonesia
12                                          JO - Jordan
13                                           KE - Kenya
14                                        KH - Cambodia
15                                         LB - Lebanon
16                       MD - Moldova (the Republic of)
17                                      MG - Mad

In [7]:
# convert countries to simplier country names
country_map = {
    "AF - Afghanistan": "Afghanistan",
    "BD - Bangladesh": "Bangladesh",
    "BO - Bolivia (Plurinational State of)": "Bolivia",
    "CO - Colombia": "Colombia",
    "EG - Egypt": "Egypt",
    "ET - Ethiopia": "Ethiopia",
    "GB - United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom",
    "GE - Georgia": "Georgia",
    "GT - Guatemala": "Guatemala",
    "HN - Honduras": "Honduras",
    "HT - Haiti": "Haiti",
    "ID - Indonesia": "Indonesia",
    "JO - Jordan": "Jordan",
    "KE - Kenya": "Kenya",
    "KH - Cambodia": "Cambodia",
    "LB - Lebanon": "Lebanon",
    "MD - Moldova (the Republic of)": "Moldova",
    "MG - Madagascar": "Madagascar",
    "NG - Nigeria": "Nigeria",
    "NI - Nicaragua": "Nicaragua",
    "NP - Nepal": "Nepal",
    "PH - Philippines (the)": "Philippines",
    "PK - Pakistan": "Pakistan",
    "RW - Rwanda": "Rwanda",
    "SL - Sierra Leone": "Sierra Leone",
    "SN - Senegal": "Senegal",
    "TZ - Tanzania, the United Republic of": "Tanzania",
    "UA - Ukraine": "Ukraine",
    "UG - Uganda": "Uganda",
    "US - United States of America (the)": "United States",
    "VN - Viet Nam": "Vietnam",
    "YE - Yemen": "Yemen"
}


# get iso alpha id
iata_data_filtered.loc[:, "country"] = (
    iata_data_filtered["Recipient Country or Region"]
    .map(country_map)
)

# map countries to simpler form
iata_data_filtered.loc[:, "iso_alpha2"] = (
    iata_data_filtered["Recipient Country or Region"]
    .str.split(" - ", expand=True)[0]
)

In [8]:
iata_data_filtered["iso_alpha2"].unique()

array(['AF', 'BD', 'BO', 'CO', 'EG', 'ET', 'GB', 'GE', 'GT', 'HN', 'HT',
       'ID', 'JO', 'KE', 'KH', 'LB', 'MD', 'MG', 'NG', 'NI', 'NP', 'PH',
       'PK', 'RW', 'SL', 'SN', 'TZ', 'UA', 'UG', 'US', 'VN', 'YE'],
      dtype=object)

In [9]:
iata_data_filtered["country"].unique()

array(['Afghanistan', 'Bangladesh', 'Bolivia', 'Colombia', 'Egypt',
       'Ethiopia', 'United Kingdom', 'Georgia', 'Guatemala', 'Honduras',
       'Haiti', 'Indonesia', 'Jordan', 'Kenya', 'Cambodia', 'Lebanon',
       'Moldova', 'Madagascar', 'Nigeria', 'Nicaragua', 'Nepal',
       'Philippines', 'Pakistan', 'Rwanda', 'Sierra Leone', 'Senegal',
       'Tanzania', 'Ukraine', 'Uganda', 'United States', 'Vietnam',
       'Yemen'], dtype=object)

In [10]:
# dim_country table
dim_country = (
    iata_data_filtered[["country", "iso_alpha2"]]
        .drop_duplicates()
        .reset_index(drop=True)
)

# add pk
dim_country["country_id"] = dim_country.index + 1


# reorder columns
dim_country = dim_country[["country_id", "country", "iso_alpha2"]]

dim_country.head()


Unnamed: 0,country_id,country,iso_alpha2
0,1,Afghanistan,AF
1,2,Bangladesh,BD
2,3,Bolivia,BO
3,4,Colombia,CO
4,5,Egypt,EG


In [None]:
# extract organizations, types, roles via pivot longer
organizations_long = iata_data_filtered.melt(
    id_vars = ["IATI Identifier"],
    value_vars = [
        "Reporting Organisation",
        "Provider Organisation",
        "Receiver Organisation"
    ],
    var_name = "organization_role",
    value_name = "organization_name"
)

organization_types_long = iata_data_filtered.melt(
    id_vars = ["IATI Identifier"],
    value_vars = [
        "Reporting Organisation Type",
        "Provider Organisation Type",
        "Receiver Organisation Type"
    ],
    var_name = "role_type",
    value_name = "organization_type"
)

In [12]:
organizations_long.head()

Unnamed: 0,IATI Identifier,organization_role,organization_name
0,41120-100879,Reporting Organisation,UN-Habitat [41120]
1,41120-100879,Reporting Organisation,UN-Habitat [41120]
2,41120-102631,Reporting Organisation,UN-Habitat [41120]
3,41120-102631,Reporting Organisation,UN-Habitat [41120]
4,41120-102645,Reporting Organisation,UN-Habitat [41120]


In [13]:
organization_types_long.head()

Unnamed: 0,IATI Identifier,role_type,organization_type
0,41120-100879,Reporting Organisation Type,40 - Multilateral
1,41120-100879,Reporting Organisation Type,40 - Multilateral
2,41120-102631,Reporting Organisation Type,40 - Multilateral
3,41120-102631,Reporting Organisation Type,40 - Multilateral
4,41120-102645,Reporting Organisation Type,40 - Multilateral


In [14]:
organizations_combined = pd.concat([organizations_long, organization_types_long["organization_type"]], axis=1)
organizations_combined = organizations_combined.dropna(subset=["organization_name"])
organizations_combined["organization_role"] = organizations_combined["organization_role"].str.split().str[0]

organizations_combined.head()

Unnamed: 0,IATI Identifier,organization_role,organization_name,organization_type
0,41120-100879,Reporting,UN-Habitat [41120],40 - Multilateral
1,41120-100879,Reporting,UN-Habitat [41120],40 - Multilateral
2,41120-102631,Reporting,UN-Habitat [41120],40 - Multilateral
3,41120-102631,Reporting,UN-Habitat [41120],40 - Multilateral
4,41120-102645,Reporting,UN-Habitat [41120],40 - Multilateral


In [15]:
# deduplicate rows

organizations_deduped = (
    organizations_combined[
        ["organization_name", "organization_type"]
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

organizations_deduped.head()

Unnamed: 0,organization_name,organization_type
0,UN-Habitat [41120],40 - Multilateral
1,United Nations Office for Project Services (UN...,40 - Multilateral
2,"The Global Fund to Fight AIDS, Tuberculosis an...",30 - Public Private Partnership
3,"Gavi, the vaccine alliance [47122]",40 - Multilateral
4,Swedish Committee for Afghanistan [AF-MOE-118],21 - International NGO


In [16]:
# extract ids from org name and type, store in seperate columns

organizations_deduped["organization_iati_id"] = (
    organizations_deduped["organization_name"]
    .str.extract(r"\[(.*?)\]")
)

organizations_deduped["organization_name_clean"] = (
    organizations_deduped["organization_name"]
    .str.replace(r"\s*\[\d+\]", "", regex=True)
)

organizations_deduped["organization_type_code"] = (
    organizations_deduped["organization_type"]
    .str.extract(r"^(\d+)")
)

organizations_deduped["organization_type_name"] = (
    organizations_deduped["organization_type"]
    .str.replace(r"^\d+\s*-\s*", "", regex=True)
)

organizations_deduped.head()

Unnamed: 0,organization_name,organization_type,organization_iati_id,organization_name_clean,organization_type_code,organization_type_name
0,UN-Habitat [41120],40 - Multilateral,41120,UN-Habitat,40,Multilateral
1,United Nations Office for Project Services (UN...,40 - Multilateral,41AAA,United Nations Office for Project Services (UN...,40,Multilateral
2,"The Global Fund to Fight AIDS, Tuberculosis an...",30 - Public Private Partnership,47045,"The Global Fund to Fight AIDS, Tuberculosis an...",30,Public Private Partnership
3,"Gavi, the vaccine alliance [47122]",40 - Multilateral,47122,"Gavi, the vaccine alliance",40,Multilateral
4,Swedish Committee for Afghanistan [AF-MOE-118],21 - International NGO,AF-MOE-118,Swedish Committee for Afghanistan [AF-MOE-118],21,International NGO


In [17]:
# build dim organization
dim_organization = organizations_deduped[
    [
        "organization_name_clean",
        "organization_iati_id",
        "organization_type_code",
        "organization_type_name",
        "organization_name"
    ]
].reset_index(drop=True)

dim_organization["organization_id"] = dim_organization.index + 1


# add pk
dim_organization["organization_id"] = dim_organization.index + 1

# rearrange org columns
dim_organization = dim_organization[
    [
        "organization_id",
        "organization_name_clean",
        "organization_iati_id",
        "organization_type_code",
        "organization_type_name",
        "organization_name"
    ]
]

dim_organization.head()

Unnamed: 0,organization_id,organization_name_clean,organization_iati_id,organization_type_code,organization_type_name,organization_name
0,1,UN-Habitat,41120,40,Multilateral,UN-Habitat [41120]
1,2,United Nations Office for Project Services (UN...,41AAA,40,Multilateral,United Nations Office for Project Services (UN...
2,3,"The Global Fund to Fight AIDS, Tuberculosis an...",47045,30,Public Private Partnership,"The Global Fund to Fight AIDS, Tuberculosis an..."
3,4,"Gavi, the vaccine alliance",47122,40,Multilateral,"Gavi, the vaccine alliance [47122]"
4,5,Swedish Committee for Afghanistan [AF-MOE-118],AF-MOE-118,21,International NGO,Swedish Committee for Afghanistan [AF-MOE-118]


In [18]:
dim_organization[dim_organization["organization_name"] == "No data"]


Unnamed: 0,organization_id,organization_name_clean,organization_iati_id,organization_type_code,organization_type_name,organization_name
1242,1243,No data,,,No data,No data
1285,1286,No data,,21.0,International NGO,No data
1387,1388,No data,,90.0,Other,No data
1783,1784,No data,,80.0,"Academic, Training and Research",No data
1788,1789,No data,,22.0,National NGO,No data
1820,1821,No data,,10.0,Government,No data
2066,2067,No data,,40.0,Multilateral,No data
2278,2279,No data,,71.0,Private Sector in Provider Country,No data
3794,3795,No data,,60.0,Foundation,No data
4061,4062,No data,,70.0,Private Sector,No data


In [None]:
# get sector fields for dim sector

sector_staging = iata_data_filtered[["Sector Category", "Sector"]].copy()
sector_staging = sector_staging.dropna().drop_duplicates()

sector_staging.head()

Unnamed: 0,Sector Category,Sector
0,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...
2,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...
4,430 - Other Multisector,43030 - Urban development and management
16,430 - Other Multisector,43082 - Research/scientific institutions
36,120 - Health,12263 - Tuberculosis control


In [20]:
# split sector category into code + name
sector_staging["sector_category_code"] = (
    sector_staging["Sector Category"].str.extract(r"^(\d+)")
)

sector_staging["sector_category"] = (
    sector_staging["Sector Category"].str.replace(r"^\d+\s*-\s*", "", regex=True)
)


In [21]:
# split sector into code + name
sector_staging["sector_code"] = (
    sector_staging["Sector"].str.extract(r"^(\d+)")
)

sector_staging["sector_name"] = (
    sector_staging["Sector"].str.replace(r"^\d+\s*-\s*", "", regex=True)
)

sector_staging.head()

Unnamed: 0,Sector Category,Sector,sector_category_code,sector_category,sector_code,sector_name
0,160 - Other Social Infrastructure & Services,16030 - Housing policy and administrative mana...,160,Other Social Infrastructure & Services,16030,Housing policy and administrative management
2,730 - Reconstruction Relief & Rehabilitation,73010 - Immediate post-emergency reconstructio...,730,Reconstruction Relief & Rehabilitation,73010,Immediate post-emergency reconstruction and re...
4,430 - Other Multisector,43030 - Urban development and management,430,Other Multisector,43030,Urban development and management
16,430 - Other Multisector,43082 - Research/scientific institutions,430,Other Multisector,43082,Research/scientific institutions
36,120 - Health,12263 - Tuberculosis control,120,Health,12263,Tuberculosis control


In [22]:
# set up sector dimension

dim_sector = sector_staging[
    [
        "sector_code",
        "sector_name",
        "sector_category_code",
        "sector_category"
    ]
].drop_duplicates().reset_index(drop=True)

# add primary key
dim_sector["sector_id"] = dim_sector.index + 1

# reorder cols
dim_sector = dim_sector[
    [
        "sector_id",
        "sector_code",
        "sector_name",
        "sector_category_code",
        "sector_category"
    ]
]

dim_sector.head()

Unnamed: 0,sector_id,sector_code,sector_name,sector_category_code,sector_category
0,1,16030,Housing policy and administrative management,160,Other Social Infrastructure & Services
1,2,73010,Immediate post-emergency reconstruction and re...,730,Reconstruction Relief & Rehabilitation
2,3,43030,Urban development and management,430,Other Multisector
3,4,43082,Research/scientific institutions,430,Other Multisector
4,5,12263,Tuberculosis control,120,Health


In [None]:
aid_staging = iata_data_filtered[["Aid Type"]].dropna().drop_duplicates()
aid_staging.head()

Unnamed: 0,Aid Type
0,No data
69,C01 - Project-type interventions
1181,B03 - Contributions to specific-purpose progra...
1775,E01 - Scholarships/training in donor country
1956,B04 - Basket funds/pooled funding


In [24]:
# split aid type into code and name


aid_staging["aid_type_code"] = aid_staging["Aid Type"].str.extract(r"^(\S+)")
aid_staging["aid_type_name"] = aid_staging["Aid Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)


# set type code to none for int
aid_staging["aid_type_code"] = aid_staging["aid_type_code"].replace("No", None)
aid_staging.head()

Unnamed: 0,Aid Type,aid_type_code,aid_type_name
0,No data,,No data
69,C01 - Project-type interventions,C01,Project-type interventions
1181,B03 - Contributions to specific-purpose progra...,B03,Contributions to specific-purpose programmes a...
1775,E01 - Scholarships/training in donor country,E01,Scholarships/training in donor country
1956,B04 - Basket funds/pooled funding,B04,Basket funds/pooled funding


In [25]:
# build aid dimension

dim_aid_type = (
    aid_staging[["aid_type_code", "aid_type_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk
dim_aid_type["aid_type_id"] = dim_aid_type.index + 1

dim_aid_type = dim_aid_type[
    ["aid_type_id", "aid_type_code", "aid_type_name"]
]

dim_aid_type.head()


Unnamed: 0,aid_type_id,aid_type_code,aid_type_name
0,1,,No data
1,2,C01,Project-type interventions
2,3,B03,Contributions to specific-purpose programmes a...
3,4,E01,Scholarships/training in donor country
4,5,B04,Basket funds/pooled funding


In [None]:
# flow type staging
flow_staging = iata_data_filtered[["Flow Type"]].copy()
flow_staging = flow_staging.drop_duplicates()


flow_staging.head()

Unnamed: 0,Flow Type
0,No data
69,10 - ODA
2531,21 - Non-export credit OOF
4421,30 - Private Development Finance
4427,50 - Other flows


In [27]:
# split flow type into code and name
flow_staging["flow_type_code"] = flow_staging["Flow Type"].str.extract(r"^(\S+)")

flow_staging["flow_type_name"] = flow_staging["Flow Type"].str.replace(
    r"^\S+\s*-\s*", "", regex=True
)

# set type code to none as well to support int
flow_staging["flow_type_code"] = flow_staging["flow_type_code"].replace("No", None)

flow_staging.head()

Unnamed: 0,Flow Type,flow_type_code,flow_type_name
0,No data,,No data
69,10 - ODA,10.0,ODA
2531,21 - Non-export credit OOF,21.0,Non-export credit OOF
4421,30 - Private Development Finance,30.0,Private Development Finance
4427,50 - Other flows,50.0,Other flows


In [28]:
# build flow dimension

dim_flow_type = (
    flow_staging[["flow_type_code", "flow_type_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# add pk
dim_flow_type["flow_type_id"] = dim_flow_type.index + 1

dim_flow_type = dim_flow_type[
    ["flow_type_id", "flow_type_code", "flow_type_name"]
]


dim_flow_type.head()


Unnamed: 0,flow_type_id,flow_type_code,flow_type_name
0,1,,No data
1,2,10.0,ODA
2,3,21.0,Non-export credit OOF
3,4,30.0,Private Development Finance
4,5,50.0,Other flows


In [None]:
transaction_staging = iata_data_filtered[["Transaction Type"]].copy()
transaction_staging = transaction_staging.drop_duplicates()


transaction_staging.head()

Unnamed: 0,Transaction Type
0,4 - Expenditure
1,1 - Incoming Funds
69,2 - Outgoing Commitment
72,3 - Disbursement
73955,budget - Budget


In [30]:
transaction_staging["transaction_type_code"] = transaction_staging["Transaction Type"].str.extract(r"^(\S+)")
transaction_staging["transaction_type_name"] = transaction_staging["Transaction Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)


transaction_staging.head()

Unnamed: 0,Transaction Type,transaction_type_code,transaction_type_name
0,4 - Expenditure,4,Expenditure
1,1 - Incoming Funds,1,Incoming Funds
69,2 - Outgoing Commitment,2,Outgoing Commitment
72,3 - Disbursement,3,Disbursement
73955,budget - Budget,budget,Budget


In [31]:
# build transaction dimension
dim_transaction_type = (
    transaction_staging[["transaction_type_code", "transaction_type_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
# pk
dim_transaction_type["transaction_type_id"] = dim_transaction_type.index + 1

# reorder cols
dim_transaction_type = dim_transaction_type[
    ["transaction_type_id", "transaction_type_code", "transaction_type_name"]
]

dim_transaction_type.head()



Unnamed: 0,transaction_type_id,transaction_type_code,transaction_type_name
0,1,4,Expenditure
1,2,1,Incoming Funds
2,3,2,Outgoing Commitment
3,4,3,Disbursement
4,5,budget,Budget


In [None]:
# finance type staging

finance_staging = iata_data_filtered[["Finance Type"]].copy()
finance_staging = finance_staging.drop_duplicates()

finance_staging.head()

Unnamed: 0,Finance Type
0,No data
69,110 - Standard grant
6078,311 - Capital subscription on encashment basis
8969,421 - Standard loan
12165,511 - Acquisition of equity not part of joint ...


In [33]:
# extract code and name from finance type

finance_staging["finance_type_code"] = finance_staging["Finance Type"].str.extract(r"^(\S+)")
finance_staging["finance_type_name"] = finance_staging["Finance Type"].str.replace(r"^\S+\s*-\s*", "", regex=True)

# set type code to no data as well for better null consistency
finance_staging["finance_type_code"] = finance_staging["finance_type_code"].replace("No", None)


finance_staging.head()

Unnamed: 0,Finance Type,finance_type_code,finance_type_name
0,No data,,No data
69,110 - Standard grant,110.0,Standard grant
6078,311 - Capital subscription on encashment basis,311.0,Capital subscription on encashment basis
8969,421 - Standard loan,421.0,Standard loan
12165,511 - Acquisition of equity not part of joint ...,511.0,Acquisition of equity not part of joint ventur...


In [34]:
# build finance dimension

dim_finance_type = (
    finance_staging[["finance_type_code", "finance_type_name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

dim_finance_type["finance_type_id"] = dim_finance_type.index + 1


dim_finance_type = dim_finance_type[
    ["finance_type_id", "finance_type_code", "finance_type_name"]
]

dim_finance_type.head()

Unnamed: 0,finance_type_id,finance_type_code,finance_type_name
0,1,,No data
1,2,110.0,Standard grant
2,3,311.0,Capital subscription on encashment basis
3,4,421.0,Standard loan
4,5,511.0,Acquisition of equity not part of joint ventur...


In [35]:
world_indicator_data = pd.read_csv("data/world-development-indicators/data2.csv")

world_indicator_data.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1970 [YR1970],1971 [YR1971],1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,United States,USA,"Population, total",SP.POP.TOTL,205052000.0,207661000.0,209896000.0,211909000.0,213854000.0,215973000.0,...,319257560.0,321815121.0,324353340.0,326608609.0,328529577.0,330226227.0,331577720.0,332099760.0,334017321.0,336806231.0
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,22.3881314035655,22.6729890729952,22.9170124118896,23.1367971909474,23.3491575462716,23.5805156917379,...,34.9013776562134,35.1809713558577,35.4584505795077,35.7049975840182,35.9149986553586,36.1004771837305,36.248222996211,36.3052926398919,36.5149212564854,36.8198061311277
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,5234.2966662115,5609.38259952519,6094.01798986165,6726.35895596695,7225.69135952566,7801.45666356443,...,54973.4207515712,56572.9188996063,57638.1018367192,59635.0984397965,62499.8744390068,64746.4506778863,63515.9491807833,70205.050916026,76657.2488844403,81032.262117545
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,70.8073170731707,71.1073170731707,71.1560975609756,71.3560975609756,71.9560975609756,72.6048780487805,...,78.8414634146341,78.690243902439,78.5390243902439,78.5390243902439,78.6390243902439,78.7878048780488,76.9804878048781,76.3292682926829,77.4341463414634,78.3853658536585
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,23.2,22.4,21.5,20.6,19.7,18.8,...,6.8,6.8,6.7,6.6,6.5,6.5,6.5,6.5,6.5,6.5


In [36]:
# long pivot the years

indicator_long = world_indicator_data.melt(
    id_vars=["Country Name", "Country Code", "Series Name", "Series Code"],
    var_name="year",
    value_name="value"
)

indicator_long.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value
0,United States,USA,"Population, total",SP.POP.TOTL,1970 [YR1970],205052000.0
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,1970 [YR1970],22.3881314035655
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,1970 [YR1970],5234.2966662115
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1970 [YR1970],70.8073170731707
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,1970 [YR1970],23.2


In [37]:
# year has weird values so convert

indicator_long["year"] = indicator_long["year"].str.extract(r"(\d{4})").astype(int)

indicator_long.head(1)

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value
0,United States,USA,"Population, total",SP.POP.TOTL,1970,205052000


In [38]:
# look at distinct series

indicator_long["Series Name"].unique()

array(['Population, total',
       'Population density (people per sq. km of land area)',
       'GDP per capita (current US$)',
       'Life expectancy at birth, total (years)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Primary completion rate, total (% of relevant age group)',
       'School enrollment, primary (% net)',
       'Lower secondary completion rate, total (% of relevant age group)',
       'Net ODA received per capita (current US$)', nan], dtype=object)

In [39]:
# make a map to convert the names to readable format

indicator_map = {
    "Population, total": "population",
    "Population density (people per sq. km of land area)": "population_density",
    "GDP per capita (current US$)": "gdp_per_capita",
    "Net ODA received per capita (current US$)": "net_oda_per_capita",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "Mortality rate, under-5 (per 1,000 live births)": "under5_mortality",
    "Primary completion rate, total (% of relevant age group)": "primary_completion_rate",
    "Lower secondary completion rate, total (% of relevant age group)": "lower_secondary_completion_rate",
    "School enrollment, primary (% net)": "school_enrollment_primary"
}


indicator_long["metric"] = indicator_long["Series Name"].map(indicator_map)

indicator_long.head()

Unnamed: 0,Country Name,Country Code,Series Name,Series Code,year,value,metric
0,United States,USA,"Population, total",SP.POP.TOTL,1970,205052000.0,population
1,United States,USA,Population density (people per sq. km of land ...,EN.POP.DNST,1970,22.3881314035655,population_density
2,United States,USA,GDP per capita (current US$),NY.GDP.PCAP.CD,1970,5234.2966662115,gdp_per_capita
3,United States,USA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1970,70.8073170731707,life_expectancy
4,United States,USA,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,1970,23.2,under5_mortality


In [40]:
# theres weird rows where theres no data at all so filter it out (verified using R)

indicator_long = indicator_long[indicator_long["metric"].notna()].copy()
indicator_long = indicator_long[indicator_long["year"] < 2024].copy()

# theres "..." entries as opposed to n/a entries
indicator_long["value"] = indicator_long["value"].replace("..", None)

# convert numeric
indicator_long["value"] = pd.to_numeric(indicator_long["value"], errors="coerce")



In [41]:
# pivot again to convert metrics to seperate columns

indicator_pivoted = indicator_long.pivot_table(
    index=["Country Name", "Country Code", "year"],
    columns="metric",
    values="value"
).reset_index()

indicator_pivoted.head()

metric,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under5_mortality
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6


In [42]:
# verify country names
indicator_pivoted["Country Name"].unique()

array(['Afghanistan', 'Bangladesh', 'Bolivia', 'Cambodia', 'Colombia',
       'Egypt, Arab Rep.', 'Ethiopia', 'Georgia', 'Guatemala', 'Haiti',
       'Honduras', 'Indonesia', 'Jordan', 'Kenya', 'Lebanon',
       'Madagascar', 'Moldova', 'Nepal', 'Nicaragua', 'Nigeria',
       'Pakistan', 'Philippines', 'Rwanda', 'Senegal', 'Sierra Leone',
       'Tanzania', 'Uganda', 'Ukraine', 'United Kingdom', 'United States',
       'Viet Nam', 'Yemen, Rep.'], dtype=object)

In [43]:
# map country names to

country_mapping = {
    "Afghanistan": "Afghanistan",
    "Bangladesh": "Bangladesh",
    "Bolivia": "Bolivia",
    "Cambodia": "Cambodia",
    "Colombia": "Colombia",
    "Egypt, Arab Rep.": "Egypt",
    "Ethiopia": "Ethiopia",
    "Georgia": "Georgia",
    "Guatemala": "Guatemala",
    "Haiti": "Haiti",
    "Honduras": "Honduras",
    "Indonesia": "Indonesia",
    "Jordan": "Jordan",
    "Kenya": "Kenya",
    "Lebanon": "Lebanon",
    "Madagascar": "Madagascar",
    "Moldova": "Moldova",
    "Nepal": "Nepal",
    "Nicaragua": "Nicaragua",
    "Nigeria": "Nigeria",
    "Pakistan": "Pakistan",
    "Philippines": "Philippines",
    "Rwanda": "Rwanda",
    "Senegal": "Senegal",
    "Sierra Leone": "Sierra Leone",
    "Tanzania": "Tanzania",
    "Uganda": "Uganda",
    "Ukraine": "Ukraine",
    "United Kingdom": "United Kingdom",
    "United States": "United States",
    "Viet Nam": "Vietnam",
    "Yemen, Rep.": "Yemen"
}

indicator_pivoted["country_clean"] = indicator_pivoted["Country Name"].map(country_mapping)

indicator_pivoted.head()

metric,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under5_mortality,country_clean
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan


In [44]:
indicator_pivoted["country_clean"].unique()

array(['Afghanistan', 'Bangladesh', 'Bolivia', 'Cambodia', 'Colombia',
       'Egypt', 'Ethiopia', 'Georgia', 'Guatemala', 'Haiti', 'Honduras',
       'Indonesia', 'Jordan', 'Kenya', 'Lebanon', 'Madagascar', 'Moldova',
       'Nepal', 'Nicaragua', 'Nigeria', 'Pakistan', 'Philippines',
       'Rwanda', 'Senegal', 'Sierra Leone', 'Tanzania', 'Uganda',
       'Ukraine', 'United Kingdom', 'United States', 'Vietnam', 'Yemen'],
      dtype=object)

In [45]:
# add quarter for joining
indicator_pivoted["quarter"] = "Q1"

In [46]:
fact_indicator = indicator_pivoted.merge(
    dim_country[["country_id", "country"]],
    left_on="country_clean",
    right_on="country",
    how="left"
)

fact_indicator.head()

Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under5_mortality,country_clean,quarter,country_id,country
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan,Q1,1,Afghanistan
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan,Q1,1,Afghanistan
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan,Q1,1,Afghanistan
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan,Q1,1,Afghanistan
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan,Q1,1,Afghanistan


In [47]:
fact_indicator = fact_indicator.merge(
    dim_time,
    on=["year", "quarter"],
    how="left"
)
fact_indicator.head()


Unnamed: 0,Country Name,Country Code,year,gdp_per_capita,life_expectancy,lower_secondary_completion_rate,net_oda_per_capita,population,population_density,primary_completion_rate,school_enrollment_primary,under5_mortality,country_clean,quarter,country_id,country,time_id
0,Afghanistan,AFG,1970,,37.46,,2.445499,11290128.0,17.310041,,,301.6,Afghanistan,Q1,1,Afghanistan,1
1,Afghanistan,AFG,1971,,37.932,,3.841743,11567667.0,17.735564,,,296.5,Afghanistan,Q1,1,Afghanistan,5
2,Afghanistan,AFG,1972,,38.423,,4.655088,11853696.0,18.174104,,,291.4,Afghanistan,Q1,1,Afghanistan,9
3,Afghanistan,AFG,1973,,38.951,9.78837,4.582991,12157999.0,18.640662,,,285.9,Afghanistan,Q1,1,Afghanistan,13
4,Afghanistan,AFG,1974,,39.469,8.5725,3.922488,12469127.0,19.117684,17.058519,26.82221,280.6,Afghanistan,Q1,1,Afghanistan,17


In [48]:
# clean table, drop unnecessary colss and reorder
fact_indicator = fact_indicator[
    [
        "country_id",
        "time_id",
        "population",
        "population_density",
        "gdp_per_capita",
        "net_oda_per_capita",
        "life_expectancy",
        "under5_mortality",
        "primary_completion_rate",
        "lower_secondary_completion_rate",
        "school_enrollment_primary"
    ]
]

fact_indicator.head()


Unnamed: 0,country_id,time_id,population,population_density,gdp_per_capita,net_oda_per_capita,life_expectancy,under5_mortality,primary_completion_rate,lower_secondary_completion_rate,school_enrollment_primary
0,1,1,11290128.0,17.310041,,2.445499,37.46,301.6,,,
1,1,5,11567667.0,17.735564,,3.841743,37.932,296.5,,,
2,1,9,11853696.0,18.174104,,4.655088,38.423,291.4,,,
3,1,13,12157999.0,18.640662,,4.582991,38.951,285.9,,9.78837,
4,1,17,12469127.0,19.117684,,3.922488,39.469,280.6,17.058519,8.5725,26.82221


In [49]:
# check columns
iata_data_filtered.head()

Unnamed: 0,IATI Identifier,Title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Humanitarian,Calendar Year,Calendar Quarter,Calendar Year and Quarter,URL,Value (USD),Value (EUR),Value (Local currrency),country,iso_alpha2
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0,Afghanistan,AF
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0,Afghanistan,AF
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0,Afghanistan,AF
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0,Afghanistan,AF
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0,Afghanistan,AF


In [50]:
fact_iata = iata_data_filtered.rename(columns={
    "IATI Identifier": "iati_id",
    "Title": "aid_title",
    "Value (USD)": "value_usd",
    "Value (EUR)": "value_euro",
    "Value (Local currrency)": "value_local",
    "Humanitarian": "humanitarian",
    "Calendar Year": "year",
    "Calendar Quarter": "quarter",
})

fact_iata.head()


Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,humanitarian,year,quarter,Calendar Year and Quarter,URL,value_usd,value_euro,value_local,country,iso_alpha2
36,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0,Afghanistan,AF
37,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0,Afghanistan,AF
38,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0,Afghanistan,AF
39,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0,Afghanistan,AF
40,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,0,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0,Afghanistan,AF


In [51]:
# add country id to fact table by merging
fact_iata = fact_iata.merge(
    dim_country[["country_id", "country"]],
    on="country",
    how="left"
)

In [52]:
fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,year,quarter,Calendar Year and Quarter,URL,value_usd,value_euro,value_local,country,iso_alpha2,country_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2016,Q1,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0,Afghanistan,AF,1
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2018,Q1,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0,Afghanistan,AF,1
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2021,Q2,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0,Afghanistan,AF,1
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2015,Q3,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0,Afghanistan,AF,1
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2017,Q1,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0,Afghanistan,AF,1


In [53]:
# add time id to fact table by merging
fact_iata = fact_iata.merge(
    dim_time,
    left_on=["year", "quarter"],
    right_on=["year", "quarter"],
    how="left"
)

In [54]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["Reporting Organisation", "Reporting Organisation Type"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "reporting_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [55]:
fact_iata.head()

Unnamed: 0,iati_id,aid_title,Reporting Organisation Group,Reporting Organisation,Reporting Organisation Type,Aid Type,Finance Type,Flow Type,Provider Organisation,Provider Organisation Type,...,Calendar Year and Quarter,URL,value_usd,value_euro,value_local,country,iso_alpha2,country_id,time_id,reporting_org_id
0,41AAA-11295-001,Procurement and Supply Support of Anti-Tubercu...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2016 Q1,https://d-portal.org/q.html?aid=41AAA-11295-001,2986309.0,2743005.0,203218300.0,Afghanistan,AF,1,185.0,
1,41AAA-11295-014,Diagnostics Procurement Support to Stop Tuberc...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2018 Q1,https://d-portal.org/q.html?aid=41AAA-11295-014,2895425.0,2414262.0,201212600.0,Afghanistan,AF,1,193.0,
2,41AAA-11295-032,Support for the Stop Tuberculosis Partnership ...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2021 Q2,https://d-portal.org/q.html?aid=41AAA-11295-032,514886.0,422003.1,39983470.0,Afghanistan,AF,1,206.0,
3,41AAA-11960-007,Support to the Global Environment Facility (GE...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2015 Q3,https://d-portal.org/q.html?aid=41AAA-11960-007,1668446.0,1491148.0,101308000.0,Afghanistan,AF,1,183.0,
4,41AAA-20431-001,Project to Support Improvement of Agricultural...,UN - United Nations,United Nations Office for Project Services (UN...,40 - Multilateral,No data,No data,No data,United Nations Office for Project Services (UN...,No data,...,2017 Q1,https://d-portal.org/q.html?aid=41AAA-20431-001,9475434.0,8941619.0,637222900.0,Afghanistan,AF,1,189.0,


In [56]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["Provider Organisation", "Provider Organisation Type"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "provider_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [57]:
fact_iata = fact_iata.merge(
    dim_organization[[
        "organization_id",
        "organization_name",
        "organization_type_name"
    ]],
    left_on=["Receiver Organisation", "Receiver Organisation Type"],
    right_on=["organization_name", "organization_type_name"],
    how="left"
).rename(columns={"organization_id": "receiving_org_id"})

fact_iata = fact_iata.drop(columns=["organization_name", "organization_type_name"])


In [64]:
dim_organization.head()

Unnamed: 0,organization_id,organization_name_clean,organization_iati_id,organization_type_code,organization_type_name,organization_name
0,1,UN-Habitat,41120,40,Multilateral,UN-Habitat [41120]
1,2,United Nations Office for Project Services (UN...,41AAA,40,Multilateral,United Nations Office for Project Services (UN...
2,3,"The Global Fund to Fight AIDS, Tuberculosis an...",47045,30,Public Private Partnership,"The Global Fund to Fight AIDS, Tuberculosis an..."
3,4,"Gavi, the vaccine alliance",47122,40,Multilateral,"Gavi, the vaccine alliance [47122]"
4,5,Swedish Committee for Afghanistan [AF-MOE-118],AF-MOE-118,21,International NGO,Swedish Committee for Afghanistan [AF-MOE-118]
