## WHO GHO indicators

In [13]:
# import libraries (installed in epi-core environment)
import pandas as pd
import requests
import sqlalchemy
from sqlalchemy import create_engine, text, URL
import json
from pathlib import Path
from dotenv import load_dotenv
import os

In [14]:
# check for .env
# Loads the database credentials from ../env/.env into the process
# environment and exposes them as PG_* module-level names.

env_path = Path.cwd().parent / "env" / ".env"  # up one folder

# Raise instead of calling sys.exit(): `sys` is not imported by this notebook,
# so the exit call would itself fail with a NameError and hide the real
# problem. An exception also stops a "Run All" with a readable traceback.
if not env_path.is_file():
    raise FileNotFoundError(
        f"Missing {env_path} file. Please create one with your DB credentials."
    )

load_dotenv(dotenv_path=env_path)

PG_USER = os.getenv("PGUSER")
PG_PASSWORD = os.getenv("PGPASSWORD")
PG_HOST = os.getenv("PGHOST")
PG_PORT = os.getenv("PGPORT")
PG_DATABASE = os.getenv("PGDATABASE")

# Validate: collect the names of all variables that are not set at all.
missing_vars = [var for var, val in {
    "PG_USER": PG_USER,
    "PG_PASSWORD": PG_PASSWORD,
    "PG_HOST": PG_HOST,
    "PG_PORT": PG_PORT,
    "PG_DATABASE": PG_DATABASE
}.items() if val is None]

# Reported as a warning only: the configuration cell supplies fallbacks for
# host and port, so an unset variable is not necessarily fatal.
if missing_vars:
    print(f"Missing required environment variables: {', '.join(missing_vars)} "
             f"\nMake sure they are defined in {env_path}")

In [15]:
# configuration

# WHO GHO OData endpoint and the indicator codes requested from it
BASE_URL = "https://ghoapi.azureedge.net/api"
INDICATORS = [
    "WHOSIS_000001",
    "WHOSIS_000002",
    "MDG_0000000026",
    "u5mr",
    "MDG_0000000020",
]

# SQLAlchemy connection URL built from the PG* environment variables.
# Host and port fall back to localhost:5432 when the variables are unset;
# username, password and database are passed through as-is (possibly None).
pg_url = URL.create(
    drivername="postgresql+psycopg",
    username=os.getenv("PGUSER"),
    password=os.getenv("PGPASSWORD"),
    host=os.getenv("PGHOST", "localhost"),
    port=int(os.getenv("PGPORT", 5432)),
    database=os.getenv("PGDATABASE"),
)

In [17]:
# connect to db
# pool_pre_ping makes the pool test each connection before handing it out.
engine = create_engine(pg_url, pool_pre_ping=True)

# WARNING: destructive. This drops every table in the `public` schema so the
# run starts from a clean database. The block commits when it exits normally
# and rolls back if the statement raises.
with engine.begin() as conn:
    conn.execute(text("""
    DO $$ DECLARE r RECORD;
    BEGIN
        FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = 'public') LOOP
            EXECUTE 'DROP TABLE IF EXISTS public.' || quote_ident(r.tablename) || ' CASCADE';
        END LOOP;
    END $$;
    """))

In [20]:
# create schema by running sql file

def run_sql_file(engine, path):
    """Run the contents of a SQL script inside a single transaction.

    Args:
        engine: Object exposing ``begin()`` as a context manager that yields a
            connection with ``exec_driver_sql`` (a SQLAlchemy Engine here).
        path: Location of the UTF-8 encoded SQL file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQL file not found: {path}")

    with open(path, encoding="utf-8") as sql_file:
        script = sql_file.read()

    # The whole script is handed to the driver in one call, without any
    # statement splitting on our side. The transaction opened by begin()
    # commits when the block exits cleanly.
    with engine.begin() as connection:
        connection.exec_driver_sql(script)


try:
    # The schema script lives in scripts/; step up one level when the working
    # directory is the notebooks/ folder.
    if os.path.basename(os.getcwd()) == "notebooks":
        schema_path = os.path.join("..", "scripts", "schema.sql")
    else:
        schema_path = "scripts/schema.sql"
    run_sql_file(engine, schema_path)
    print("Schema created/verified.")
except Exception as err:
    # Report the failure, then re-raise so execution stops here.
    print("Failed to apply SQL files:", err)
    raise

Schema created/verified.


In [21]:
# extract
# Download every indicator in INDICATORS from the GHO endpoint and flatten the
# rows of each response's "value" list into one DataFrame (`df`).

REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

# Fixed column list so `df` has the expected columns even when no rows were
# fetched (otherwise the transform step fails with a KeyError).
RECORD_COLUMNS = [
    "indicator_id",
    "indicator_name",
    "country_code",
    "country_name",
    "region",
    "year",
    "value",
]

all_records = []

for ind in INDICATORS:
    print(f"Fetching {ind}...")
    url = f"{BASE_URL}/{ind}"
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        # Best effort: report the failure and move on to the next indicator.
        print(f"Failed to fetch {ind}: {resp.status_code}")
        continue
    data = resp.json().get("value", [])
    for row in data:
        time_dim = row.get("TimeDim")
        numeric_value = row.get("NumericValue")
        all_records.append({
            "indicator_id": ind,
            "indicator_name": row.get("IndicatorName"),
            "country_code": row.get("SpatialDim"),
            # SpatialDimType is only a stand-in here; the transform step
            # overwrites country_name.
            "country_name": row.get("SpatialDimType"),
            "region": row.get("ParentLocation"),
            # Test for "missing" explicitly rather than by truthiness, so a
            # genuine 0 is kept instead of being stored as None.
            "year": int(time_dim) if time_dim not in (None, "") else None,
            "value": float(numeric_value) if numeric_value not in (None, "") else None,
        })

df = pd.DataFrame(all_records, columns=RECORD_COLUMNS)

Fetching WHOSIS_000001...
Fetching WHOSIS_000002...
Fetching MDG_0000000026...
Fetching u5mr...
Fetching MDG_0000000020...


In [22]:
# transform

# country_name was filled from SpatialDimType during extract, which (per the
# original note) is often just 'COUNTRY' rather than a real name. Proper names
# would have to come from the WHO "Dimension" endpoint; until then the country
# code doubles as the name.
df["country_name"] = df["country_code"]

# Dimension tables: one row per distinct combination of the listed columns,
# keeping the first occurrence.
dim_indicator = df.drop_duplicates(subset=["indicator_id", "indicator_name"])[
    ["indicator_id", "indicator_name"]
]
dim_country = df.drop_duplicates(subset=["country_code", "country_name", "region"])[
    ["country_code", "country_name", "region"]
]

# Fact table: every fetched observation, restricted to the measure columns.
fact_measure = df.loc[:, ["indicator_id", "country_code", "year", "value"]]


In [23]:
# load
# Append the three frames to their tables inside ONE transaction. Passing the
# engine to each to_sql call would commit every table separately, so a failure
# on a later table would leave the database half-loaded. With engine.begin()
# either all three inserts are committed or none are.
# Dimension tables go first; presumably fact_measure references them, which
# should be confirmed against schema.sql.
with engine.begin() as conn:
    dim_indicator.to_sql('dim_indicator', conn, if_exists='append', index=False)
    dim_country.to_sql('dim_country', conn, if_exists='append', index=False)
    fact_measure.to_sql('fact_measure', conn, if_exists='append', index=False)

print("ETL complete. Data loaded into epidemiology database.")
print(f"Indicators loaded: {len(dim_indicator)} | Countries: {len(dim_country)} | Records: {len(fact_measure)}")


ETL complete. Data loaded into epidemiology database.
Indicators loaded: 5 | Countries: 209 | Records: 53601
