# Loading WHO data with dlt

In [1]:
import dlt

/bin/bash: line 1: export: `./parquet_files': not a valid identifier


In [2]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

# NOTE(review): `maximum_page` is not read anywhere in the visible cells;
# fetch_who_data passes its own literal page cap to the paginator — confirm
# whether this constant is still needed.
maximum_page = 3000

# Root URL that every endpoint path below is appended to.
BASE_WHO_URL = "https://ghoapi.azureedge.net/api/"

# Maps the resource name used in the pipeline to the API path requested for it.
ENDPOINTS = {
    "dimensions": "Dimension",
    "indicators": "Indicator",
    "hospital_beds": "WHS6_102",
    "hospitals_per_100k": "DEVICES00",
}

# Define the API resource for WHO data
def fetch_who_data(endpoint, params=None, max_pages=1):
    """Yield pages of data from a single WHO API endpoint.

    Args:
        endpoint: Path requested relative to ``BASE_WHO_URL``
            (for example ``"Dimension"``).
        params: Optional dict of query parameters sent with the request.
            ``None`` (the default) means no extra parameters.
        max_pages: Value passed to the paginator as ``maximum_page``.
            Defaults to 1, the cap that was previously hard-coded here.

    Yields:
        Each page produced by ``RESTClient.paginate`` for the endpoint.
    """
    # Work on a private copy: a mutable default would be shared between calls,
    # and the client/paginator may write paging keys into the dict it is given.
    query_params = dict(params) if params else {}

    client = RESTClient(
        base_url=BASE_WHO_URL,
        paginator=PageNumberPaginator(
            base_page=0,
            total_path=None,
            maximum_page=max_pages,
        ),
    )

    # Yield page by page instead of collecting everything in memory first.
    for page in client.paginate(endpoint, params=query_params):
        yield page

@dlt.source
def who_source():
    """Yield one dlt resource for every entry in ``ENDPOINTS``.

    Each resource is named after its ``ENDPOINTS`` key, reads from the API
    path stored under that key via ``fetch_who_data``, and is declared with
    the ``"replace"`` write disposition.
    """
    for resource_name, api_path in ENDPOINTS.items():
        # A new params dict is built for each resource rather than shared.
        query = {"size": 100}
        page_stream = fetch_who_data(api_path, query)
        yield dlt.resource(
            page_stream,
            name=resource_name,
            write_disposition="replace",
        )

# Create the pipeline: DuckDB destination, data stored under the "who_data" dataset.
pipeline = dlt.pipeline(
    pipeline_name="who_source",
    destination="duckdb",
    dataset_name="who_data",
)

# Run the WHO source through the pipeline, then keep the normalize-step info
# recorded in the pipeline's last trace.
load_info = pipeline.run(who_source())
row_counts = pipeline.last_trace.last_normalize_info

In [3]:
# Connect to the database using the native `duckdb` client and look at which tables were generated:

import duckdb

# A database file '<pipeline_name>.duckdb' was created in the working directory, so just connect to it

# Open the DuckDB file named after the pipeline
database_file = f"{pipeline.pipeline_name}.duckdb"
conn = duckdb.connect(database_file)

# Point the search path at the pipeline's dataset so its tables can be
# queried without a schema prefix
conn.sql(f"SET search_path = '{pipeline.dataset_name}'")

# Show the DESCRIBE result as a DataFrame
conn.sql("DESCRIBE").df()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,who_source,who_data,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
1,who_source,who_data,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
2,who_source,who_data,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False
3,who_source,who_data,dimensions,"[value, _dlt_load_id, _dlt_id, code, title]","[BLOB, VARCHAR, VARCHAR, VARCHAR, VARCHAR]",False
4,who_source,who_data,hospital_beds,"[value, _dlt_load_id, _dlt_id, id, indicator_c...","[BLOB, VARCHAR, VARCHAR, BIGINT, VARCHAR, VARC...",False
5,who_source,who_data,hospitals_per_100k,"[value, _dlt_load_id, _dlt_id, id, indicator_c...","[BLOB, VARCHAR, VARCHAR, BIGINT, VARCHAR, VARC...",False
6,who_source,who_data,indicators,"[value, _dlt_load_id, _dlt_id, indicator_code,...","[BLOB, VARCHAR, VARCHAR, VARCHAR, VARCHAR, VAR...",False
7,who_source,who_data_20250322050724,_dlt_loads,"[load_id, schema_name, status, inserted_at, sc...","[VARCHAR, VARCHAR, BIGINT, TIMESTAMP WITH TIME...",False
8,who_source,who_data_20250322050724,_dlt_pipeline_state,"[version, engine_version, pipeline_name, state...","[BIGINT, BIGINT, VARCHAR, VARCHAR, TIMESTAMP W...",False
9,who_source,who_data_20250322050724,_dlt_version,"[version, engine_version, inserted_at, schema_...","[BIGINT, BIGINT, TIMESTAMP WITH TIME ZONE, VAR...",False


In [4]:
# Count the rows in `dimensions` for each distinct `code`.
conn.sql("select count(*), code from dimensions group by code")

┌──────────────┬────────────────────────┐
│ count_star() │          code          │
│    int64     │        varchar         │
├──────────────┼────────────────────────┤
│            1 │ EMFFREQUENCY           │
│            1 │ GHECAUSELEVEL2         │
│            1 │ MNFSUMMARY             │
│            1 │ NATIONALSYSTEMTYPE     │
│            1 │ CONSUMPTIONTYPE        │
│            1 │ DHSMICSGEOREGION       │
│            1 │ DONORTYPE              │
│            1 │ EMFEXPOSED             │
│            1 │ ENVCAUSE               │
│            1 │ FOODBORNE_HAZARD3      │
│            · │         ·              │
│            · │         ·              │
│            · │         ·              │
│            1 │ HOUSEHOLD_COMP_BY_AGE  │
│            1 │ POLICYSECTOR           │
│            1 │ PRICEMEASURETYPE       │
│            1 │ RCREGION               │
│            1 │ RSUDMONITORING         │
│            1 │ RSUDSPECIFICPOPULATION │
│            1 │ SUNBED_INFO      

In [None]:
# NOTE(review): this cell was saved as a bare `conn.sql()`, which passes no
# query and cannot run. The query below is reconstructed from the saved output
# (columns `id` and `count_star()`, 0 rows) as a duplicate-id check; the table
# name is a guess — confirm which table was intended.
conn.sql("select id, count(*) from hospital_beds group by id having count(*) > 1")

┌───────┬──────────────┐
│  id   │ count_star() │
│ int64 │    int64     │
├───────┴──────────────┤
│        0 rows        │
└──────────────────────┘