# TB Data Acquisition

Fetch and load TB data from WHO GHO API (2018-2023)


In [72]:
# Configuration
import sys
import os
from pathlib import Path


project_root = next((p for p in [Path.cwd()] + list(Path.cwd().parents) 
                     if (p / 'notebooks' / '01_data_acquisition.ipynb').exists()),
                    Path.cwd())
os.chdir(project_root)

# API Configuration
WHO_GHO_API_BASE = "https://ghoapi.azureedge.net/api"
WHO_INDICATORS = {
    "tb_incidence": "TB_e_inc_tbhiv_num",
    "tb_mortality": "TB_e_mort_exc_tbhiv_num",
}

# Data parameters
START_YEAR = 2018
END_YEAR = 2023
YEARS = list(range(START_YEAR, END_YEAR + 1))

# Paths
DATABASE_PATH = str(project_root / "data" / "database" / "tb_data.db")
RAW_DATA_DIR = project_root / "data" / "raw"

print(f"✓ Project: {project_root}")
print(f"✓ Database: {DATABASE_PATH}")


✓ Project: /Users/joshua/datascienceproject
✓ Database: /Users/joshua/datascienceproject/data/database/tb_data.db


## Database Functions


In [73]:
import sqlite3

def get_db_connection(db_path):
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    return conn

def create_schema(db_path):
    conn = get_db_connection(db_path)
    cursor = conn.cursor()
    
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS country_metadata (
            country_code TEXT PRIMARY KEY,
            region_code TEXT,
            region_name TEXT,
            UNIQUE(country_code)
        )
    """)
    
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS tb_data (
            id INTEGER PRIMARY KEY,
            country_code TEXT NOT NULL,
            year INTEGER NOT NULL,
            tb_incidence_num REAL,
            tb_incidence_low REAL,
            tb_incidence_high REAL,
            tb_mortality_num REAL,
            tb_mortality_low REAL,
            tb_mortality_high REAL,
            tb_incidence_rate REAL,
            FOREIGN KEY (country_code) REFERENCES country_metadata(country_code),
            UNIQUE(country_code, year)
        )
    """)
    
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_tb_country_year ON tb_data(country_code, year)")
    
    conn.commit()
    conn.close()
    print(f"Database schema created successfully at {db_path}")

def create_unified_view(db_path: str):
    conn = get_db_connection(db_path)
    cursor = conn.cursor()
    
    cursor.execute("DROP VIEW IF EXISTS unified_tb_data")
    cursor.execute("""
        CREATE VIEW unified_tb_data AS
        SELECT 
            cm.country_code, cm.region_code, cm.region_name,
            tb.year, tb.tb_incidence_num, tb.tb_incidence_low, tb.tb_incidence_high,
            tb.tb_mortality_num, tb.tb_mortality_low, tb.tb_mortality_high, tb.tb_incidence_rate
        FROM country_metadata cm
        LEFT JOIN tb_data tb ON cm.country_code = tb.country_code
        ORDER BY cm.country_code, tb.year
    """)
    
    conn.commit()
    conn.close()
    print("Unified view created successfully")


## API Fetching Functions


In [74]:
# API fetching functions
import requests
import pandas as pd
import time

def fetch(indicator: str, years=None):
    if years is None:
        years = YEARS
        
    url = f"{WHO_GHO_API_BASE}/{indicator}"
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        data = response.json()
        
        if 'value' not in data:
            return pd.DataFrame()
        
        df = pd.DataFrame(data['value'])
        if df.empty:
            return df
        
        if years:
            df = df[df['TimeDim'].isin(years)].copy()
        
        df.rename(columns={
            'SpatialDim': 'country_code', 'TimeDim': 'year', 'NumericValue': 'value',
            'Low': 'value_low', 'High': 'value_high',
            'ParentLocation': 'region_name', 'ParentLocationCode': 'region_code'
        }, inplace=True)
        
        df = df[df.get('SpatialDimType', '') == 'COUNTRY'].copy()
        df = df.dropna(subset=['country_code', 'year'])
        df = df[df['value'] > 0].copy()
        return df
    except Exception as e:
        print(f"Error fetching {indicator}: {e}")
        return pd.DataFrame()


## Database Loading Functions


In [75]:
# Database loading functions
def _update_metadata(conn, df):
    if 'region_code' not in df.columns or 'region_name' not in df.columns:
        return
    metadata = df[['country_code', 'region_code', 'region_name']].drop_duplicates()
    for _, row in metadata.iterrows():
        conn.execute("""
            INSERT OR REPLACE INTO country_metadata (country_code, region_code, region_name)
            VALUES (?, ?, ?)
        """, (row['country_code'], row.get('region_code'), row.get('region_name')))

def _upsert_tb_data(conn, df, field):
    """Upsert TB data (incidence or mortality)."""
    for _, row in df.iterrows():
        cursor = conn.cursor()
        cursor.execute("SELECT id FROM tb_data WHERE country_code = ? AND year = ?",
                      (row['country_code'], int(row['year'])))
        if cursor.fetchone():
            conn.execute(f"""
                UPDATE tb_data SET {field}_num = ?, {field}_low = ?, {field}_high = ?
                WHERE country_code = ? AND year = ?
            """, (row.get('value'), row.get('value_low'), row.get('value_high'),
                  row['country_code'], int(row['year'])))
        else:
            conn.execute(f"""
                INSERT INTO tb_data (country_code, year, {field}_num, {field}_low, {field}_high)
                VALUES (?, ?, ?, ?, ?)
            """, (row['country_code'], int(row['year']), row.get('value'),
                  row.get('value_low'), row.get('value_high')))

def load_tb_incidence(df, db_path):
    if df.empty:
        return
    conn = get_db_connection(db_path)
    _update_metadata(conn, df)
    _upsert_tb_data(conn, df, 'tb_incidence')
    conn.commit()
    conn.close()
    print(f"Loaded {len(df)} TB incidence records")

def load_tb_mortality(df, db_path):
    if df.empty:
        return
    conn = get_db_connection(db_path)
    _update_metadata(conn, df)
    _upsert_tb_data(conn, df, 'tb_mortality')
    conn.commit()
    conn.close()
    print(f"Loaded {len(df)} TB mortality records")


## Data Acquisition Orchestration


In [76]:
# Main orchestration functions
def save_csv(df, filename):
    """Save DataFrame to CSV."""
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
    df.to_csv(RAW_DATA_DIR / filename, index=False)

def acquire_all():
    datasets = {}
    
    # WHO data
    for key, indicator in [('tb_incidence', WHO_INDICATORS['tb_incidence']),
                          ('tb_mortality', WHO_INDICATORS['tb_mortality'])]:
        df = fetch(indicator, YEARS)
        if not df.empty:
            datasets[key] = df
            save_csv(df, f'{key}.csv')
            time.sleep(1)
    
    return datasets

def load_all(datasets, db_path):
    """Load all datasets into database."""
    loaders = {
        'tb_incidence': load_tb_incidence,
        'tb_mortality': load_tb_mortality
    }
    
    for key, loader in loaders.items():
        if key in datasets:
            loader(datasets[key], db_path)


## Workflow: Create Database Schema


In [77]:
# Create database schema
create_schema(DATABASE_PATH)


Database schema created successfully at /Users/joshua/datascienceproject/data/database/tb_data.db


## Workflow: Fetch Data from APIs


In [78]:
# Fetch all data from APIs
datasets = acquire_all()
print(f"Fetched {len(datasets)} datasets")


Fetched 2 datasets


## Workflow: Load Data into Database


In [79]:
# Load into database
load_all(datasets, DATABASE_PATH)
create_unified_view(DATABASE_PATH)
print("Data acquisition complete")

Loaded 1002 TB incidence records
Loaded 1062 TB mortality records
Unified view created successfully
Data acquisition complete


## Workflow: Verify Data


In [80]:
# Verify data
import pandas as pd

conn = get_db_connection(DATABASE_PATH)
summary = pd.read_sql("""
    SELECT *
    FROM unified_tb_data
    ORDER BY year
""", conn)
conn.close()

print("Data Coverage by Year:")
print(summary)

Data Coverage by Year:
     country_code region_code      region_name  year  tb_incidence_num  \
0             ZWE         AFR           Africa  2018           20000.0   
1             SGP         WPR  Western Pacific  2018              35.0   
2             JAM         AMR         Americas  2018              17.0   
3             UZB         EUR           Europe  2018            1000.0   
4             MUS         AFR           Africa  2018              44.0   
...           ...         ...              ...   ...               ...   
1057          TLS        SEAR  South-East Asia  2023              57.0   
1058          MWI         AFR           Africa  2023           12000.0   
1059          ECU         AMR         Americas  2023            1500.0   
1060          IDN         WPR  Western Pacific  2023           25000.0   
1061          AUS         WPR  Western Pacific  2023              30.0   

      tb_incidence_low  tb_incidence_high  tb_mortality_num  tb_mortality_low  \
0      