# World Bank Data Acquisition

This notebook acquires economic and demographic data from the World Bank API for UN member countries and stores it in a SQLite database.

## Step 1: Import Required Libraries


In [10]:
import requests
import sqlite3
import pathlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, Tuple
import pycountry
import os
import pathlib

## Step 2: Define Core Functions


## Step 3: Define UN Member Countries and Utility Functions


In [11]:
UN_MEMBERS = {
    'AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'AUS', 'AUT',
    'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BTN',
    'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM',
    'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COD',
    'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM',
    'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FJI', 'FIN',
    'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN',
    'GNB', 'GUY', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ',
    'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK',
    'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE',
    'LTU', 'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MRT',
    'MUS', 'MEX', 'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR',
    'NAM', 'NRU', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'MKD', 'NOR',
    'OMN', 'PAK', 'PLW', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'POL', 'PRT',
    'QAT', 'ROU', 'RUS', 'RWA', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP',
    'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SVK', 'SVN', 'SLB', 'SOM',
    'ZAF', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE', 'SYR', 'TJK',
    'TZA', 'THA', 'TLS', 'TGO', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TUV',
    'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VAT', 'VEN',
    'VNM', 'YEM', 'ZMB', 'ZWE'
}

## Step 5: Set Up Project Paths and Database Location


In [12]:
project_root = next((p for p in [pathlib.Path.cwd()] + list(pathlib.Path.cwd().parents) 
                     if (p / 'notebooks' / 'WorldBank_acquisition.ipynb').exists()),
                    pathlib.Path.cwd())
os.chdir(project_root)

DATABASE_PATH = str(project_root / "data" / "database" / "tb_data.db")
RAW_DATA_DIR = project_root / "data" / "raw"

print(f"✓ Project: {project_root}")
print(f"✓ Database: {DATABASE_PATH}")

✓ Project: /Users/joshua/datascienceproject/notebooks
✓ Database: /Users/joshua/datascienceproject/notebooks/data/database/tb_data.db


## Step 7: Initialize Database

In [13]:
db_path = pathlib.Path(DATABASE_PATH)
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

cursor.execute("DROP TABLE IF EXISTS country_data")

cursor.execute("""
    CREATE TABLE IF NOT EXISTS country_data (
        id INTEGER PRIMARY KEY AUTOINCREMENT, 
        country_code TEXT, 
        gdp_value_US REAL,
        poverty_rate REAL,
        population REAL,
        FOREIGN KEY (country_code) REFERENCES country_metadata(country_code)
    )
""")
conn.commit()

## Step 8: Fetch and Insert Data

**This is the main data acquisition step.** 


In [14]:
un_member_codes = list(UN_MEMBERS)
results = []
max_workers = 30

def process_country(country):
    if country not in UN_MEMBERS:
        return None
    
    print(f'Processing: {country}')
    
    def fetch_indicator(indicator):
        try:
            url = f'http://api.worldbank.org/v2/country/{country}/indicator/{indicator}?format=json&date=2020:2024&per_page=10'
            response = requests.get(url, timeout=30)
            json_data = response.json()
            
            if len(json_data) > 1 and json_data[1]:
                total = sum(item['value'] if item['value'] is not None else 0 for item in json_data[1])
                return total / len(json_data[1]) if json_data[1] else None
            return None
        except:
            return None
    
    avg_GDP = fetch_indicator("NY.GDP.MKTP.CD")
    avg_pov = fetch_indicator("SI.POV.DDAY")
    avg_pop = fetch_indicator("SP.POP.TOTL")
    
    return (country, avg_GDP, avg_pov, avg_pop)

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_country, country): country for country in un_member_codes}

    for future in as_completed(futures):
        try:
            result = future.result()
            if result is not None:
                results.append(result)
        except Exception as e:
            country = futures[future]
            print(f'Error processing {country}: {e}')

if results:
    print(f'Full insertion beginning. {len(results)} records to insert.')
    cursor.executemany("INSERT INTO country_data(country_code, gdp_value_US, poverty_rate, population) VALUES (?, ?, ?, ?)", results)
    conn.commit()
    print('Completed Insertion.')
else:
    print('No data to insert.')

Processing: CHNProcessing: MDA

Processing: VNM
Processing: LBN
Processing: PLW
Processing: CUB
Processing: SRB
Processing: WSM
Processing: SOM
Processing: ARE
Processing: SEN
Processing: CPV
Processing: THA
Processing: CHE
Processing: GRD
Processing: LBY
Processing: SUR
Processing: ITA
Processing: ETH
Processing: PRY
Processing: MUS
Processing: TKM
Processing: JAM
Processing: MMR
Processing: MRT
Processing: GMB
Processing: MHL
Processing: LVA
Processing: COD
Processing: CAN
Processing: BDI
Processing: BLZ
Processing: COG
Processing: MDV
Processing: ARG
Processing: IRL
Processing: PNG
Processing: VCT
Processing: ARM
Processing: COM
Processing: MEX
Processing: TUN
Processing: BRA
Processing: SWE
Processing: ZWE
Processing: ROU
Processing: BTN
Processing: DZA
Processing: SLV
Processing: RUS
Processing: BWA
Processing: EGY
Processing: KHM
Processing: DOM
Processing: GIN
Processing: BHR
Processing: FRA
Processing: DMA
Processing: NPL
Processing: HND
Processing: NRU
Processing: SAU
Processi

## Step 9: Verify Data Insertion

In [15]:
cursor.execute("SELECT * FROM country_data")
country_data = cursor.fetchall()
print(f"Total records: {len(country_data)}")
print("\nFirst 10 records:")
for i, row in enumerate(country_data[:10]):
    print(row)

Total records: 194

First 10 records:
(1, 'MRT', 9691939327.04048, 0.0, 4880495.6)
(2, 'ITA', 2173627237130.138, 0.6799999999999999, 59113037.8)
(3, 'LBN', 19183022423.27632, 1.1800000000000002, 5748892.8)
(4, 'SOM', 10278438115.999893, 0.0, 17818457.0)
(5, 'PLW', 206436780.0, 0.0, 17751.2)
(6, 'SUR', 3596149599.4521523, 0.44000000000000006, 623338.8)
(7, 'CAN', 2056613491918.482, 0.08, 39315303.8)
(8, 'JAM', 16984282477.242239, 0.27999999999999997, 2837305.2)
(9, 'CHE', 843361336728.892, 0.06000000000000001, 8808399.2)
(10, 'VNM', 407356326732.5698, 0.5800000000000001, 99606964.4)
