# World Bank Data Acquisition

This notebook acquires economic and demographic data from the World Bank API for UN member countries and stores it in a SQLite database.

## Step 1: Import Required Libraries


In [None]:
import requests
import sqlite3
import pathlib
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, Tuple
import pycountry
import os
import pathlib

## Step 2: Define Core Functions


In [None]:
def get_wb_country_data(country_code, indicator_code, format='json', year1=None, year2=None, numResults=50):
    url = 'http://api.worldbank.org/v2/country/'+country_code+'/indicator/'+indicator_code+'?'+'format='+format
    if year1 and year2:
        url += '&date='+str(year1)+':'+str(year2)
    if numResults:
        url += '&per_page='+str(numResults)
    response = requests.get(url)
    return response.json()

def get_db_connection(db_path: pathlib.Path) -> sqlite3.Connection:
    db_path = pathlib.Path(db_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    return conn

def create_tables(conn: sqlite3.Connection, schema: list) -> None:
    cursor = conn.cursor()
    for table in schema:
        cursor.execute(table)
    conn.commit()

def insert_data(conn: sqlite3.Connection, data: list) -> None:
    cursor = conn.cursor()
    cursor.executemany(f"INSERT INTO country_data(country_code, gdp_value_US, poverty_rate, population) VALUES (?, ?, ?, ?)", data)
    conn.commit()

def get_country_info(conn: sqlite3.Connection):
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM country_data")
    return cursor.fetchall()

## Step 3: Define UN Member Countries and Utility Functions


In [None]:
UN_MEMBERS = {
    'AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', 'ARM', 'AUS', 'AUT',
    'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BTN',
    'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM',
    'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'COG', 'COD',
    'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM',
    'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FJI', 'FIN',
    'FRA', 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN',
    'GNB', 'GUY', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ',
    'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK',
    'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE',
    'LTU', 'LUX', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MRT',
    'MUS', 'MEX', 'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR',
    'NAM', 'NRU', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'MKD', 'NOR',
    'OMN', 'PAK', 'PLW', 'PAN', 'PNG', 'PRY', 'PER', 'PHL', 'POL', 'PRT',
    'QAT', 'ROU', 'RUS', 'RWA', 'KNA', 'LCA', 'VCT', 'WSM', 'SMR', 'STP',
    'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SVK', 'SVN', 'SLB', 'SOM',
    'ZAF', 'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SWE', 'CHE', 'SYR', 'TJK',
    'TZA', 'THA', 'TLS', 'TGO', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TUV',
    'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'URY', 'UZB', 'VUT', 'VAT', 'VEN',
    'VNM', 'YEM', 'ZMB', 'ZWE'
}

def average_value(json_data: list) -> float:
    sum = 0
    for i in range(len(json_data[1])):
        value = json_data[1][i]['value']
        sum += value if value is not None else 0
    avg = sum/len(json_data[1])
    return avg

## Step 4: Define Data Fetching and Processing Functions


In [None]:
def fetch_country_indicator(country: str, indicator: str):
    try:
        json_data = get_wb_country_data(country, indicator, format='json', year1=2020, year2=2024, numResults=10)
        return average_value(json_data)
    except Exception as e:
        print(f'Error fetching {indicator} for {country}')
        return None

def country_tuple(country, max_workers = 3):
    if country not in UN_MEMBERS:
        print(f'Skipped: {country} (not UN member)')

    print('Processing: ' + country)

    with ThreadPoolExecutor(max_workers) as executor:
        future_gdp = executor.submit(fetch_country_indicator, country, "NY.GDP.MKTP.CD")
        future_pov = executor.submit(fetch_country_indicator, country, "SI.POV.DDAY")
        future_pop = executor.submit(fetch_country_indicator, country, "SP.POP.TOTL")
            
        avg_GDP = future_gdp.result()
        avg_pov = future_pov.result()
        avg_pop = future_pop.result()    
    
    return (country, avg_GDP, avg_pov, avg_pop)

def bulk_insertion_concurrent(codes: list, conn: sqlite3.Connection, max_workers: int = 10):
    un_member_codes = list(UN_MEMBERS)

    results = []

    with ThreadPoolExecutor(max_workers) as executor:
        future_country = {executor.submit(country_tuple, country): country for country in un_member_codes}

        for future in as_completed(future_country):
            try:
                country = future_country[future]
                result = future.result()
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f'Error processing {country}: {e}')
    
    if results:
        print('Full insertion beginning.')
        insert_data(conn, results)
        print('Completed Insertion.')
    else:
        print('No data to insert.')

## Step 5: Set Up Project Paths and Database Location


In [None]:
project_root = next((p for p in [pathlib.Path.cwd()] + list(pathlib.Path.cwd().parents) 
                     if (p / 'notebooks' / 'WorldBank_acquisition.ipynb').exists()),
                    pathlib.Path.cwd())
os.chdir(project_root)

DATABASE_PATH = str(project_root / "data" / "database" / "tb_data.db")
RAW_DATA_DIR = project_root / "data" / "raw"

print(f"✓ Project: {project_root}")
print(f"✓ Database: {DATABASE_PATH}")

✓ Project: /Users/prathamwankhede/Documents/CS210Project
✓ Database: /Users/prathamwankhede/Documents/CS210Project/data/database/tb_data.db


## Step 6: Define Database Schema

In [None]:
schema = [
    """CREATE TABLE IF NOT EXISTS country_data (
        id INTEGER PRIMARY KEY AUTOINCREMENT, 
        country_code TEXT, 
        gdp_value_US REAL,
        poverty_rate REAL,
        population REAL,
        FOREIGN KEY (country_code) REFERENCES country_metadata(country_code)
        )
    """
]

## Step 7: Initialize Database

In [None]:
conn = get_db_connection(DATABASE_PATH)
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS country_data")
create_tables(conn, schema=schema)
country_codes = list(pycountry.countries)

## Step 8: Fetch and Insert Data

**This is the main data acquisition step.** 


In [39]:
bulk_insertion_concurrent(country_codes, conn)

Processing: BHS
Processing: KIR
Processing: CMR
Processing: GUY
Processing: ZMB
Processing: DJI
Processing: MYS
Processing: GBR
Processing: JOR
Processing: SWZ
Processing: TCD
Processing: SLE
Processing: COM
Processing: VNM
Processing: HUN
Processing: LUX
Processing: KEN
Processing: AND
Processing: KWT
Processing: MHL
Processing: IRL
Processing: SWE
Processing: NLD
Processing: BHR
Processing: MNG
Processing: BFA
Processing: ALB
Processing: GNB
Processing: KOR
Processing: GNQ
Processing: MDV
Processing: OMN
Processing: SAU
Processing: UZB
Processing: MKD
Processing: GHA
Processing: LTU
Processing: IDN
Processing: PAN
Processing: FSM
Processing: ARE
Processing: SYC
Processing: BDI
Processing: QAT
Processing: LCA
Processing: MRT
Processing: ISR
Processing: BGD
Processing: HTI
Processing: MDA
Processing: GRC
Processing: MWI
Processing: AZE
Processing: SSD
Processing: BIH
Processing: STP
Processing: VEN
Processing: COD
Processing: DMA
Processing: TON
Processing: BEL
Processing: URY
Processi

## Step 9: Verify Data Insertion

In [40]:
get_country_info(conn)

[(1, 'JOR', 48640442507.042274, 0.0, 11235987.2),
 (2, 'MYS', 388104813734.0072, 0.0, 34710284.2),
 (3, 'CMR', 46147712052.82514, 5.34, 27651103.6),
 (4, 'KIR', 274534325.7314058, 0.0, 130398.6),
 (5, 'GUY', 13997082123.453236, 0.0, 820408.0),
 (6, 'DJI', 3620660251.630366, 0.0, 1137039.6),
 (7, 'ZMB', 24660339152.282463, 14.34, 20170972.2),
 (8, 'SWZ', 4628709844.658633, 0.0, 1218313.4),
 (9, 'GBR', 3193567997243.018, 0.2, 67810000.0),
 (10, 'BHS', 13480220000.0, 0.0, 398099.4),
 (11, 'COM', 1356061722.4578419, 0.0, 834308.0),
 (12, 'TCD', 17873348098.793243, 7.9, 18625291.0),
 (13, 'SLE', 6990272862.107933, 0.0, 8277300.2),
 (14, 'LUX', 84326114459.20609, 0.06000000000000001, 653546.6),
 (15, 'VNM', 407356326732.5698, 0.5800000000000001, 99606964.4),
 (16, 'HUN', 191243033184.5868, 0.0, 9612185.0),
 (17, 'AND', 3484234594.81185, 0.0, 79648.6),
 (18, 'KEN', 111469484695.5792, 27.54, 54292181.6),
 (19, 'KWT', 153701973802.5892, 0.0, 4635611.2),
 (20, 'MHL', 258751568.84596044, 0.0, 401