In [4]:
import requests
import pandas as pd
from typing import List, Union
import logging

# Configure logging
# INFO level with timestamp/level prefix so fetch progress and failures are visible in cell output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
# Base URL of the World Bank API (v2). HTTPS is used so requests are not sent in clear text.
WORLD_BANK_URL = 'https://api.worldbank.org/v2'

def fetch_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: Union[int, None] = None, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch data for specified years and countries for a given indicator from the World Bank API.

    Args:
        indicator: Indicator code placed in the request path.
        countries: A single country code string, or an iterable of codes
            (list, tuple, ...) which are joined with ';'.
        start_year: First year of the requested date range.
        end_year: Last year of the requested date range; defaults to the
            current year when None.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block forever.

    Returns:
        The DataFrame produced by ``process_world_bank_data`` from all rows
        collected. Fetching is best-effort: on a request failure the error is
        logged and whatever rows were collected so far are still processed.
    """
    if end_year is None:
        end_year = pd.Timestamp.now().year

    # A plain string is used verbatim; any other iterable of codes is joined.
    countries_str = countries if isinstance(countries, str) else ';'.join(countries)
    if not countries_str:
        # An empty country list would produce a malformed URL; skip the request.
        logger.error("Error fetching data: no countries specified")
        return process_world_bank_data([], indicator)

    url = f"{WORLD_BANK_URL}/country/{countries_str}/indicator/{indicator}"
    per_page = 10000
    params = {
        'format': 'json',
        'per_page': per_page,
        'date': f"{start_year}:{end_year}"
    }

    all_data = []
    page = 1

    while True:
        params['page'] = page
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body; keep pages already collected.
            logger.error(f"Error fetching data: {str(e)}")
            break

        # A successful payload is expected to be [metadata, rows].
        if not isinstance(data, list) or len(data) < 2 or not data[1]:
            first = data[0] if isinstance(data, list) and data else None
            if isinstance(first, dict) and 'message' in first:
                # Presumably an API-level error description — surface it rather than dropping it.
                logger.error(f"Error fetching data: {first['message']}")
            break

        rows = data[1]
        all_data.extend(rows)

        # Prefer the page count reported in the metadata element: comparing the
        # row count with per_page would stop early if the server returned fewer
        # rows per page than requested.
        meta = data[0] if isinstance(data[0], dict) else {}
        try:
            total_pages = int(meta['pages'])
        except (KeyError, TypeError, ValueError):
            total_pages = None

        if total_pages is not None:
            if page >= total_pages:
                break
        elif len(rows) < per_page:
            # No usable page count in the metadata: fall back to the short-page heuristic.
            break

        page += 1

    return process_world_bank_data(all_data, indicator)

def process_world_bank_data(data: List[dict], indicator: str) -> pd.DataFrame:
    """Process the fetched World Bank data into a DataFrame.

    Args:
        data: Row records as returned by the API. Each record must provide
            'country', 'countryiso3code', 'date' and 'value'; the metadata
            fields 'indicator', 'obs_status', 'decimal' and 'unit' are
            dropped when present and may be absent.
        indicator: Indicator code; used as the name of the value column.

    Returns:
        A DataFrame indexed by (country_name, country_code, year), sorted by
        that index, with a single numeric column named after ``indicator``.
        An empty DataFrame (and a logged warning) when ``data`` is empty.
    """
    if not data:
        logger.warning(f"No data retrieved for indicator: {indicator}")
        return pd.DataFrame()

    df = pd.DataFrame(data)

    # 'country' is normally a nested dict; keep its display name, pass plain values through.
    df['country_name'] = df['country'].apply(lambda x: x['value'] if isinstance(x, dict) else x)
    # Non-numeric or missing observations become NaN instead of raising.
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'], format='%Y')

    # errors='ignore': a payload lacking some of these optional metadata
    # columns must not fail with KeyError.
    df = df.drop(columns=['indicator', 'obs_status', 'decimal', 'country', 'unit'], errors='ignore')
    df = df.rename(columns={'countryiso3code': 'country_code', 'date': 'year', 'value': indicator})

    return df.set_index(['country_name', 'country_code', 'year']).sort_index()

def get_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: Union[int, None] = None) -> pd.DataFrame:
    """Fetch and process World Bank data for a given indicator.

    Thin wrapper around ``fetch_world_bank_data`` that never raises: any
    exception is logged with its traceback and an empty DataFrame is returned.

    Args:
        indicator: Indicator code to retrieve.
        countries: A single country code or a list of codes.
        start_year: First year of the requested range.
        end_year: Last year of the requested range; None is forwarded as-is.

    Returns:
        The DataFrame from ``fetch_world_bank_data``, or an empty DataFrame
        if an exception occurred.
    """
    try:
        df = fetch_world_bank_data(indicator, countries, start_year, end_year)
    except Exception as e:
        logger.exception(f"Error retrieving data for {indicator}: {str(e)}")
        return pd.DataFrame()

    # The fetch logs and swallows request errors, so an empty frame can come
    # back without an exception; only report success when rows were returned.
    if df.empty:
        logger.warning(f"No data returned for {indicator} (countries: {countries})")
    else:
        logger.info(f"Successfully retrieved data for {indicator}")
    return df

In [5]:
# Example: retrieve one indicator for three countries and preview the result.
if __name__ == "__main__":
    # Query parameters (kept as notebook-level names).
    indicator, countries = "NY.GDP.PCAP.CD", ["USA", "UKR", "JPN"]
    start_year, end_year = 1960, 2020

    df = get_world_bank_data(
        indicator=indicator,
        countries=countries,
        start_year=start_year,
        end_year=end_year,
    )
    # Show the first rows of the indexed frame.
    print(df.head())

2024-08-07 16:46:47,666 - INFO - Successfully retrieved data for NY.GDP.PCAP.CD


                                      NY.GDP.PCAP.CD
country_name country_code year                      
Japan        JPN          1960-01-01      508.702779
                          1961-01-01      608.864581
                          1962-01-01      684.565510
                          1963-01-01      775.592123
                          1964-01-01      902.867722


In [6]:
test  # NOTE(review): leftover scratch cell — `test` is not defined, so this raises NameError and breaks Restart & Run All; remove before sharing.

NameError: name 'test' is not defined