In [9]:
import requests
import pandas as pd
from typing import List, Union
import logging

# Configure logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
# Base URL of the World Bank v2 API. HTTPS is used so requests are encrypted
# in transit and do not depend on an http -> https redirect.
WORLD_BANK_URL = 'https://api.worldbank.org/v2'

def fetch_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: Union[int, None] = None, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch data for specified years and countries for a given indicator from the World Bank API.

    Pages through the endpoint until a short or empty page is returned, then
    hands the accumulated records to ``process_world_bank_data``.

    Args:
        indicator: World Bank indicator code, e.g. ``"NY.GDP.PCAP.CD"``.
        countries: A single country string (used verbatim in the URL) or a
            sequence of country codes, which are joined with ``;``.
        start_year: First year of the requested date range.
        end_year: Last year of the requested date range; defaults to the
            current year when ``None``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The DataFrame produced by ``process_world_bank_data`` from whatever
        records were collected. Request failures are logged and end the
        paging loop (best effort), so the result may be partial or empty.
    """
    if end_year is None:
        end_year = pd.Timestamp.now().year

    # Join any non-string sequence (list, tuple, ...); a bare string is
    # passed through unchanged so "AUS" or "AUS;JPN" keep working.
    countries_str = countries if isinstance(countries, str) else ';'.join(countries)

    url = f"{WORLD_BANK_URL}/country/{countries_str}/indicator/{indicator}"
    params = {
        'format': 'json',
        'per_page': 1000,  # Increased from default 50
        'date': f"{start_year}:{end_year}"
    }

    all_data = []
    page = 1

    while True:
        params['page'] = page
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            logger.error(f"Error fetching data: {str(e)}")
            break

        # Surface API-level errors (a lone element carrying a "message"
        # entry) instead of silently returning nothing.
        if isinstance(data, list) and data and isinstance(data[0], dict) and 'message' in data[0]:
            logger.error(f"World Bank API error for {indicator}: {data[0]['message']}")
            break

        # Expected shape is [metadata, records]; anything else ends paging.
        if not isinstance(data, list) or len(data) < 2 or not data[1]:
            break

        all_data.extend(data[1])

        # A page shorter than per_page is the last one.
        if len(data[1]) < params['per_page']:
            break

        page += 1

    return process_world_bank_data(all_data, indicator)

def process_world_bank_data(data: List[dict], indicator: str) -> pd.DataFrame:
    """Process the fetched World Bank data into a DataFrame.

    Args:
        data: Raw record dicts as returned by the API. Each record is
            expected to have ``country``, ``countryiso3code``, ``date`` and
            ``value`` keys.
        indicator: Indicator code; used as the name of the value column.

    Returns:
        A DataFrame indexed by ``(country_name, country_code, year)`` and
        sorted by that index, with the numeric values in a column named
        after ``indicator``. Any other fields that are not explicitly
        dropped below are kept as columns. An empty DataFrame is returned
        (and a warning logged) when ``data`` is empty.
    """
    if not data:
        logger.warning(f"No data retrieved for indicator: {indicator}")
        return pd.DataFrame()

    df = pd.DataFrame(data)

    # "country" is a nested dict; keep only its display name.
    df['country_name'] = df['country'].apply(lambda x: x['value'] if isinstance(x, dict) else x)
    # Missing observations become NaN rather than raising.
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'], format='%Y')

    # errors='ignore' so a payload lacking one of these metadata fields
    # (e.g. no "obs_status") does not raise KeyError.
    df = df.drop(columns=['indicator', 'obs_status', 'decimal', 'country'], errors='ignore')
    df = df.rename(columns={'countryiso3code': 'country_code', 'date': 'year', 'value': indicator})

    return df.set_index(['country_name', 'country_code', 'year']).sort_index()

def get_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: Union[int, None] = None) -> pd.DataFrame:
    """Fetch and process World Bank data for a given indicator.

    Thin wrapper around ``fetch_world_bank_data`` that never raises: any
    exception is logged with its traceback and an empty DataFrame is
    returned instead.

    Args:
        indicator: World Bank indicator code.
        countries: A country string or a list of country codes.
        start_year: First year of the requested range.
        end_year: Last year of the requested range (``None`` lets the
            fetcher pick its default).

    Returns:
        The fetched DataFrame, or an empty DataFrame on failure.
    """
    try:
        df = fetch_world_bank_data(indicator, countries, start_year, end_year)
    except Exception as e:
        logger.exception(f"Error retrieving data for {indicator}: {str(e)}")
        return pd.DataFrame()

    # The fetcher logs request errors itself and then returns an empty frame,
    # so only report success when rows actually came back.
    if not df.empty:
        logger.info(f"Successfully retrieved data for {indicator}")
    return df

In [12]:
import time

if __name__ == "__main__":
    # Demo: GDP per capita (current US$) for three countries, 2000-2020.
    indicator = "NY.GDP.PCAP.CD"
    countries = ["AUS", "UKR", "JPN"]
    start_year = 2000
    end_year = 2020

    # perf_counter is a monotonic, high-resolution clock; unlike time.time()
    # it is not affected by system clock adjustments during the measurement.
    start_time = time.perf_counter()
    df = get_world_bank_data(indicator, countries, start_year, end_year)
    end_time = time.perf_counter()

    print(df.head())
    print(f"Execution time: {end_time - start_time:.2f} seconds")

2024-08-05 13:31:06,088 - INFO - Successfully retrieved data for NY.GDP.PCAP.CD


                                      NY.GDP.PCAP.CD unit
country_name country_code year                           
Australia    AUS          2000-01-01    21870.415967     
                          2001-01-01    19695.729738     
                          2002-01-01    20301.843171     
                          2003-01-01    23718.133850     
                          2004-01-01    30836.730682     
Execution time: 0.57 seconds


In [13]:
import time
import world_bank_data as wb
import pandas as pd

def get_world_bank_data(indicator: str, years: int = 1) -> pd.DataFrame:
    """Fetch an indicator series via the ``world_bank_data`` package and time the call.

    NOTE(review): this redefines ``get_world_bank_data`` from an earlier
    cell with a different signature; once this cell has run, the earlier
    ``(indicator, countries, start_year, end_year)`` version is shadowed.

    Args:
        indicator: World Bank indicator code, e.g. ``"NY.GDP.PCAP.CD"``.
        years: Forwarded to ``wb.get_series`` as ``mrv`` (presumably the
            number of most recent values per country; confirm against the
            package documentation).

    Returns:
        The fetched data as a DataFrame; a non-DataFrame result is
        converted with ``to_frame()``.

    Side effects:
        Prints the elapsed time of the fetch and conversion to stdout.
    """
    # Monotonic clock: immune to wall-clock adjustments while timing.
    start_time = time.perf_counter()

    # Fetch the data
    data = wb.get_series(indicator, mrv=years)

    # Convert to DataFrame if it's not already
    if not isinstance(data, pd.DataFrame):
        data = data.to_frame()

    elapsed_time = time.perf_counter() - start_time

    print(f"Time taken to fetch data: {elapsed_time:.2f} seconds")
    return data

# Example usage
if __name__ == "__main__":
    indicator = "NY.GDP.PCAP.CD"  # GDP per capita (current US$)
    years = 50  # Passed as mrv to wb.get_series: 50 values per country (not 10)
    
    df = get_world_bank_data(indicator, years)
    print(df.head())
    print(f"Shape of the DataFrame: {df.shape}")

Time taken to fetch data: 0.27 seconds
                                                               NY.GDP.PCAP.CD
Country                     Series                       Year                
Africa Eastern and Southern GDP per capita (current US$) 1974      421.977185
                                                         1975      435.977902
                                                         1976      430.261244
                                                         1977      468.301007
                                                         1978      509.479882
Shape of the DataFrame: (13300, 1)
