In [7]:
import pandas as pd
import asyncio
import aiohttp
from typing import List, Dict, Optional, Union
import nest_asyncio
import logging
from aiohttp import ClientError, ClientResponseError
from cachetools import cached, TTLCache, keys
import json
import re
import time

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Apply nest_asyncio to allow running async code in Jupyter notebooks
nest_asyncio.apply()

# Constants
WORLD_BANK_URL = 'http://api.worldbank.org/v2'
CACHE_SIZE = 128
CACHE_TTL = 3600  # Cache timeout in seconds

class WBRequestError(ClientResponseError):
    """Custom error for World Bank API request issues"""

# Timing decorator
def timing_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time:.4f} seconds to execute.")
        return result
    return wrapper

@timing_decorator
def collapse(values: Union[str, List, None]) -> str:
    """
    Collapse multiple values to a semicolon-separated list of values.
    
    Args:
    values (Union[str, List, None]): The value(s) to collapse
    
    Returns:
    str: A semicolon-separated string of values
    """
    if isinstance(values, str):
        return values
    if values is None:
        return 'all'
    if isinstance(values, list):
        return ';'.join([collapse(v) for v in values])
    return str(values)
    
@timing_decorator
def extract_preferred_field(data: Union[Dict, List, str], id_or_value: str) -> str:
    """
    Extract the preferred representation of data when it has multiple representations.
    
    Args:
    data (Union[Dict, List, str]): The data to extract from
    id_or_value (str): Whether to prefer 'id' or 'value' fields
    
    Returns:
    str: The extracted preferred field
    """
    if not id_or_value:
        return data

    if not data:
        return ''

    if isinstance(data, dict):
        return data.get(id_or_value, '')

    if isinstance(data, list):
        return ','.join([extract_preferred_field(i, id_or_value) for i in data])

    return data
    
@timing_decorator
def _extract_message(msg: str) -> str:
    """
    Extract error message from XML response.
    
    Args:
    msg (str): The XML message to parse
    
    Returns:
    str: The extracted error message
    """
    if 'wb:message' not in msg:
        return msg
    return re.sub(re.compile('.*<wb:message[^>]*>', re.DOTALL), '',
                  re.sub(re.compile('</wb:message>.*', re.DOTALL), '', msg))

@timing_decorator
async def fetch_page(session: aiohttp.ClientSession, url: str, params: Dict) -> Optional[List[Dict]]:
    """
    Fetch a single page of data from the World Bank API.
    
    Args:
    session (aiohttp.ClientSession): The aiohttp session to use for requests
    url (str): The URL to fetch data from
    params (Dict): The parameters for the API request
    
    Returns:
    Optional[List[Dict]]: A list of data dictionaries from the API response, or None if there's an error
    """
    try:
        async with session.get(url, params=params) as response:
            response.raise_for_status()
            data = await response.json()
            return data
    except ClientResponseError as e:
        logger.error(f"HTTP error occurred: {e.status} - {e.message}")
        raise WBRequestError(e.request_info, e.history, status=e.status, message=e.message)
    except ClientError as e:
        logger.error(f"Error fetching data: {str(e)}")
    except Exception as e:
        logger.error(f"Unexpected error occurred: {str(e)}")
    return None

@timing_decorator
async def fetch_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: int = None, language: str = 'en') -> pd.DataFrame:
    """
    Fetch data for specified years and countries for a given indicator from the World Bank API.
    
    Args:
    indicator (str): The indicator code (e.g., 'NY.GDP.PCAP.CD' for GDP per capita)
    countries (Union[str, List[str]]): The country or countries to fetch data for
    start_year (int): The starting year for data retrieval (default: 1960)
    end_year (int): The ending year for data retrieval (default: current year)
    language (str): The language for the API response (default: 'en')
    
    Returns:
    pd.DataFrame: A DataFrame containing the fetched data
    """
    if end_year is None:
        end_year = pd.Timestamp.now().year
    
    url = f"{WORLD_BANK_URL}/country/{collapse(countries)}/indicator/{indicator}"
    params = {
        'format': 'json',
        'per_page': 10000,
        'date': f"{start_year}:{end_year}",
        'language': language
    }
    
    async with aiohttp.ClientSession() as session:
        all_data = []
        page = 1
        
        while True:
            params['page'] = page
            data = await fetch_page(session, url, params)
            
            if not data or len(data) < 2 or not data[1]:
                break
            
            all_data.extend(data[1])
            page += 1
            
            # Check if we've reached the last page
            if len(data[1]) < params['per_page']:
                break
    
    return process_world_bank_data(all_data, indicator)

@timing_decorator
def process_world_bank_data(data: List[Dict], indicator: str) -> pd.DataFrame:
    """
    Process the raw data from the World Bank API into a clean DataFrame.
    
    Args:
    data (List[Dict]): Raw data from the World Bank API
    indicator (str): The indicator code used for the data retrieval
    
    Returns:
    pd.DataFrame: A processed DataFrame containing the World Bank data
    """
    if not data:
        logger.warning(f"No data retrieved for indicator: {indicator}")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    
    # Process the DataFrame
    df['country_name'] = df['country'].apply(lambda x: x['value'] if isinstance(x, dict) else x)
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y', errors='coerce')
    df = df.drop(columns=['indicator', 'obs_status', 'decimal', 'country'])
    
    # Rename columns for clarity
    df = df.rename(columns={'countryiso3code': 'country_code', 'date': 'year'})
    
    # Set index for easier data manipulation
    df = df.set_index(['country_name', 'country_code', 'year'])
    
    # Sort the index
    df = df.sort_index()
    
    return df

@timing_decorator
def _robust_key(*args, **kwargs):
    """Generate a robust cache key"""
    # Convert any list arguments to tuples
    args = tuple(tuple(arg) if isinstance(arg, list) else arg for arg in args)
    
    # Convert any list values in kwargs to tuples
    for key, value in kwargs.items():
        if isinstance(value, list):
            kwargs[key] = tuple(value)
    
    if 'proxies' in kwargs:
        kwargs['proxies'] = json.dumps(kwargs['proxies'])
    
    return keys.hashkey(*args, **kwargs)

@timing_decorator
@cached(TTLCache(CACHE_SIZE, CACHE_TTL), key=_robust_key)
def get_world_bank_data(indicator: str, countries: Union[str, List[str]], start_year: int = 1960, end_year: int = None, language: str = 'en') -> pd.DataFrame:
    """
    Fetch and process World Bank data for a given indicator.
    
    Args:
    indicator (str): The indicator code (e.g., 'NY.GDP.PCAP.CD' for GDP per capita)
    countries (Union[str, List[str]]): The country or countries to fetch data for
    start_year (int): The starting year for data retrieval (default: 1960)
    end_year (int): The ending year for data retrieval (default: current year)
    language (str): The language for the API response (default: 'en')
    
    Returns:
    pd.DataFrame: A processed DataFrame containing the World Bank data
    """
    try:
        df = asyncio.run(fetch_world_bank_data(indicator, countries, start_year, end_year, language))
        logger.info(f"Successfully retrieved data for indicator: {indicator}")
        return df
    except WBRequestError as e:
        logger.error(f"World Bank API error: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        logger.error(f"Error retrieving data for indicator {indicator}: {str(e)}")
        return pd.DataFrame()


In [9]:
# Example usage with timing
if __name__ == "__main__":
    start_time = time.time()
    
    indicator = "NY.GDP.PCAP.CD"  # GDP per capita
    countries = ["AUS", "UKR", "GBR"]
    df = get_world_bank_data(indicator, countries, start_year=2000, end_year=2020)
    print(df.head())
    
    end_time = time.time()
    print(f"Total execution time: {end_time - start_time:.4f} seconds")

_robust_key took 0.0000 seconds to execute.
fetch_world_bank_data took 0.0000 seconds to execute.
collapse took 0.0000 seconds to execute.
collapse took 0.0000 seconds to execute.
collapse took 0.0000 seconds to execute.
collapse took 0.0001 seconds to execute.
fetch_page took 0.0000 seconds to execute.


2024-08-03 18:04:41,729 - INFO - Successfully retrieved data for indicator: NY.GDP.PCAP.CD


process_world_bank_data took 0.0129 seconds to execute.
get_world_bank_data took 89.0052 seconds to execute.
                                             value unit
country_name country_code year                         
Australia    AUS          2000-01-01  21870.415967     
                          2001-01-01  19695.729738     
                          2002-01-01  20301.843171     
                          2003-01-01  23718.133850     
                          2004-01-01  30836.730682     
Total execution time: 89.0073 seconds


In [4]:
import time
import world_bank_data as wb
import pandas as pd

def get_world_bank_data(indicator: str, years: int = 1) -> pd.DataFrame:
    start_time = time.time()
    
    # Fetch the data
    data = wb.get_series(indicator, mrv=years)
    
    # Convert to DataFrame if it's not already
    if not isinstance(data, pd.DataFrame):
        data = data.to_frame()
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    print(f"Time taken to fetch data: {elapsed_time:.2f} seconds")
    return data

# Example usage
if __name__ == "__main__":
    indicator = "NY.GDP.PCAP.CD"  # GDP per capita
    years = 100  # Let's fetch 10 years of data
    
    df = get_world_bank_data(indicator, years)
    print(df.head())
    print(f"Shape of the DataFrame: {df.shape}")

Time taken to fetch data: 0.75 seconds
                                                               NY.GDP.PCAP.CD
Country                     Series                       Year                
Africa Eastern and Southern GDP per capita (current US$) 1960      162.342517
                                                         1961      166.263682
                                                         1962      171.961916
                                                         1963      182.018479
                                                         1964      192.639989
Shape of the DataFrame: (17024, 1)
