In [4]:
import logging
import pandas as pd
import numpy as np
import wbdata
from pathlib import Path

# Configure the root logger so that every message goes both to a log file
# and to the console (stderr).
_log_handlers = [
    logging.FileHandler('data_fetch_wbdata.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
# The root logger is used directly by the rest of the script.
logger = logging.getLogger()

# Requested date window, passed to the fetch call as a (start, end) tuple.
start_date = '1987-05-20'
end_date = '2022-11-14'

# Indicator code -> output column name and the country/aggregate code to query.
# Built from a compact table of (indicator code, name, country) rows; the
# insertion order of the rows is the order in which indicators are fetched.
indicators = {
    code: {'name': label, 'country': region}
    for code, label, region in (
        ('NY.GDP.MKTP.CD', 'GDP', 'WLD'),
        ('FP.CPI.TOTL.ZG', 'CPI', 'WLD'),
        ('SL.UEM.TOTL.ZS', 'Unemployment_Rate', 'WLD'),
        ('PA.NUS.FCRF', 'Exchange_Rate', 'EMU'),
    )
}

# Download every configured indicator. A failed download is logged and
# recorded as an empty DataFrame so that later steps can skip it.
data_frames = {}
for indicator_code, info in indicators.items():
    label = info['name']
    logger.info(f"Fetching {label} data...")
    try:
        data = wbdata.get_dataframe(
            {indicator_code: label},
            country=info['country'],
            date=(start_date, end_date),
        )
    except Exception as e:
        # Best effort: keep going with the remaining indicators.
        logger.error(f"Error fetching {label} data: {str(e)}")
        data_frames[label] = pd.DataFrame()
        logger.warning(f"No data available for {label}")
    else:
        logger.info(f"Successfully fetched {label} data")
        data_frames[label] = data

# Process data function
def process_data(df, indicator_name):
    """
    Clean a fetched indicator series and resample it to daily frequency.

    The input is expected to be a single-column DataFrame indexed by date
    (one index level); after ``reset_index`` it must have exactly two
    columns, which are renamed to ``date`` and ``indicator_name``.

    Steps: parse dates, turn +/-inf into NaN, drop incomplete rows, drop
    duplicate dates (keeping the last occurrence), sort chronologically,
    reindex onto a daily calendar between the first and last observation,
    and interpolate the gaps.

    Args:
        df: Raw data for one indicator, or ``None``.
        indicator_name: Name used for the value column and in log messages.

    Returns:
        A DataFrame with columns ``Date`` and ``indicator_name`` at daily
        frequency, or an empty DataFrame when there is no usable data or
        processing fails (the failure is logged, not raised).
    """
    try:
        if df is None or df.empty:
            logger.warning(f"No data available for {indicator_name}")
            return pd.DataFrame()

        # Clean and process the data
        df = df.reset_index()
        df.columns = ['date', indicator_name]
        df['date'] = pd.to_datetime(df['date'])

        # Handle missing values
        df[indicator_name] = df[indicator_name].replace([np.inf, -np.inf], np.nan)
        df = df.dropna()
        if df.empty:
            # Nothing left to interpolate; this is missing data, not an error.
            logger.warning(f"No data available for {indicator_name}")
            return pd.DataFrame()

        # reindex() refuses duplicate labels, so keep one row per date
        # (the last one seen) and put the series in chronological order.
        df = df.drop_duplicates(subset='date', keep='last').sort_values('date')

        # Convert to daily frequency
        full_index = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
        df_daily = df.set_index('date').reindex(full_index)

        # Cubic interpolation needs at least four known points; with fewer,
        # fall back to linear instead of discarding the whole series.
        method = 'cubic' if len(df) >= 4 else 'linear'
        df_daily = df_daily.interpolate(method=method)

        # Name the index explicitly so the output column is always 'Date'.
        df_daily = df_daily.rename_axis('Date').reset_index()

        logger.info(f"Successfully processed {indicator_name} data")
        return df_daily

    except Exception as e:
        # Boundary handler: log with traceback and signal failure to the
        # caller with an empty frame.
        logger.exception(f"Error processing {indicator_name} data: {str(e)}")
        return pd.DataFrame()

# Clean every fetched dataset and write the non-empty results as CSV files
# into the output directory (created on demand).
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)

for name, df in data_frames.items():
    processed_df = process_data(df, name)
    if processed_df.empty:
        # Nothing usable for this indicator; process_data already logged why.
        continue
    output_path = output_dir.joinpath(f"{name}_cleaned_data_daily.csv")
    processed_df.to_csv(output_path, index=False)
    logger.info(f"Successfully saved {name} data to {output_path}")


2024-11-08 12:37:22,342 - INFO - Fetching GDP data...
2024-11-08 12:37:22,347 - INFO - Successfully fetched GDP data
2024-11-08 12:37:22,348 - INFO - Fetching CPI data...
2024-11-08 12:37:22,355 - INFO - Successfully fetched CPI data
2024-11-08 12:37:22,357 - INFO - Fetching Unemployment_Rate data...
2024-11-08 12:37:22,368 - INFO - Successfully fetched Unemployment_Rate data
2024-11-08 12:37:22,368 - INFO - Fetching Exchange_Rate data...
2024-11-08 12:37:22,376 - INFO - Successfully fetched Exchange_Rate data
2024-11-08 12:37:22,385 - INFO - Successfully processed GDP data
2024-11-08 12:37:22,411 - INFO - Successfully saved GDP data to ..\data\GDP_cleaned_data_daily.csv
2024-11-08 12:37:22,417 - INFO - Successfully processed CPI data
2024-11-08 12:37:22,443 - INFO - Successfully saved CPI data to ..\data\CPI_cleaned_data_daily.csv
2024-11-08 12:37:22,450 - INFO - Successfully processed Unemployment_Rate data
2024-11-08 12:37:22,468 - INFO - Successfully saved Unemployment_Rate data to