In [1]:
import requests
import pandas as pd

In [10]:
def get_ons_timeseries(series_id: str, dataset_id: str) -> pd.DataFrame:
    """
    Fetch a time series from the UK ONS API.
    
    Args:
        series_id: The series identifier
        dataset_id: The dataset identifier 
    
    Returns:
        DataFrame with the time series data
    """
    # ONS API endpoint for time series data
    url = f"https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/labourproductivity/timeseries/{series_id}/{dataset_id}/data"
    
    response = requests.get(url)
    response.raise_for_status()
    
    data = response.json()
    
    # Extract the time series observations
    # ONS data typically has 'years', 'quarters', 'months' sections
    records = []
    
    for period_type in ['years', 'quarters', 'months']:
        if period_type in data:
            for obs in data[period_type]:
                records.append({
                    'date': obs.get('date'),
                    'value': float(obs.get('value')) if obs.get('value') else None,
                    'period_type': period_type,
                    'label': obs.get('label')
                })
    
    df = pd.DataFrame(records)
    
    # Add metadata
    df.attrs['title'] = data.get('description', {}).get('title', '')
    df.attrs['unit'] = data.get('description', {}).get('unit', '')
    
    return df

In [3]:
# Fetch the LZVD series from PRDY dataset
productivity_growth = get_ons_timeseries('lzvd', 'prdy')

productivity_growth = productivity_growth[productivity_growth['period_type'] == 'quarters']
# turn 'date' from YYYY Qn to datetime

def quarter_to_date(date_str):
    year, quarter = date_str.split(' Q')
    month = (int(quarter) - 1) * 3 + 1
    return f"{year}-{month:02d}-01"

productivity_growth['date'] = productivity_growth['date'].apply(quarter_to_date)

productivity_growth['value'] = productivity_growth['value'] / 100  # Convert percentage to decimal

productivity_growth.to_csv('productivity_growth.csv', index=False)

In [5]:
# OECD SDMX REST API endpoint
BASE_URL = "https://sdmx.oecd.org/public/rest/data"

# G7 country codes
G7_COUNTRIES = ["USA", "GBR", "DEU", "FRA", "ITA", "JPN", "CAN"]


In [19]:


def fetch_investment_share():
    """
    Fetch investment (GFCF) as share of GDP from OECD National Accounts
    Using SNA_TABLE1 - GDP expenditure approach
    """
    
    # Build the data query
    # Dataset: SNA_TABLE1 (Main aggregates, expenditure approach)
    # We need: P51 (GFCF) and B1_GE (GDP) to calculate the ratio
    # Or use pre-calculated shares from QNA dataset
    
    dataset = "SNA_TABLE1"
    countries = "+".join(G7_COUNTRIES)
    
    # Transaction codes: P51 = GFCF, B1_GE = GDP
    # Measure: V = current prices, C = volume
    # We'll get both and calculate share
    
    # Query for GFCF and GDP in current prices
    query = f"{dataset}/{countries}.P51+B1_GE.C"
    
    url = f"{BASE_URL}/{query}"
    
    params = {
        "startPeriod": "2000",
        "dimensionAtObservation": "AllDimensions",
        "format": "csv"
    }
    
    print(f"Fetching data from: {url}")
    
    response = requests.get(url, params=params)
    
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print("Trying alternative dataset (QNA - Quarterly National Accounts)...")
        return fetch_from_qna()
    
    # Parse CSV response
    from io import StringIO
    df = pd.read_csv(StringIO(response.text))
    
    return process_data(df)


def fetch_from_qna():
    """
    Alternative: Use Quarterly National Accounts with annual frequency
    This dataset often has better coverage for investment ratios
    """
    
    dataset = "QNA"
    countries = "+".join(G7_COUNTRIES)
    
    # P51: GFCF, B1_GE: GDP
    # VOBARSA: Current prices, seasonally adjusted annual rate
    # Or use GDP ratio directly if available
    
    query = f"{dataset}/{countries}.P51+B1_GE.VOBARSA.A"
    url = f"{BASE_URL}/{query}"
    
    params = {
        "startPeriod": "2000",
        "format": "csv"
    }
    
    print(f"Trying QNA dataset: {url}")
    response = requests.get(url, params=params)
    
    if response.status_code != 200:
        print(f"QNA also failed: {response.status_code}")
        print("Trying KEI (Key Economic Indicators)...")
        return fetch_from_kei()
    
    from io import StringIO
    df = pd.read_csv(StringIO(response.text))
    return process_data(df)


def fetch_from_kei():
    """
    Fallback: Key Economic Indicators dataset
    Often has pre-calculated ratios
    """
    
    # Try the Economic Outlook database which has investment ratios
    dataset = "EO"  # Economic Outlook
    countries = "+".join(G7_COUNTRIES)
    
    # ITISKV: Total investment as % of GDP (volume)
    # Or ITISK: Investment share
    
    url = f"{BASE_URL}/{dataset}/{countries}.ITISKV"
    
    params = {
        "startPeriod": "2000",
        "format": "csv"
    }
    
    print(f"Trying Economic Outlook: {url}")
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        from io import StringIO
        df = pd.read_csv(StringIO(response.text))
        print("\nData retrieved successfully from Economic Outlook!")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        return df
    
    # Final fallback - simple data API
    print("Trying simpler OECD.Stat approach...")
    return fetch_simple_approach()


def fetch_simple_approach():
    """
    Simple approach using OECD data explorer format
    """
    
    # Use the newer OECD Data Explorer API
    # Investment ratio from National Accounts at a Glance
    
    results = {}
    
    for country in G7_COUNTRIES:
        # SNA Table 1 with specific structure
        url = f"https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_TABLE1_EXPENDITURE,1.0/{country}.A.P51._T.V.N"
        
        params = {
            "startPeriod": "2000",
            "format": "csv"
        }
        
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            from io import StringIO
            df = pd.read_csv(StringIO(response.text))
            if not df.empty:
                results[country] = df
                print(f"✓ Got data for {country}")
        else:
            print(f"✗ No data for {country}: {response.status_code}")
    
    if results:
        return pd.concat(results.values(), ignore_index=True)
    
    return None


def process_data(df):
    """Process and pivot the data for analysis"""
    
    print("\nRaw data sample:")
    print(df.head())
    print(f"\nColumns: {list(df.columns)}")
    
    # Typical OECD CSV columns include REF_AREA, TIME_PERIOD, OBS_VALUE
    if 'REF_AREA' in df.columns and 'TIME_PERIOD' in df.columns:
        # Pivot to get countries as columns, years as rows
        pivot = df.pivot_table(
            index='TIME_PERIOD',
            columns='REF_AREA',
            values='OBS_VALUE'
        )
        return pivot
    
    return df


def calculate_investment_ratio(gfcf_df, gdp_df):
    """Calculate GFCF/GDP ratio if we have separate series"""
    
    ratio = (gfcf_df / gdp_df) * 100
    return ratio


# Alternative: Direct World Bank API (more reliable for this metric)
def fetch_from_world_bank():
    """
    Fallback to World Bank API which has reliable investment/GDP data
    Indicator: NE.GDI.FTOT.ZS (Gross fixed capital formation % of GDP)
    """
    
    print("\n" + "="*60)
    print("Fetching from World Bank API (more reliable for this metric)")
    print("="*60)
    
    # World Bank country codes differ slightly
    wb_countries = {
        "USA": "US", "GBR": "GB", "DEU": "DE", 
        "FRA": "FR", "ITA": "IT", "JPN": "JP", "CAN": "CA"
    }
    
    country_string = ";".join(wb_countries.values())
    indicator = "NE.GDI.FTOT.ZS"  # Gross fixed capital formation (% of GDP)
    
    url = f"https://api.worldbank.org/v2/country/{country_string}/indicator/{indicator}"
    
    params = {
        "format": "json",
        "date": "2000:2024",
        "per_page": 500
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        
        if len(data) > 1 and data[1]:
            records = []
            for item in data[1]:
                if item['value'] is not None:
                    records.append({
                        'country': item['country']['value'],
                        'country_code': item['countryiso3code'],
                        'year': int(item['date']),
                        'investment_pct_gdp': item['value']
                    })
            
            df = pd.DataFrame(records)
            
            # Pivot for easy viewing
            pivot = df.pivot_table(
                index='year',
                columns='country',
                values='investment_pct_gdp'
            ).sort_index()
            
            return pivot
    
    return None


if __name__ == "__main__":
    
    # Try OECD first
    print("Attempting to fetch from OECD API...")
    print("="*60)
    
    oecd_data = fetch_investment_share()
    
    if oecd_data is not None and not oecd_data.empty:
        print("\n" + "="*60)
        print("OECD DATA: Investment as % of GDP (G7 Countries)")
        print("="*60)
        print(oecd_data)
    
    # World Bank is often more reliable for this specific indicator
    wb_data = fetch_from_world_bank()
    
    if wb_data is not None:
        print("\n" + "="*60)
        print("WORLD BANK DATA: Gross Fixed Capital Formation (% of GDP)")
        print("="*60)
        print(wb_data.round(1).to_string())
        
        # Save to CSV
        wb_data.to_csv("g7_investment_share_gdp.csv")
        print("\n✓ Data saved to 'g7_investment_share_gdp.csv'")
        
        # Basic stats
        print("\n" + "="*60)
        print("SUMMARY STATISTICS (2020-2023 average)")
        print("="*60)
        recent = wb_data.loc[2020:2023].mean().sort_values(ascending=False)
        for country, value in recent.items():
            print(f"{country}: {value:.1f}%")

Attempting to fetch from OECD API...
Fetching data from: https://sdmx.oecd.org/public/rest/data/SNA_TABLE1/USA+GBR+DEU+FRA+ITA+JPN+CAN.P51+B1_GE.C
Error: 404
Trying alternative dataset (QNA - Quarterly National Accounts)...
Trying QNA dataset: https://sdmx.oecd.org/public/rest/data/QNA/USA+GBR+DEU+FRA+ITA+JPN+CAN.P51+B1_GE.VOBARSA.A
QNA also failed: 404
Trying KEI (Key Economic Indicators)...
Trying Economic Outlook: https://sdmx.oecd.org/public/rest/data/EO/USA+GBR+DEU+FRA+ITA+JPN+CAN.ITISKV
Trying simpler OECD.Stat approach...
✗ No data for USA: 404
✗ No data for GBR: 404
✗ No data for DEU: 404
✗ No data for FRA: 404
✗ No data for ITA: 404
✗ No data for JPN: 404
✗ No data for CAN: 404

Fetching from World Bank API (more reliable for this metric)

WORLD BANK DATA: Gross Fixed Capital Formation (% of GDP)
country  Canada  France  Germany  Italy  Japan  United Kingdom  United States
year                                                                         
2000       19.6    20.9    

In [20]:
investment_g7 = pd.read_csv('g7_investment_share_gdp.csv')

# convert to datetime
investment_g7['year'] = pd.to_datetime(investment_g7['year'], format='%Y')

# melt
investment_g7 = investment_g7.melt(id_vars=['year'], var_name='country', value_name='value')

investment_g7['value'] = investment_g7['value'] / 100  # convert to decimal 

investment_g7.to_csv('g7_investment_share_gdp.csv', index=False)


In [24]:
"""
OECD API: Labor Productivity Time Series for G7 Countries
GDP per hour worked (USD, constant prices, PPP)
Output in long form (tidy data) with datetime year column
"""

import requests
import pandas as pd
from io import StringIO


# G7 country codes
G7_COUNTRIES = ["USA", "GBR", "DEU", "FRA", "ITA", "JPN", "CAN"]

G7_NAMES = {
    "USA": "United States",
    "GBR": "United Kingdom", 
    "DEU": "Germany",
    "FRA": "France",
    "ITA": "Italy",
    "JPN": "Japan",
    "CAN": "Canada"
}


def fetch_oecd_productivity():
    """
    Fetch labor productivity from OECD Productivity Statistics
    GDP per hour worked - the standard productivity measure
    """
    
    print("="*60)
    print("Fetching OECD Labor Productivity Data")
    print("="*60)
    
    countries = "+".join(G7_COUNTRIES)
    
    url = f"https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_PDB@DF_PDB_LV,1.0/{countries}.T_GDPHRS.USD_PPP_PS"
    
    params = {
        "startPeriod": "1990",
        "format": "csv"
    }
    
    print(f"Trying: {url[:80]}...")
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        df = pd.read_csv(StringIO(response.text))
        if not df.empty:
            print("✓ Got data from OECD Productivity database")
            return process_oecd_data(df)
    
    print(f"Status: {response.status_code}, trying alternative...")
    return None


def process_oecd_data(df):
    """Process OECD CSV response into long form"""
    
    ref_col = None
    time_col = None
    value_col = None
    
    for col in df.columns:
        if 'REF_AREA' in col.upper():
            ref_col = col
        elif 'TIME' in col.upper() and 'PERIOD' in col.upper():
            time_col = col
        elif 'OBS_VALUE' in col.upper():
            value_col = col
    
    if all([ref_col, time_col, value_col]):
        df_long = df[[ref_col, time_col, value_col]].copy()
        df_long.columns = ['country_code', 'year', 'gdp_per_worker']
        
        # Convert year to datetime (January 1st of each year)
        df_long['year'] = pd.to_datetime(df_long['year'], format='%Y')
        
        df_long['gdp_per_worker'] = pd.to_numeric(df_long['gdp_per_worker'], errors='coerce')
        df_long['country'] = df_long['country_code'].map(G7_NAMES)
        
        df_long = df_long[['country', 'country_code', 'year', 'gdp_per_worker']]
        df_long = df_long.sort_values(['country', 'year']).reset_index(drop=True)
        
        return df_long
    
    return df


def fetch_world_bank_productivity():
    """
    World Bank: GDP per person employed
    Returns long form data with datetime year
    """
    
    print("\n" + "="*60)
    print("Fetching World Bank Labor Productivity Data")
    print("(GDP per person employed, constant 2017 PPP $)")
    print("="*60)
    
    wb_countries = {
        "USA": "US", "GBR": "GB", "DEU": "DE",
        "FRA": "FR", "ITA": "IT", "JPN": "JP", "CAN": "CA"
    }
    
    country_string = ";".join(wb_countries.values())
    indicator = "SL.GDP.PCAP.EM.KD"
    
    url = f"https://api.worldbank.org/v2/country/{country_string}/indicator/{indicator}"
    
    params = {
        "format": "json",
        "date": "1990:2024",
        "per_page": 1000
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        
        if len(data) > 1 and data[1]:
            records = []
            for item in data[1]:
                if item['value'] is not None:
                    records.append({
                        'country': item['country']['value'],
                        'country_code': item['countryiso3code'],
                        'year': item['date'],
                        'gdp_per_worker': item['value']
                    })
            
            df_long = pd.DataFrame(records)
            
            # Convert year to datetime (January 1st of each year)
            df_long['year'] = pd.to_datetime(df_long['year'], format='%Y')
            
            df_long = df_long.sort_values(['country', 'year']).reset_index(drop=True)
            
            print(f"✓ Retrieved {len(df_long)} observations")
            return df_long
    
    print(f"World Bank request failed: {response.status_code}")
    return None


def add_derived_columns(df):
    """Add growth rates and indexed values to long form data"""
    
    df = df.copy()
    df = df.sort_values(['country', 'year']).reset_index(drop=True)
    
    # Year-over-year growth rate
    df['growth_rate'] = df.groupby('country')['gdp_per_worker'].pct_change() * 100
    
    # Index to 2000 = 100
    def index_to_base(group, base_year=2000):
        base_date = pd.Timestamp(f'{base_year}-01-01')
        if base_date in group['year'].values:
            base_value = group.loc[group['year'] == base_date, 'gdp_per_worker'].values[0]
        else:
            base_value = group['gdp_per_worker'].iloc[0]
        return (group['gdp_per_worker'] / base_value) * 100
    
    df['indexed_2000'] = df.groupby('country', group_keys=False).apply(index_to_base)
    
    return df


if __name__ == "__main__":
    
    # Try OECD first
    oecd_data = fetch_oecd_productivity()
    
    if oecd_data is not None and not oecd_data.empty:
        print("\n" + "="*60)
        print("OECD DATA (Long Form)")
        print("="*60)
        print(oecd_data.head(20).to_string(index=False))
        print(f"\nYear dtype: {oecd_data['year'].dtype}")
    
    # World Bank
    wb_data = fetch_world_bank_productivity()
    
    if wb_data is not None:
        wb_data = add_derived_columns(wb_data)
        
        print("\n" + "="*60)
        print("WORLD BANK DATA (Long Form)")
        print("="*60)
        print(f"\nShape: {wb_data.shape}")
        print(f"Columns: {list(wb_data.columns)}")
        print(f"Year dtype: {wb_data['year'].dtype}")
        
        print(f"\nSample (recent years):")
        recent = wb_data[wb_data['year'] >= '2018'].copy()
        print(recent.round(2).to_string(index=False))
        
        # Save
        wb_data.to_csv("g7_labor_productivity_long.csv", index=False)
        print("\n✓ Saved to 'g7_labor_productivity_long.csv'")
        
        # Demo datetime filtering
        print("\n" + "="*60)
        print("DATETIME FILTERING EXAMPLES")
        print("="*60)
        
        # Filter by date range
        mask = (wb_data['year'] >= '2010-01-01') & (wb_data['year'] < '2015-01-01')
        print(f"\n2010-2014 data points: {mask.sum()}")
        
        # Extract year component
        print(f"\nYear range: {wb_data['year'].dt.year.min()} - {wb_data['year'].dt.year.max()}")

Fetching OECD Labor Productivity Data
Trying: https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_PDB@DF_PDB_LV,1.0/USA+GB...
Status: 422, trying alternative...

Fetching World Bank Labor Productivity Data
(GDP per person employed, constant 2017 PPP $)
✓ Retrieved 238 observations

WORLD BANK DATA (Long Form)

Shape: (238, 6)
Columns: ['country', 'country_code', 'year', 'gdp_per_worker', 'growth_rate', 'indexed_2000']
Year dtype: datetime64[ns]

Sample (recent years):
       country country_code       year  gdp_per_worker  growth_rate  indexed_2000
        Canada          CAN 2018-01-01       110704.70         0.92        112.42
        Canada          CAN 2019-01-01       110430.51        -0.25        112.14
        Canada          CAN 2020-01-01       111042.87         0.55        112.76
        Canada          CAN 2021-01-01       112202.17         1.04        113.94
        Canada          CAN 2022-01-01       111740.98        -0.41        113.47
        Canada          CAN 202

  df['indexed_2000'] = df.groupby('country', group_keys=False).apply(index_to_base)


In [8]:
awe = pd.read_excel('real_avg_weekly_earnings.xlsx', sheet_name='AWE Real_CPI', skiprows=7, nrows=311)

# keep only the first two cols
awe = awe.iloc[:, :2]

awe.columns = ['date', 'real_awe']

awe['date'] = pd.to_datetime(awe['date'], errors='coerce')

awe.to_csv('real_avg_weekly_earnings.csv', index=False)

In [None]:
# Updated ONS API helper (fixes 404)
def get_ons_timeseries(series_id: str, dataset_id: str) -> pd.DataFrame:
    url = f"https://api.ons.gov.uk/timeseries/{series_id}/dataset/{dataset_id}/data"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()

    records = []
    for period_type in ["years", "quarters", "months"]:
        if period_type in data:
            for obs in data[period_type]:
                records.append({
                    "date": obs.get("date"),
                    "value": float(obs.get("value")) if obs.get("value") else None,
                    "period_type": period_type,
                    "label": obs.get("label"),
                })

    df = pd.DataFrame(records)
    df.attrs["title"] = data.get("description", {}).get("title", "")
    df.attrs["unit"] = data.get("description", {}).get("unit", "")
    return df

# Fetch working age employment rate from ONS
employment_rate = get_ons_timeseries("lf24", "lms")

# Filter for monthly data and prepare for merging with awe
employment_rate = employment_rate[employment_rate["period_type"] == "months"].copy()

# Convert date strings to datetime
employment_rate["date"] = pd.to_datetime(employment_rate["date"], format="%Y-%m")

# Keep only relevant columns
employment_rate = employment_rate[["date", "value"]].rename(columns={"value": "employment_rate"})

# Convert percentage to decimal
employment_rate["employment_rate"] = employment_rate["employment_rate"] / 100

employment_rate.to_csv("uk_employment_rate.csv", index=False)
print(employment_rate.head(10))

HTTPError: 404 Client Error: Not Found for url: https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/labourproductivity/timeseries/lf24/lms/data