In [1]:
import requests
import pandas as pd

In [10]:
def get_ons_timeseries(series_id: str, dataset_id: str) -> pd.DataFrame:
    """
    Fetch a time series from the UK ONS API.
    
    Args:
        series_id: The series identifier
        dataset_id: The dataset identifier 
    
    Returns:
        DataFrame with the time series data
    """
    # ONS API endpoint for time series data
    url = f"https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/labourproductivity/timeseries/{series_id}/{dataset_id}/data"
    
    response = requests.get(url)
    response.raise_for_status()
    
    data = response.json()
    
    # Extract the time series observations
    # ONS data typically has 'years', 'quarters', 'months' sections
    records = []
    
    for period_type in ['years', 'quarters', 'months']:
        if period_type in data:
            for obs in data[period_type]:
                records.append({
                    'date': obs.get('date'),
                    'value': float(obs.get('value')) if obs.get('value') else None,
                    'period_type': period_type,
                    'label': obs.get('label')
                })
    
    df = pd.DataFrame(records)
    
    # Add metadata
    df.attrs['title'] = data.get('description', {}).get('title', '')
    df.attrs['unit'] = data.get('description', {}).get('unit', '')
    
    return df

In [3]:
# Fetch the LZVD series from PRDY dataset
productivity_growth = get_ons_timeseries('lzvd', 'prdy')

productivity_growth = productivity_growth[productivity_growth['period_type'] == 'quarters']
# turn 'date' from YYYY Qn to datetime

def quarter_to_date(date_str):
    year, quarter = date_str.split(' Q')
    month = (int(quarter) - 1) * 3 + 1
    return f"{year}-{month:02d}-01"

productivity_growth['date'] = productivity_growth['date'].apply(quarter_to_date)

productivity_growth['value'] = productivity_growth['value'] / 100  # Convert percentage to decimal

productivity_growth.to_csv('productivity_growth.csv', index=False)

In [5]:
# OECD SDMX REST API endpoint
BASE_URL = "https://sdmx.oecd.org/public/rest/data"

# G7 country codes
G7_COUNTRIES = ["USA", "GBR", "DEU", "FRA", "ITA", "JPN", "CAN"]


In [19]:


def fetch_investment_share():
    """
    Fetch investment (GFCF) as share of GDP from OECD National Accounts
    Using SNA_TABLE1 - GDP expenditure approach
    """
    
    # Build the data query
    # Dataset: SNA_TABLE1 (Main aggregates, expenditure approach)
    # We need: P51 (GFCF) and B1_GE (GDP) to calculate the ratio
    # Or use pre-calculated shares from QNA dataset
    
    dataset = "SNA_TABLE1"
    countries = "+".join(G7_COUNTRIES)
    
    # Transaction codes: P51 = GFCF, B1_GE = GDP
    # Measure: V = current prices, C = volume
    # We'll get both and calculate share
    
    # Query for GFCF and GDP in current prices
    query = f"{dataset}/{countries}.P51+B1_GE.C"
    
    url = f"{BASE_URL}/{query}"
    
    params = {
        "startPeriod": "2000",
        "dimensionAtObservation": "AllDimensions",
        "format": "csv"
    }
    
    print(f"Fetching data from: {url}")
    
    response = requests.get(url, params=params)
    
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print("Trying alternative dataset (QNA - Quarterly National Accounts)...")
        return fetch_from_qna()
    
    # Parse CSV response
    from io import StringIO
    df = pd.read_csv(StringIO(response.text))
    
    return process_data(df)


def fetch_from_qna():
    """
    Alternative: Use Quarterly National Accounts with annual frequency
    This dataset often has better coverage for investment ratios
    """
    
    dataset = "QNA"
    countries = "+".join(G7_COUNTRIES)
    
    # P51: GFCF, B1_GE: GDP
    # VOBARSA: Current prices, seasonally adjusted annual rate
    # Or use GDP ratio directly if available
    
    query = f"{dataset}/{countries}.P51+B1_GE.VOBARSA.A"
    url = f"{BASE_URL}/{query}"
    
    params = {
        "startPeriod": "2000",
        "format": "csv"
    }
    
    print(f"Trying QNA dataset: {url}")
    response = requests.get(url, params=params)
    
    if response.status_code != 200:
        print(f"QNA also failed: {response.status_code}")
        print("Trying KEI (Key Economic Indicators)...")
        return fetch_from_kei()
    
    from io import StringIO
    df = pd.read_csv(StringIO(response.text))
    return process_data(df)


def fetch_from_kei():
    """
    Fallback: Key Economic Indicators dataset
    Often has pre-calculated ratios
    """
    
    # Try the Economic Outlook database which has investment ratios
    dataset = "EO"  # Economic Outlook
    countries = "+".join(G7_COUNTRIES)
    
    # ITISKV: Total investment as % of GDP (volume)
    # Or ITISK: Investment share
    
    url = f"{BASE_URL}/{dataset}/{countries}.ITISKV"
    
    params = {
        "startPeriod": "2000",
        "format": "csv"
    }
    
    print(f"Trying Economic Outlook: {url}")
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        from io import StringIO
        df = pd.read_csv(StringIO(response.text))
        print("\nData retrieved successfully from Economic Outlook!")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        return df
    
    # Final fallback - simple data API
    print("Trying simpler OECD.Stat approach...")
    return fetch_simple_approach()


def fetch_simple_approach():
    """
    Simple approach using OECD data explorer format
    """
    
    # Use the newer OECD Data Explorer API
    # Investment ratio from National Accounts at a Glance
    
    results = {}
    
    for country in G7_COUNTRIES:
        # SNA Table 1 with specific structure
        url = f"https://sdmx.oecd.org/public/rest/data/OECD.SDD.NAD,DSD_NAMAIN1@DF_TABLE1_EXPENDITURE,1.0/{country}.A.P51._T.V.N"
        
        params = {
            "startPeriod": "2000",
            "format": "csv"
        }
        
        response = requests.get(url, params=params)
        
        if response.status_code == 200:
            from io import StringIO
            df = pd.read_csv(StringIO(response.text))
            if not df.empty:
                results[country] = df
                print(f"✓ Got data for {country}")
        else:
            print(f"✗ No data for {country}: {response.status_code}")
    
    if results:
        return pd.concat(results.values(), ignore_index=True)
    
    return None


def process_data(df):
    """Process and pivot the data for analysis"""
    
    print("\nRaw data sample:")
    print(df.head())
    print(f"\nColumns: {list(df.columns)}")
    
    # Typical OECD CSV columns include REF_AREA, TIME_PERIOD, OBS_VALUE
    if 'REF_AREA' in df.columns and 'TIME_PERIOD' in df.columns:
        # Pivot to get countries as columns, years as rows
        pivot = df.pivot_table(
            index='TIME_PERIOD',
            columns='REF_AREA',
            values='OBS_VALUE'
        )
        return pivot
    
    return df


def calculate_investment_ratio(gfcf_df, gdp_df):
    """Calculate GFCF/GDP ratio if we have separate series"""
    
    ratio = (gfcf_df / gdp_df) * 100
    return ratio


# Alternative: Direct World Bank API (more reliable for this metric)
def fetch_from_world_bank():
    """
    Fallback to World Bank API which has reliable investment/GDP data
    Indicator: NE.GDI.FTOT.ZS (Gross fixed capital formation % of GDP)
    """
    
    print("\n" + "="*60)
    print("Fetching from World Bank API (more reliable for this metric)")
    print("="*60)
    
    # World Bank country codes differ slightly
    wb_countries = {
        "USA": "US", "GBR": "GB", "DEU": "DE", 
        "FRA": "FR", "ITA": "IT", "JPN": "JP", "CAN": "CA"
    }
    
    country_string = ";".join(wb_countries.values())
    indicator = "NE.GDI.FTOT.ZS"  # Gross fixed capital formation (% of GDP)
    
    url = f"https://api.worldbank.org/v2/country/{country_string}/indicator/{indicator}"
    
    params = {
        "format": "json",
        "date": "2000:2024",
        "per_page": 500
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        
        if len(data) > 1 and data[1]:
            records = []
            for item in data[1]:
                if item['value'] is not None:
                    records.append({
                        'country': item['country']['value'],
                        'country_code': item['countryiso3code'],
                        'year': int(item['date']),
                        'investment_pct_gdp': item['value']
                    })
            
            df = pd.DataFrame(records)
            
            # Pivot for easy viewing
            pivot = df.pivot_table(
                index='year',
                columns='country',
                values='investment_pct_gdp'
            ).sort_index()
            
            return pivot
    
    return None


if __name__ == "__main__":
    
    # Try OECD first
    print("Attempting to fetch from OECD API...")
    print("="*60)
    
    oecd_data = fetch_investment_share()
    
    if oecd_data is not None and not oecd_data.empty:
        print("\n" + "="*60)
        print("OECD DATA: Investment as % of GDP (G7 Countries)")
        print("="*60)
        print(oecd_data)
    
    # World Bank is often more reliable for this specific indicator
    wb_data = fetch_from_world_bank()
    
    if wb_data is not None:
        print("\n" + "="*60)
        print("WORLD BANK DATA: Gross Fixed Capital Formation (% of GDP)")
        print("="*60)
        print(wb_data.round(1).to_string())
        
        # Save to CSV
        wb_data.to_csv("g7_investment_share_gdp.csv")
        print("\n✓ Data saved to 'g7_investment_share_gdp.csv'")
        
        # Basic stats
        print("\n" + "="*60)
        print("SUMMARY STATISTICS (2020-2023 average)")
        print("="*60)
        recent = wb_data.loc[2020:2023].mean().sort_values(ascending=False)
        for country, value in recent.items():
            print(f"{country}: {value:.1f}%")

Attempting to fetch from OECD API...
Fetching data from: https://sdmx.oecd.org/public/rest/data/SNA_TABLE1/USA+GBR+DEU+FRA+ITA+JPN+CAN.P51+B1_GE.C
Error: 404
Trying alternative dataset (QNA - Quarterly National Accounts)...
Trying QNA dataset: https://sdmx.oecd.org/public/rest/data/QNA/USA+GBR+DEU+FRA+ITA+JPN+CAN.P51+B1_GE.VOBARSA.A
QNA also failed: 404
Trying KEI (Key Economic Indicators)...
Trying Economic Outlook: https://sdmx.oecd.org/public/rest/data/EO/USA+GBR+DEU+FRA+ITA+JPN+CAN.ITISKV
Trying simpler OECD.Stat approach...
✗ No data for USA: 404
✗ No data for GBR: 404
✗ No data for DEU: 404
✗ No data for FRA: 404
✗ No data for ITA: 404
✗ No data for JPN: 404
✗ No data for CAN: 404

Fetching from World Bank API (more reliable for this metric)

WORLD BANK DATA: Gross Fixed Capital Formation (% of GDP)
country  Canada  France  Germany  Italy  Japan  United Kingdom  United States
year                                                                         
2000       19.6    20.9    

In [None]:
import requests
import pandas as pd
from io import StringIO

headers = {"Accept": "application/vnd.sdmx.data+csv"}

# ---------- Fetch ----------
url_gfcf_gdp = (
    "https://sdmx.oecd.org/public/rest/data/"
    "OECD.SDD.NAD,DSD_NAAG@DF_NAAG_III,/A..P51G.PT_B1GQ."
)
url_biz_gfcf = (
    "https://sdmx.oecd.org/public/rest/data/"
    "OECD.SDD.NAD,DSD_NAAG@DF_NAAG_III,/A..P51GS1K.."
)

r1 = requests.get(url_gfcf_gdp, headers=headers)
r1.raise_for_status()
df_raw_gfcf = pd.read_csv(StringIO(r1.text))

r2 = requests.get(url_biz_gfcf, headers=headers)
r2.raise_for_status()
df_raw_biz = pd.read_csv(StringIO(r2.text))

# ---------- Clean ----------
keep_cols = ["REF_AREA", "TIME_PERIOD", "OBS_VALUE"]

df_gfcf_pct_gdp = (
    df_raw_gfcf[keep_cols]
    .rename(columns={"OBS_VALUE": "gfcf_pct_gdp"})
    .copy()
)

df_biz_pct_gfcf = (
    df_raw_biz[keep_cols]
    .rename(columns={"OBS_VALUE": "biz_pct_gfcf"})
    .copy()
)

# ---------- Compute business investment as % of GDP ----------
df_merged = df_gfcf_pct_gdp.merge(
    df_biz_pct_gfcf,
    on=["REF_AREA", "TIME_PERIOD"],
    how="inner",
)

df_merged["biz_inv_pct_gdp"] = (
    df_merged["gfcf_pct_gdp"] * df_merged["biz_pct_gfcf"] / 100
)

# Final dataframes
df_gfcf_pct_gdp = df_gfcf_pct_gdp.copy()  # GFCF as % of GDP
df_biz_pct_gdp = (
    df_merged[["REF_AREA", "TIME_PERIOD", "biz_inv_pct_gdp"]]
    .rename(columns={"biz_inv_pct_gdp": "biz_pct_gdp"})
    .copy()
)

# ---------- Verify ----------
print("=== GFCF as % of GDP ===")
print(df_gfcf_pct_gdp.head(10))
print(f"Shape: {df_gfcf_pct_gdp.shape}")
print(f"Countries: {df_gfcf_pct_gdp['REF_AREA'].nunique()}")
print(f"Years: {df_gfcf_pct_gdp['TIME_PERIOD'].min()} - {df_gfcf_pct_gdp['TIME_PERIOD'].max()}")

print("\n=== Business investment as % of GDP ===")
print(df_biz_pct_gdp.head(10))
print(f"Shape: {df_biz_pct_gdp.shape}")
print(f"Countries: {df_biz_pct_gdp['REF_AREA'].nunique()}")
print(f"Years: {df_biz_pct_gdp['TIME_PERIOD'].min()} - {df_biz_pct_gdp['TIME_PERIOD'].max()}")

Fetching GFCF as % of GDP...
Fetching business investment as % of GFCF...

=== GFCF as % of GDP ===
['DATAFLOW', 'FREQ', 'REF_AREA', 'MEASURE', 'UNIT_MEASURE', 'CHAPTER', 'TIME_PERIOD', 'OBS_VALUE', 'ADJUSTMENT', 'COUNTERPART_AREA', 'SECTOR', 'COUNTERPART_SECTOR', 'CONSOLIDATION', 'ACCOUNTING_ENTRY', 'TRANSACTION', 'INSTR_ASSET', 'MATURITY', 'PRODUCT', 'PENSION_FUNDTYPE', 'CURRENCY_DENOM', 'VALUATION', 'PRICE_BASE', 'TRANSFORMATION', 'TABLE_IDENTIFIER', 'REF_YEAR_PRICE', 'BASE_PER', 'CONF_STATUS', 'DECIMALS', 'OBS_STATUS', 'UNIT_MULT', 'CURRENCY']
                                 DATAFLOW FREQ REF_AREA MEASURE UNIT_MEASURE  \
0  OECD.SDD.NAD:DSD_NAAG@DF_NAAG_III(1.0)    A      AUS    P51G      PT_B1GQ   
1  OECD.SDD.NAD:DSD_NAAG@DF_NAAG_III(1.0)    A      CAN    P51G      PT_B1GQ   
2  OECD.SDD.NAD:DSD_NAAG@DF_NAAG_III(1.0)    A      DNK    P51G      PT_B1GQ   
3  OECD.SDD.NAD:DSD_NAAG@DF_NAAG_III(1.0)    A      FRA    P51G      PT_B1GQ   
4  OECD.SDD.NAD:DSD_NAAG@DF_NAAG_III(1.0)    A

In [22]:
import requests
import pandas as pd
from io import StringIO

headers = {"Accept": "application/vnd.sdmx.data+csv"}

# ---------- Fetch ----------
url_gfcf_gdp = (
    "https://sdmx.oecd.org/public/rest/data/"
    "OECD.SDD.NAD,DSD_NAAG@DF_NAAG_III,/A..P51G.PT_B1GQ."
)
url_biz_gfcf = (
    "https://sdmx.oecd.org/public/rest/data/"
    "OECD.SDD.NAD,DSD_NAAG@DF_NAAG_III,/A..P51GS1K.."
)

r1 = requests.get(url_gfcf_gdp, headers=headers)
r1.raise_for_status()
df_raw_gfcf = pd.read_csv(StringIO(r1.text))

r2 = requests.get(url_biz_gfcf, headers=headers)
r2.raise_for_status()
df_raw_biz = pd.read_csv(StringIO(r2.text))

# ---------- Clean ----------
keep_cols = ["REF_AREA", "TIME_PERIOD", "OBS_VALUE"]

df_gfcf_pct_gdp = (
    df_raw_gfcf[keep_cols]
    .rename(columns={"OBS_VALUE": "gfcf_pct_gdp"})
    .copy()
)

df_biz_pct_gfcf = (
    df_raw_biz[keep_cols]
    .rename(columns={"OBS_VALUE": "biz_pct_gfcf"})
    .copy()
)

# ---------- Compute business investment as % of GDP ----------
df_merged = df_gfcf_pct_gdp.merge(
    df_biz_pct_gfcf,
    on=["REF_AREA", "TIME_PERIOD"],
    how="inner",
)

df_merged["biz_inv_pct_gdp"] = (
    df_merged["gfcf_pct_gdp"] * df_merged["biz_pct_gfcf"] / 100
)

# Final dataframes
df_gfcf_pct_gdp = df_gfcf_pct_gdp.copy()  # GFCF as % of GDP
df_biz_pct_gdp = (
    df_merged[["REF_AREA", "TIME_PERIOD", "biz_inv_pct_gdp"]]
    .rename(columns={"biz_inv_pct_gdp": "biz_pct_gdp"})
    .copy()
)

# ---------- Verify ----------
print("=== GFCF as % of GDP ===")
print(df_gfcf_pct_gdp.head(10))
print(f"Shape: {df_gfcf_pct_gdp.shape}")
print(f"Countries: {df_gfcf_pct_gdp['REF_AREA'].nunique()}")
print(f"Years: {df_gfcf_pct_gdp['TIME_PERIOD'].min()} - {df_gfcf_pct_gdp['TIME_PERIOD'].max()}")

print("\n=== Business investment as % of GDP ===")
print(df_biz_pct_gdp.head(10))
print(f"Shape: {df_biz_pct_gdp.shape}")
print(f"Countries: {df_biz_pct_gdp['REF_AREA'].nunique()}")
print(f"Years: {df_biz_pct_gdp['TIME_PERIOD'].min()} - {df_biz_pct_gdp['TIME_PERIOD'].max()}")

=== GFCF as % of GDP ===
  REF_AREA  TIME_PERIOD  gfcf_pct_gdp
0      AUS         1970     30.597052
1      CAN         1970     21.626786
2      DNK         1970     25.232500
3      FRA         1970     26.107233
4      KOR         1970     25.622032
5      NOR         1970     27.986453
6      CHN         1970     24.100026
7      AUS         1971     30.027122
8      CAN         1971     22.487582
9      DNK         1971     24.684925
Shape: (2223, 3)
Countries: 47
Years: 1970 - 2025

=== Business investment as % of GDP ===
  REF_AREA  TIME_PERIOD  biz_pct_gdp
0      AUS         1970    14.483577
1      FRA         1970    11.562925
2      AUS         1971    14.056441
3      FRA         1971    11.684433
4      AUS         1972    12.220396
5      FRA         1972    11.765184
6      USA         1972     9.213907
7      AUS         1973    11.572504
8      FRA         1973    11.832723
9      USA         1973     9.689233
Shape: (1281, 3)
Countries: 41
Years: 1970 - 2024


In [24]:
import pandas as pd

# Assuming df_gfcf_pct_gdp is already built from the previous script
# with columns: REF_AREA, TIME_PERIOD, gfcf_pct_gdp

# If loading from CSV:
# df_gfcf_pct_gdp = pd.read_csv("gfcf_pct_gdp.csv")

# G7 country codes
g7_codes = ["USA", "GBR", "DEU", "FRA", "JPN", "CAN", "ITA"]
g7_names = {
    "USA": "United States",
    "GBR": "United Kingdom",
    "DEU": "Germany",
    "FRA": "France",
    "JPN": "Japan",
    "CAN": "Canada",
    "ITA": "Italy",
}

# --- G7 rows ---
df_g7 = (
    df_gfcf_pct_gdp[df_gfcf_pct_gdp["REF_AREA"].isin(g7_codes)]
    .copy()
)
df_g7["country"] = df_g7["REF_AREA"].map(g7_names)
df_g7["value"] = df_g7["gfcf_pct_gdp"] / 100  # convert to proportion for % axis format
df_g7["p10"] = None
df_g7["p90"] = None
df_g7 = df_g7[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- OECD p10/p90 by year ---
df_band = (
    df_gfcf_pct_gdp
    .groupby("TIME_PERIOD")["gfcf_pct_gdp"]
    .quantile([0.1, 0.9])
    .unstack()
    .reset_index()
)
df_band.columns = ["TIME_PERIOD", "p10", "p90"]
df_band["p10"] = df_band["p10"] / 100
df_band["p90"] = df_band["p90"] / 100
df_band["country"] = "OECD"
df_band["value"] = None
df_band = df_band[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- Combine and save ---
df_out = pd.concat([df_g7, df_band], ignore_index=True)
df_out = df_out.rename(columns={"TIME_PERIOD": "year"})
df_out = df_out.sort_values(["country", "year"]).reset_index(drop=True)

print(df_out.head(20))
print(f"\nShape: {df_out.shape}")
print(f"Countries: {df_out['country'].unique()}")

df_out.to_csv("gfcf_pct_gdp_chart.csv", index=False)
print("\nSaved to gfcf_pct_gdp_chart.csv")

    year country     value  p10  p90
0   1970  Canada  0.216268  NaN  NaN
1   1971  Canada  0.224876  NaN  NaN
2   1972  Canada  0.221941  NaN  NaN
3   1973  Canada  0.227066  NaN  NaN
4   1974  Canada  0.235168  NaN  NaN
5   1975  Canada  0.242761  NaN  NaN
6   1976  Canada  0.237885  NaN  NaN
7   1977  Canada  0.232798  NaN  NaN
8   1978  Canada  0.229448  NaN  NaN
9   1979  Canada  0.233060  NaN  NaN
10  1980  Canada  0.238622  NaN  NaN
11  1981  Canada  0.250197  NaN  NaN
12  1982  Canada  0.225686  NaN  NaN
13  1983  Canada  0.210234  NaN  NaN
14  1984  Canada  0.201597  NaN  NaN
15  1985  Canada  0.207065  NaN  NaN
16  1986  Canada  0.210366  NaN  NaN
17  1987  Canada  0.218600  NaN  NaN
18  1988  Canada  0.226287  NaN  NaN
19  1989  Canada  0.229614  NaN  NaN

Shape: (443, 5)
Countries: ['Canada' 'France' 'Germany' 'Italy' 'Japan' 'OECD' 'United Kingdom'
 'United States']

Saved to gfcf_pct_gdp_chart.csv


  df_out = pd.concat([df_g7, df_band], ignore_index=True)


In [25]:
# repeat for biz_pct_gdp
# Assuming df_biz_pct_gdp is already built from the previous script
# with columns: REF_AREA, TIME_PERIOD, biz_pct_gdp  
df_g7_biz = (
    df_biz_pct_gdp[df_biz_pct_gdp["REF_AREA"].isin(g7_codes)]
    .copy()
)
df_g7_biz["country"] = df_g7_biz["REF_AREA"].map(g7_names)
df_g7_biz["value"] = df_g7_biz["biz_pct_gdp"] / 100  # convert to proportion for % axis format
df_g7_biz["p10"] = None
df_g7_biz["p90"] = None
df_g7_biz = df_g7_biz[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- OECD p10/p90 by year ---
df_band_biz = (
    df_biz_pct_gdp
    .groupby("TIME_PERIOD")["biz_pct_gdp"]
    .quantile([0.1, 0.9])
    .unstack()
    .reset_index()
)
df_band_biz.columns = ["TIME_PERIOD", "p10", "p90"]
df_band_biz["p10"] = df_band_biz["p10"] / 100
df_band_biz["p90"] = df_band_biz["p90"] / 100
df_band_biz["country"] = "OECD"
df_band_biz["value"] = None
df_band_biz = df_band_biz[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- Combine and save ---
df_out_biz = pd.concat([df_g7_biz, df_band_biz], ignore_index=True)
df_out_biz = df_out_biz.rename(columns={"TIME_PERIOD": "year"})
df_out_biz = df_out_biz.sort_values(["country", "year"]).reset_index(drop=True)

print(df_out_biz.head(20))
print(f"\nShape: {df_out_biz.shape}")
print(f"Countries: {df_out_biz['country'].unique()}")

df_out_biz.to_csv("biz_pct_gdp_chart.csv", index=False)
print("\nSaved to biz_pct_gdp_chart.csv")


    year country     value  p10  p90
0   1981  Canada  0.139077  NaN  NaN
1   1982  Canada  0.125668  NaN  NaN
2   1983  Canada  0.105804  NaN  NaN
3   1984  Canada  0.098932  NaN  NaN
4   1985  Canada  0.103892  NaN  NaN
5   1986  Canada  0.102848  NaN  NaN
6   1987  Canada  0.105553  NaN  NaN
7   1988  Canada  0.110685  NaN  NaN
8   1989  Canada  0.110338  NaN  NaN
9   1990  Canada  0.104003  NaN  NaN
10  1991  Canada  0.098176  NaN  NaN
11  1992  Canada  0.089180  NaN  NaN
12  1993  Canada  0.087399  NaN  NaN
13  1994  Canada  0.091643  NaN  NaN
14  1995  Canada  0.091728  NaN  NaN
15  1996  Canada  0.097099  NaN  NaN
16  1997  Canada  0.110021  NaN  NaN
17  1998  Canada  0.116593  NaN  NaN
18  1999  Canada  0.114580  NaN  NaN
19  2000  Canada  0.109054  NaN  NaN

Shape: (328, 5)
Countries: ['Canada' 'France' 'Germany' 'Italy' 'Japan' 'OECD' 'United Kingdom'
 'United States']

Saved to biz_pct_gdp_chart.csv


  df_out_biz = pd.concat([df_g7_biz, df_band_biz], ignore_index=True)


In [23]:
# save both to csv
df_gfcf_pct_gdp.to_csv("gfcf_pct_gdp.csv", index=False)
df_biz_pct_gdp.to_csv("biz_pct_gdp.csv", index=False)

In [19]:
investment_g7 = pd.read_csv('g7_investment_share_gdp.csv')

# convert to datetime
investment_g7['year'] = pd.to_datetime(investment_g7['year'], format='%Y')

# melt
investment_g7 = investment_g7.melt(id_vars=['year'], var_name='country', value_name='value')

investment_g7['value'] = investment_g7['value'] / 100  # convert to decimal 

investment_g7.to_csv('g7_investment_share_gdp.csv', index=False)


ValueError: unconverted data remains when parsing with format "%Y": "-01-01", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [24]:
"""
OECD API: Labor Productivity Time Series for G7 Countries
GDP per hour worked (USD, constant prices, PPP)
Output in long form (tidy data) with datetime year column
"""

import requests
import pandas as pd
from io import StringIO


# G7 country codes
G7_COUNTRIES = ["USA", "GBR", "DEU", "FRA", "ITA", "JPN", "CAN"]

G7_NAMES = {
    "USA": "United States",
    "GBR": "United Kingdom", 
    "DEU": "Germany",
    "FRA": "France",
    "ITA": "Italy",
    "JPN": "Japan",
    "CAN": "Canada"
}


def fetch_oecd_productivity():
    """
    Fetch labor productivity from OECD Productivity Statistics
    GDP per hour worked - the standard productivity measure
    """
    
    print("="*60)
    print("Fetching OECD Labor Productivity Data")
    print("="*60)
    
    countries = "+".join(G7_COUNTRIES)
    
    url = f"https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_PDB@DF_PDB_LV,1.0/{countries}.T_GDPHRS.USD_PPP_PS"
    
    params = {
        "startPeriod": "1990",
        "format": "csv"
    }
    
    print(f"Trying: {url[:80]}...")
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        df = pd.read_csv(StringIO(response.text))
        if not df.empty:
            print("✓ Got data from OECD Productivity database")
            return process_oecd_data(df)
    
    print(f"Status: {response.status_code}, trying alternative...")
    return None


def process_oecd_data(df):
    """Process OECD CSV response into long form"""
    
    ref_col = None
    time_col = None
    value_col = None
    
    for col in df.columns:
        if 'REF_AREA' in col.upper():
            ref_col = col
        elif 'TIME' in col.upper() and 'PERIOD' in col.upper():
            time_col = col
        elif 'OBS_VALUE' in col.upper():
            value_col = col
    
    if all([ref_col, time_col, value_col]):
        df_long = df[[ref_col, time_col, value_col]].copy()
        df_long.columns = ['country_code', 'year', 'gdp_per_worker']
        
        # Convert year to datetime (January 1st of each year)
        df_long['year'] = pd.to_datetime(df_long['year'], format='%Y')
        
        df_long['gdp_per_worker'] = pd.to_numeric(df_long['gdp_per_worker'], errors='coerce')
        df_long['country'] = df_long['country_code'].map(G7_NAMES)
        
        df_long = df_long[['country', 'country_code', 'year', 'gdp_per_worker']]
        df_long = df_long.sort_values(['country', 'year']).reset_index(drop=True)
        
        return df_long
    
    return df


def fetch_world_bank_productivity():
    """
    World Bank: GDP per person employed
    Returns long form data with datetime year
    """
    
    print("\n" + "="*60)
    print("Fetching World Bank Labor Productivity Data")
    print("(GDP per person employed, constant 2017 PPP $)")
    print("="*60)
    
    wb_countries = {
        "USA": "US", "GBR": "GB", "DEU": "DE",
        "FRA": "FR", "ITA": "IT", "JPN": "JP", "CAN": "CA"
    }
    
    country_string = ";".join(wb_countries.values())
    indicator = "SL.GDP.PCAP.EM.KD"
    
    url = f"https://api.worldbank.org/v2/country/{country_string}/indicator/{indicator}"
    
    params = {
        "format": "json",
        "date": "1990:2024",
        "per_page": 1000
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        
        if len(data) > 1 and data[1]:
            records = []
            for item in data[1]:
                if item['value'] is not None:
                    records.append({
                        'country': item['country']['value'],
                        'country_code': item['countryiso3code'],
                        'year': item['date'],
                        'gdp_per_worker': item['value']
                    })
            
            df_long = pd.DataFrame(records)
            
            # Convert year to datetime (January 1st of each year)
            df_long['year'] = pd.to_datetime(df_long['year'], format='%Y')
            
            df_long = df_long.sort_values(['country', 'year']).reset_index(drop=True)
            
            print(f"✓ Retrieved {len(df_long)} observations")
            return df_long
    
    print(f"World Bank request failed: {response.status_code}")
    return None


def add_derived_columns(df):
    """Add growth rates and indexed values to long form data"""
    
    df = df.copy()
    df = df.sort_values(['country', 'year']).reset_index(drop=True)
    
    # Year-over-year growth rate
    df['growth_rate'] = df.groupby('country')['gdp_per_worker'].pct_change() * 100
    
    # Index to 2000 = 100
    def index_to_base(group, base_year=2000):
        base_date = pd.Timestamp(f'{base_year}-01-01')
        if base_date in group['year'].values:
            base_value = group.loc[group['year'] == base_date, 'gdp_per_worker'].values[0]
        else:
            base_value = group['gdp_per_worker'].iloc[0]
        return (group['gdp_per_worker'] / base_value) * 100
    
    df['indexed_2000'] = df.groupby('country', group_keys=False).apply(index_to_base)
    
    return df


if __name__ == "__main__":
    
    # Try OECD first
    oecd_data = fetch_oecd_productivity()
    
    if oecd_data is not None and not oecd_data.empty:
        print("\n" + "="*60)
        print("OECD DATA (Long Form)")
        print("="*60)
        print(oecd_data.head(20).to_string(index=False))
        print(f"\nYear dtype: {oecd_data['year'].dtype}")
    
    # World Bank
    wb_data = fetch_world_bank_productivity()
    
    if wb_data is not None:
        wb_data = add_derived_columns(wb_data)
        
        print("\n" + "="*60)
        print("WORLD BANK DATA (Long Form)")
        print("="*60)
        print(f"\nShape: {wb_data.shape}")
        print(f"Columns: {list(wb_data.columns)}")
        print(f"Year dtype: {wb_data['year'].dtype}")
        
        print(f"\nSample (recent years):")
        recent = wb_data[wb_data['year'] >= '2018'].copy()
        print(recent.round(2).to_string(index=False))
        
        # Save
        wb_data.to_csv("g7_labor_productivity_long.csv", index=False)
        print("\n✓ Saved to 'g7_labor_productivity_long.csv'")
        
        # Demo datetime filtering
        print("\n" + "="*60)
        print("DATETIME FILTERING EXAMPLES")
        print("="*60)
        
        # Filter by date range
        mask = (wb_data['year'] >= '2010-01-01') & (wb_data['year'] < '2015-01-01')
        print(f"\n2010-2014 data points: {mask.sum()}")
        
        # Extract year component
        print(f"\nYear range: {wb_data['year'].dt.year.min()} - {wb_data['year'].dt.year.max()}")

Fetching OECD Labor Productivity Data
Trying: https://sdmx.oecd.org/public/rest/data/OECD.SDD.TPS,DSD_PDB@DF_PDB_LV,1.0/USA+GB...
Status: 422, trying alternative...

Fetching World Bank Labor Productivity Data
(GDP per person employed, constant 2017 PPP $)
✓ Retrieved 238 observations

WORLD BANK DATA (Long Form)

Shape: (238, 6)
Columns: ['country', 'country_code', 'year', 'gdp_per_worker', 'growth_rate', 'indexed_2000']
Year dtype: datetime64[ns]

Sample (recent years):
       country country_code       year  gdp_per_worker  growth_rate  indexed_2000
        Canada          CAN 2018-01-01       110704.70         0.92        112.42
        Canada          CAN 2019-01-01       110430.51        -0.25        112.14
        Canada          CAN 2020-01-01       111042.87         0.55        112.76
        Canada          CAN 2021-01-01       112202.17         1.04        113.94
        Canada          CAN 2022-01-01       111740.98        -0.41        113.47
        Canada          CAN 202

  df['indexed_2000'] = df.groupby('country', group_keys=False).apply(index_to_base)


In [54]:
# Fetch net exports (fiscal balance) as % of GDP from OECD Economic Outlook
url_netx = "https://sdmx.oecd.org/public/rest/data/OECD.ECO.MAD,DSD_EO@DF_EO,/.FBGSQ.A"

r_netx = requests.get(url_netx, headers=headers)
r_netx.raise_for_status()
df_raw_netx = pd.read_csv(StringIO(r_netx.text))

# Clean to keep relevant columns
df_netx = (
    df_raw_netx[["REF_AREA", "TIME_PERIOD", "OBS_VALUE"]]
    .rename(columns={"OBS_VALUE": "net_exports_pct_gdp"})
    .copy()
)

print(f"Shape: {df_netx.shape}")
print(f"Countries: {df_netx['REF_AREA'].nunique()}")
print(f"Years: {df_netx['TIME_PERIOD'].min()} - {df_netx['TIME_PERIOD'].max()}")

# --- G7 rows ---
df_g7_netx = (
    df_netx[df_netx["REF_AREA"].isin(g7_codes)]
    .copy()
)
df_g7_netx["country"] = df_g7_netx["REF_AREA"].map(g7_names)
df_g7_netx["value"] = df_g7_netx["net_exports_pct_gdp"] / 100  # convert to proportion
df_g7_netx["p10"] = None
df_g7_netx["p90"] = None
df_g7_netx = df_g7_netx[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- OECD p10/p90 by year ---
df_band_netx = (
    df_netx
    .groupby("TIME_PERIOD")["net_exports_pct_gdp"]
    .quantile([0.1, 0.9])
    .unstack()
    .reset_index()
)
df_band_netx.columns = ["TIME_PERIOD", "p10", "p90"]
df_band_netx["p10"] = df_band_netx["p10"] / 100
df_band_netx["p90"] = df_band_netx["p90"] / 100
df_band_netx["country"] = "OECD"
df_band_netx["value"] = None
df_band_netx = df_band_netx[["TIME_PERIOD", "country", "value", "p10", "p90"]]

# --- Combine and save ---
df_out_netx = pd.concat([df_g7_netx, df_band_netx], ignore_index=True)
df_out_netx = df_out_netx.rename(columns={"TIME_PERIOD": "year"})
df_out_netx = df_out_netx.sort_values(["country", "year"]).reset_index(drop=True)

print(df_out_netx.head(20))
print(f"\nShape: {df_out_netx.shape}")
print(f"Countries: {df_out_netx['country'].unique()}")

df_out_netx.to_csv("net_exports_pct_gdp_chart.csv", index=False)
print("\nSaved to net_exports_pct_gdp_chart.csv")

Shape: (2484, 3)
Countries: 47
Years: 1960 - 2027
    year country     value  p10  p90
0   1961  Canada -0.006377  NaN  NaN
1   1962  Canada -0.003629  NaN  NaN
2   1963  Canada  0.005263  NaN  NaN
3   1964  Canada  0.009579  NaN  NaN
4   1965  Canada -0.003073  NaN  NaN
5   1966  Canada -0.001814  NaN  NaN
6   1967  Canada  0.007464  NaN  NaN
7   1968  Canada  0.009086  NaN  NaN
8   1969  Canada -0.001955  NaN  NaN
9   1970  Canada  0.021728  NaN  NaN
10  1971  Canada  0.012905  NaN  NaN
11  1972  Canada  0.005726  NaN  NaN
12  1973  Canada  0.010800  NaN  NaN
13  1974  Canada -0.000967  NaN  NaN
14  1975  Canada -0.019306  NaN  NaN
15  1976  Canada -0.008928  NaN  NaN
16  1977  Canada -0.003515  NaN  NaN
17  1978  Canada  0.002158  NaN  NaN
18  1979  Canada  0.004162  NaN  NaN
19  1980  Canada  0.016891  NaN  NaN

Shape: (535, 5)
Countries: ['Canada' 'France' 'Germany' 'Italy' 'Japan' 'OECD' 'United Kingdom'
 'United States']

Saved to net_exports_pct_gdp_chart.csv


  df_out_netx = pd.concat([df_g7_netx, df_band_netx], ignore_index=True)


In [None]:
# Fetch exports and imports as % of GDP from World Bank API for UK only
wb_countries = "GB"

indicators = {
    "NE.EXP.GNFS.ZS": "Exports (% of GDP)",
    "NE.IMP.GNFS.ZS": "Imports (% of GDP)",
}

records = []

for indicator_code, indicator_name in indicators.items():
    url = f"https://api.worldbank.org/v2/country/{wb_countries}/indicator/{indicator_code}"
    params = {
        "format": "json",
        "date": "1960:2024",
        "per_page": 5000,
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    data = resp.json()

    if len(data) > 1 and data[1]:
        for item in data[1]:
            if item["value"] is not None:
                records.append({
                    "year": int(item["date"]),
                    "indicator": indicator_name,
                    "value": item["value"] / 100,
                })

df_trade = pd.DataFrame(records)
df_trade = df_trade.sort_values(["indicator", "year"]).reset_index(drop=True)

print(df_trade.head(20))
print(f"\nShape: {df_trade.shape}")
print(f"Indicators: {df_trade['indicator'].unique()}")
print(f"Years: {df_trade['year'].min()} - {df_trade['year'].max()}")

df_trade.to_csv("uk_trade_pct_gdp.csv", index=False)
print("\n✓ Saved to uk_trade_pct_gdp.csv")


In [55]:
awe = pd.read_excel('real_avg_weekly_earnings.xlsx', sheet_name='AWE Real_CPI', skiprows=7, nrows=311)

# keep only the first two cols
awe = awe.iloc[:, :2]

awe.columns = ['date', 'real_awe']

awe['date'] = pd.to_datetime(awe['date'], errors='coerce')

awe.to_csv('real_avg_weekly_earnings.csv', index=False)

In [20]:
wae = pd.read_csv('uk_wa_emp.csv')

# in the 'Title' columns, change to datetime from eg 2025 JAN to 2025-01-01. If it's not in this form, delete the row
wae['Title'] = pd.to_datetime(wae['Title'], format='%Y %b', errors='coerce')
wae = wae.dropna(subset=['Title'])

wae.columns = ['date', 'wa_emp']

wae = wae[wae['date'] >= '2000-01-01']

# divide 'wa_emp' by 100 to get percentage
wae['wa_emp'] = wae['wa_emp'].astype(float) / 100

# merge with awe on date
df = wae.merge(awe, on='date', how='left')

df.columns = ['Date', 'Work Age Employment (%)', 'Real Average Weekly Earnings (£)']

df.melt(id_vars=['Date'], var_name='Series', value_name='Value').to_csv('emp_wages.csv', index=False)

In [21]:
df

Unnamed: 0,Date,Work Age Employment (%),Real Average Weekly Earnings (£)
0,2000-01-01,0.722,423.952736
1,2000-02-01,0.723,414.273648
2,2000-03-01,0.723,429.219800
3,2000-04-01,0.724,426.466376
4,2000-05-01,0.725,430.207379
...,...,...,...
305,2025-06-01,0.752,525.452165
306,2025-07-01,0.751,525.085583
307,2025-08-01,0.750,528.014767
308,2025-09-01,0.749,526.725034


In [23]:
mkt = pd.read_csv('market_dyn.csv', skiprows=6)

mkt.dropna(subset=['Year'], inplace=True)

mkt

Unnamed: 0,Year,Entering businesses,Incumbent growing,Incumbent shrinking,Exiting businesses,Net job creation rate
0,2001.0,4.67,11.56,-9.72,-4.75,1.76
2,2002.0,3.08,9.93,-8.89,-3.84,0.28
4,2003.0,5.92,7.89,-7.66,-4.41,1.74
6,2004.0,4.18,10.05,-8.65,-4.02,1.56
8,2005.0,4.01,8.95,-7.28,-5.01,0.67
10,2006.0,3.76,7.91,-7.9,-3.69,0.08
12,2007.0,3.95,7.89,-7.16,-3.21,1.47
14,2008.0,3.37,7.38,-7.17,-3.58,0.0
16,2009.0,2.9,7.37,-6.81,-3.54,-0.08
18,2010.0,3.52,6.61,-8.07,-3.67,-1.61


In [25]:
pip install fredapi

Collecting fredapi
  Using cached fredapi-0.5.2-py3-none-any.whl.metadata (5.0 kB)
Using cached fredapi-0.5.2-py3-none-any.whl (11 kB)
Installing collected packages: fredapi
Successfully installed fredapi-0.5.2
Note: you may need to restart the kernel to use updated packages.


In [28]:
import os
from fredapi import Fred

# Initialize FRED API (requires API key from https://fred.stlouisfed.org/docs/api/fred/)
fred_api_key = '653ed88f4bdd6c12141a8c80288845c4'  # Set your API key as environment variable
fred = Fred(api_key=fred_api_key)

# Fetch UK Total Factor Productivity
tfp_series = fred.get_series('RTFPNAGBA632NRUG')

# Convert to DataFrame
tfp_df = pd.DataFrame({
    'date': tfp_series.index,
    'uk_tfp': tfp_series.values
}).reset_index(drop=True)

# Convert date to datetime
tfp_df['date'] = pd.to_datetime(tfp_df['date'])

print(tfp_df.head(10))
print(f"\nShape: {tfp_df.shape}")
print(f"Date range: {tfp_df['date'].min()} to {tfp_df['date'].max()}")

# Save to CSV
tfp_df.to_csv('uk_total_factor_productivity.csv', index=False)

        date    uk_tfp
0 1950-01-01       NaN
1 1951-01-01       NaN
2 1952-01-01       NaN
3 1953-01-01       NaN
4 1954-01-01  0.744318
5 1955-01-01  0.742429
6 1956-01-01  0.731308
7 1957-01-01  0.727300
8 1958-01-01  0.726811
9 1959-01-01  0.741846

Shape: (74, 2)
Date range: 1950-01-01 00:00:00 to 2023-01-01 00:00:00


In [27]:
tax = pd.read_excel("corporate_tax_rates.xlsx")

tax.columns

Index(['DateTime', 'Small profits rate', 'Main rate', 'Patent box rate'], dtype='object')

In [29]:
# melt from these cols: ['DateTime', 'Small profits rate', 'Main rate', 'Patent box rate']
tax_long = tax.melt(id_vars=['DateTime'], var_name='tax_type', value_name='tax_rate')

tax_long['DateTime'] = pd.to_datetime(tax_long['DateTime'], errors='coerce')

tax_long['tax_rate'] = pd.to_numeric(tax_long['tax_rate'], errors='coerce') / 100  # convert to decimal

tax_long.to_csv('corporate_tax_rates_long.csv', index=False)

In [37]:
life_sat = pd.read_excel("lifesat.xlsx", sheet_name="Figure 5 - Life satisfaction", skiprows=7)

# restrict to first two cols and last col
life_sat = life_sat.iloc[:, [0, 1, -1]]

life_sat.columns = ['country', 'id', 'life_satisfaction']

life_sat['life_satisfaction'] = pd.to_numeric(life_sat['life_satisfaction'], errors='coerce')

In [38]:
life_sat.to_csv('life_satisfaction.csv', index=False)


In [42]:
lifeex_mb = pd.read_excel("lifeex.xlsx", sheet_name="males@birth", skiprows=5)

lifeex_mb = lifeex_mb.iloc[:, [0, 1, -1]]

lifeex_mb.columns = ['country', 'id', 'life_expectancy']

lifeex_mb['life_expectancy'] = pd.to_numeric(lifeex_mb['life_expectancy'], errors='coerce')

lifeex_mb.to_csv('life_expectancy_males_birth.csv', index=False)

Unnamed: 0,country,id,life_expectancy
0,England,E92000001,79.5
1,United Kingdom,K02000001,79.2
2,Northern Ireland,N92000002,78.8
3,Scotland,S92000003,77.2
4,Wales,W92000004,78.3
...,...,...,...
380,Torfaen,W06000020,77.6
381,Monmouthshire,W06000021,80.3
382,Newport,W06000022,77.9
383,Powys,W06000023,80.3


In [43]:
lifeex_fb = pd.read_excel("lifeex.xlsx", sheet_name="females@birth", skiprows=5)

lifeex_fb = lifeex_fb.iloc[:, [0, 1, -1]]

lifeex_fb.columns = ['country', 'id', 'life_expectancy']

lifeex_fb['life_expectancy'] = pd.to_numeric(lifeex_fb['life_expectancy'], errors='coerce')

lifeex_fb.to_csv('life_expectancy_females_birth.csv', index=False)

In [53]:
epu_link = "https://www.policyuncertainty.com/media/UK_Policy_Uncertainty_Data.xlsx"

epu = pd.read_excel(epu_link)

# turn year column to numeric, and make dat col from year and month (1=jan, 2=feb, etc)
epu['year'] = pd.to_numeric(epu['year'], errors='coerce')
epu['month'] = pd.to_numeric(epu['month'], errors='coerce')

epu['date'] = pd.to_datetime(epu[['year', 'month']].assign(day=1))


epu.drop(columns=['year', 'month'], inplace=True)

epu.to_csv('uk_policy_uncertainty.csv', index=False)