In [2]:
import numpy as np
import pandas as pd
import yfinance as yf

### SnP sector indices 1981 - 2002

In [3]:
# Sector ETF tickers representing the 11 S&P 500 sectors
sector_tickers = [
    "XLF",  # Financials
    "XLK",  # Technology
    "XLV",  # Health Care
    "XLY",  # Consumer Discretionary
    "XLP",  # Consumer Staples
    "XLE",  # Energy
    "XLI",  # Industrials
    "XLU",  # Utilities
    "XLB",  # Materials
    # "XLRE",  # Real Estate
    # "XLC",  # Communication Services
]

indices = [
    "^GSPC",  # S&P 500
]

all_tickers = sector_tickers + indices

# Date range from the paper
start_date = "1999-01-01"
end_date = "2022-12-31"

freq = '1mo'
# freq = '1d'

# Download daily adjusted close prices for sector ETFs
prices = yf.download(
    all_tickers,
    start=start_date,
    end=end_date,
    interval=freq,
    auto_adjust=True,
    progress=True,
)["Close"]

# Calculate monthly returns
returns = prices.pct_change().dropna()

returns[sector_tickers].to_parquet(f'../data/snp_old/returns_{freq}.parquet')
prices[sector_tickers].to_parquet(f'../data/snp_old/prices_{freq}.parquet')
prices[indices].to_parquet(f'../data/snp_old/prices_sp500_{freq}.parquet')

[*********************100%***********************]  10 of 10 completed


### country indices

In [4]:
path = "../data/intl"

In [5]:
# Define tickers for major country ETFs or indices
tickers = {
    'Canada': 'EWC',        # iShares MSCI Canada ETF
    'France': 'EWQ',        # iShares MSCI France ETF
    'Germany': 'EWG',       # iShares MSCI Germany ETF
    'Italy': 'EWI',         # iShares MSCI Italy ETF
    'Japan': 'EWJ',         # iShares MSCI Japan ETF
    'Switzerland': 'EWL',   # iShares MSCI Switzerland ETF
    'UK': 'EWU',            # iShares MSCI United Kingdom ETF
    'US': 'SPY',            # SPDR S&P 500 ETF Trust
}

# freq = '1mo'
freq = '1d'

# Download monthly adjusted close prices
prices = yf.download(list(tickers.values()), start='2001-01-01', end='2021-12-31', interval=freq, auto_adjust=True)['Close']
# Rename columns to country names
prices.columns = tickers.keys()

# Check for NaN in data and print
# print(data.isna().sum())
# display(data[data.isna().any(axis=1)])
# data = data.dropna()

# Calculate monthly returns
returns = prices.pct_change().dropna()
print(returns.head())

returns.to_parquet(f'{path}/returns_{freq}.parquet')
prices.to_parquet(f'{path}/prices_{freq}.parquet')

[*********************100%***********************]  8 of 8 completed

              Canada    France   Germany     Italy     Japan  Switzerland  \
Date                                                                        
2001-01-03  0.019138  0.019169  0.000000  0.045454  0.036765    -0.015190   
2001-01-04  0.009389 -0.006269  0.000000 -0.048913 -0.028369     0.005141   
2001-01-05 -0.018605 -0.009464  0.000000 -0.005714 -0.014598     0.000000   
2001-01-08 -0.033175  0.000000 -0.002793  0.005747 -0.007408    -0.015345   
2001-01-09  0.014706 -0.009554  0.002801 -0.005714 -0.022388    -0.007793   

                  UK        US  
Date                            
2001-01-03  0.021053  0.048035  
2001-01-04 -0.027492 -0.010764  
2001-01-05  0.000000 -0.032643  
2001-01-08  0.003534  0.007740  
2001-01-09 -0.017606 -0.002641  





In [6]:
# get MSCI world for same timeframe and save separately
prices = yf.download(['^VIX', '^990100-USD-STRD'], start='2001-01-01', end='2021-12-31', interval=freq, auto_adjust=True)['Close']
# rename to MSCI World
prices.columns = ['VIX', 'MSCI World']
prices[['MSCI World']].to_parquet(f'{path}/prices_msci_{freq}.parquet')

[*********************100%***********************]  2 of 2 completed


In [39]:
# Calculate volatility metrics
msci_prices = prices['MSCI World'].dropna()
msci_returns = msci_prices.pct_change()  # simple returns

vol20 = msci_returns.rolling(20).std()
vol60 = msci_returns.rolling(60).std()
vol_ratio = vol20 / vol60

# Create df to hold vol metrics
vol_df = pd.DataFrame(
    {
        "vol20": vol20,
        "vol60": vol60,
        "vol_ratio": vol_ratio,
        "VIX": prices["VIX"],
    }
).dropna(how='any')

vol_df_std = vol_df.copy()
# Standardize the metrics using expanding lookback window to prevent look-ahead bias
for col in ['vol20', 'vol60', 'vol_ratio', 'VIX']:
    mean = vol_df[col].expanding().mean()
    std = vol_df[col].expanding().std()
    vol_df_std[col] = (vol_df[col] - mean) / std

# Drop the first row with NaN since there is no std yet
vol_df_std = vol_df_std.dropna(how='all')
vol_df = vol_df_std[['vol20', 'vol_ratio', 'VIX']]
vol_df.to_parquet(f'{path}/vola_{freq}.parquet', index=True, engine='pyarrow')