# Investments-Selected Quantitative Tools Project
Monna Dimitrova \
Áron Miklós \
Konrad Ochędzan

## Getting, cleaning and transforming data

In [6]:
import sys
!{sys.executable} -m pip install yfinance
!{sys.executable} -m pip install pandas_datareader

Collecting pandas_datareader
  Downloading pandas_datareader-0.10.0-py3-none-any.whl.metadata (2.9 kB)
Collecting lxml (from pandas_datareader)
  Downloading lxml-5.4.0-cp310-cp310-macosx_10_9_universal2.whl.metadata (3.5 kB)
Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Downloading lxml-5.4.0-cp310-cp310-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, pandas_datareader
Successfully installed lxml-5.4.0 pandas_datareader-0.10.0


In [7]:
import os
import pandas as pd
import yfinance as yf
from pandas_datareader import data as pdr

In [8]:
# SECURITY: a literal FRED API key used to be assigned on this line.
# Credentials must never be stored in a notebook: treat the previously
# committed key as leaked and revoke it, then export FRED_API_KEY in the
# environment that launches Jupyter. fetch_fred_data in this notebook passes
# no key to DataReader, so the variable is only checked here, not required.
if not os.environ.get('FRED_API_KEY'):
  print('FRED_API_KEY is not set - export it before starting Jupyter '
        'if a FRED client used in this notebook requires it.')

Function to fetch daily close prices for an equity/index/ETF using Yahoo Finance.

In [9]:
def fetch_prices(ticker: str, start_date: str, end_date: str) -> pd.Series:
  """Download the 'Close' prices of a single Yahoo Finance ticker.

  Args:
    ticker: Yahoo Finance symbol, e.g. '^GSPC' or 'CL=F'.
    start_date: Start of the requested range, forwarded to ``yf.download``.
    end_date: End of the requested range, forwarded to ``yf.download``.

  Returns:
    Whatever ``yf.download(...)['Close']`` yields, with its ``name``
    attribute set to the ticker stripped of any '^' characters.
    NOTE(review): callers in this notebook rename the result with
    ``rename(columns=...)``, which suggests a one-column DataFrame comes
    back in practice rather than a Series - confirm.
  """
  closes = yf.download(ticker, start=start_date, end=end_date)['Close']
  closes.name = ticker.replace('^', '')
  return closes

Function to fetch a data series from FRED, such as interest rates or macroeconomic indicators.

In [10]:
def fetch_fred_data(id: str, start_date: str, end_date: str) -> pd.Series:
  """Fetch one FRED series through pandas_datareader.

  Args:
    id: FRED series identifier, e.g. 'DGS10' or 'UNRATE'.
    start_date: Start of the requested range, forwarded to ``DataReader``.
    end_date: End of the requested range, forwarded to ``DataReader``.

  Returns:
    The reader's result if it already is a Series, otherwise the first
    column of the returned frame.
  """
  fetched = pdr.DataReader(id, 'fred', start_date, end_date)
  # A frame is reduced to its first column; a Series is passed through as is.
  return fetched if isinstance(fetched, pd.Series) else fetched.iloc[:, 0]

In [13]:
START_DATE = '2009-01-01'
END_DATE = '2025-06-15'

# Yahoo Finance tickers, keyed by the column name each series gets in `data`.
YAHOO_TICKERS = {
    # S&P 500 index and VIX implied volatility index
    'sp500': '^GSPC',
    'vix': '^VIX',
    # Foreign indices
    'nikkei': '^N225',
    'ftse': '^FTSE',
    'hsi': '^HSI',
    # Commodities
    'crude': 'CL=F',
    'gold': 'GC=F',
    'silver': 'SI=F',
    'copper': 'HG=F',
    'gas': 'NG=F',
    # Currencies
    'eurusd': 'EURUSD=X',
    'yenusd': 'JPY=X',
    'yuanusd': 'CNY=X',
    'cadusd': 'CAD=X',
    'gbpusd': 'GBP=X',
    'chfusd': 'CHF=X',
}

# Columns that are only forward-filled, and macro columns that are
# forward- and backward-filled, when aligning everything on one daily index.
MARKET_COLS = ['vix', 'tbill3m', 'term_spread', 'nikkei', 'ftse', 'hsi',
               'crude', 'gold', 'silver', 'gas', 'eurusd', 'yenusd',
               'yuanusd', 'cadusd', 'gbpusd', 'chfusd']
MACRO_COLS = ['unemployment', 'cpi']


def _fetch_close(ticker: str, column: str) -> pd.DataFrame:
  """Download closes for `ticker` as a one-column frame labelled `column`."""
  closes = fetch_prices(ticker, START_DATE, END_DATE)
  # fetch_prices is annotated as returning a Series, while this cell used to
  # rename its result with `columns=`, which only works on a DataFrame.
  # Accept both so a change in the downloader's return type cannot break us.
  if isinstance(closes, pd.DataFrame):
    closes = closes.iloc[:, 0]
  return closes.rename(column).to_frame()


prices = {name: _fetch_close(ticker, name) for name, ticker in YAHOO_TICKERS.items()}

# Keep the individual per-asset variables available to the rest of the notebook.
sp500, vix = prices['sp500'], prices['vix']
nikkei, ftse, hsi = prices['nikkei'], prices['ftse'], prices['hsi']
crude, gold, silver = prices['crude'], prices['gold'], prices['silver']
# NOTE(review): copper is downloaded but never added to `data` below -
# either include it in the dataset or drop the download.
copper, gas = prices['copper'], prices['gas']
eurusd, yenusd, yuanusd = prices['eurusd'], prices['yenusd'], prices['yuanusd']
cadusd, gbpusd, chfusd = prices['cadusd'], prices['gbpusd'], prices['chfusd']

# Treasury yields
tbill3m = fetch_fred_data('TB3MS', START_DATE, END_DATE)
yield2y = fetch_fred_data('DGS2', START_DATE, END_DATE)
yield10y = fetch_fred_data('DGS10', START_DATE, END_DATE)

# Macro factors: here we have monthly data, we will forward fill later
# Unemployment rate
unemployment = fetch_fred_data('UNRATE', START_DATE, END_DATE)
# CPI (Consumer Price Index)
cpi = fetch_fred_data('CPIAUCSL', START_DATE, END_DATE)

# Daily simple returns of the S&P 500 and the 10y-2y term spread
returns = sp500.pct_change()
term_spread = (yield10y - yield2y).rename('term_spread')

# Combine all data into a single dataframe on the union of all dates
data = pd.concat(
    [returns, vix, tbill3m, term_spread, nikkei, ftse, hsi, crude, gold,
     silver, gas, eurusd, yenusd, yuanusd, cadusd, gbpusd, chfusd,
     unemployment, cpi],
    axis=1,
).sort_index()

# Only the columns whose source name differs from the dataset name need mapping.
column_mapping = {
    'sp500': 'returns',
    'TB3MS': 'tbill3m',
    'UNRATE': 'unemployment',
    'CPIAUCSL': 'cpi',
}
data = data.rename(columns=column_mapping)

# Forward-fill the market series so that dates missing in one source (different
# holiday calendars, lower publication frequency) do not knock out whole rows.
# NOTE(review): TB3MS looks like a monthly FRED series, so for it the
# forward fill also performs the monthly-to-daily conversion - confirm.
data[MARKET_COLS] = data[MARKET_COLS].ffill()
# These macro factors are monthly; forward-fill to daily and backward-fill any
# leading gap before the first observation.
data[MACRO_COLS] = data[MACRO_COLS].ffill().bfill()
# Drop the rows that are still incomplete (e.g. dates without an S&P 500 return).
data = data.dropna()

# Guard against silent changes in the source column names.
assert list(data.columns) == ['returns', *MARKET_COLS, *MACRO_COLS]

  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  price_df = yf.download(ticker, start=start_date, end=end_dat

In [15]:
# Sanity check: list the column names of the combined dataset.
print(data.columns)

Index(['returns', 'vix', 'tbill3m', 'term_spread', 'nikkei', 'ftse', 'hsi',
       'crude', 'gold', 'silver', 'gas', 'eurusd', 'yenusd', 'yuanusd',
       'cadusd', 'gbpusd', 'chfusd', 'unemployment', 'cpi'],
      dtype='object')


In [24]:
# Persist the combined dataset, index included, before it is standardized.
data.to_csv('data_non_std.csv', index=True)

Now we standardize the data. This means removing the mean and scaling to unit variance. Standardization is useful, because many machine learning estimators do not behave well if the individual features do not look similar to standard normally distributed data. \
We also compute realized volatility and split the data into training, validation and testing sets.

In [17]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Compute realized volatility: We use a 21 day rolling window to calculate
# standard deviation of daily returns. Then we annualize it
VOL_WINDOW = 21
TRADING_DAYS_PER_YEAR = 252
realized_vol = data['returns'].rolling(window=VOL_WINDOW).std() * np.sqrt(TRADING_DAYS_PER_YEAR)
# remove NaNs (the first VOL_WINDOW - 1 entries, for which the window is
# incomplete). NOTE: `data` is rebound here, so re-running this cell without
# re-running the loading cell trims another VOL_WINDOW - 1 rows.
data = data.assign(realized_vol=realized_vol).dropna(subset=['realized_vol'])

#Split by date: we train on past data, validate on more recent data and test on
# the newest data
split1 = '2016-01-01'
split2 = '2019-01-01'

# Half-open intervals [.., split1), [split1, split2), [split2, ..]: label
# slicing with .loc includes both endpoints, so a row dated exactly on a split
# date would otherwise appear in two splits and leak across them.
in_train = data.index < split1
in_test = data.index >= split2
in_validate = ~in_train & ~in_test

train = data.loc[in_train].copy()
validate = data.loc[in_validate].copy()
test = data.loc[in_test].copy()
assert len(train) + len(validate) + len(test) == len(data)

# Standardize features, we fit only on training set to avoid looking into the
# future
feature_cols = ['vix', 'tbill3m', 'term_spread','nikkei', 'ftse', 'hsi', 'crude','gold','silver','gas','eurusd','yenusd','yuanusd','cadusd','gbpusd','chfusd', 'unemployment', 'cpi']
scaler = StandardScaler()
train[feature_cols] = scaler.fit_transform(train[feature_cols])
for later_split in (validate, test):
  # Re-use the training mean/variance; never refit on later data.
  later_split[feature_cols] = scaler.transform(later_split[feature_cols])

# Label rows with its split
train['split'] = 'train'
validate['split'] = 'validate'
test['split'] = 'test'

# Combine all data into a single dataframe
full_data = pd.concat([train, validate, test])

In [None]:
# Inspect the combined frame, including the 'split' label column.
print(full_data)

             returns       vix    tbill3m  term_spread  unemployment  \
2009-02-03  0.015834  3.059095   3.713323    -0.226714      0.310523   
2009-02-04 -0.007490  3.163283   3.713323    -0.143517      0.310523   
2009-02-05  0.016366  3.147457   3.713323    -0.143517      0.310523   
2009-02-06  0.026896  3.099979   3.713323     0.043678      0.310523   
2009-02-09  0.001485  3.135587   3.713323    -0.039520      0.310523   
...              ...       ...        ...          ...           ...   
2025-06-09  0.000920 -0.356694  70.483724    -3.242625     -2.300866   
2025-06-10  0.005483 -0.384390  70.483724    -3.284224     -2.300866   
2025-06-11 -0.002744 -0.343506  70.483724    -3.263425     -2.300866   
2025-06-12  0.003822 -0.243274  70.483724    -3.284224     -2.300866   
2025-06-13 -0.011296  0.126000  70.483724    -3.305024     -2.300866   

                  cpi  realized_vol  split  
2009-02-03  -1.855722      2.490922  train  
2009-02-04  -1.855722      2.492364  train  


## Neural network models

Our goal is to decompose return prediction into two economic components: stochastic volatility and interest rate. Stochastic volatility captures the predictability of future variance while the interest rate reflects the effects of the specific term structure. Then we will use a mixture of these two components, using weights and scaling by macroeconomic factors (unemployment, CPI).

In [None]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F

### Volatility agent

In [None]:
class VolAgent(nn.Module):
  """LSTM encoder that maps a sequence of features to one scalar per sample.

  Args:
    input_size: Number of features per time step.
    hidden_size: Width of the LSTM hidden state.
    num_layers: Number of stacked LSTM layers.
    dropout: Dropout probability applied between stacked LSTM layers.
  """

  def __init__(self, input_size: int, hidden_size: int = 64, num_layers: int = 2,
               dropout: float = 0.1) -> None:
    super().__init__()
    # nn.LSTM applies dropout only between stacked layers. With a single layer
    # it has no effect and PyTorch emits a UserWarning, so pass 0 in that case.
    inter_layer_dropout = dropout if num_layers > 1 else 0.0
    # We use LSTM to encode the volatility signals
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,
                        dropout=inter_layer_dropout)
    # Linear layer to map the last hidden state to scalar
    self.fc = nn.Linear(hidden_size, 1)

  def forward(self, input_features: Tensor) -> Tensor:
    """Encode `input_features` and project the final hidden state.

    Args:
      input_features: Batch-first sequence tensor, i.e.
        (batch, seq_len, input_size) since the LSTM uses batch_first=True.

    Returns:
      Tensor with one value per sample, taken from the top layer's final
      hidden state passed through the linear head.
    """
    _, (hidden_states, _) = self.lstm(input_features)
    # hidden_states stacks the final state of every layer; [-1] is the top one.
    return self.fc(hidden_states[-1])

### Interest rate agent

In [None]:
class InterestRateAgent(nn.Module):
  """Feed-forward network producing one scalar per sample.

  The network is a stack of Linear -> ReLU -> Dropout groups, one per entry
  of `hidden_units`, followed by a final Linear layer with a single output.

  Args:
    input_size: Number of input features.
    hidden_units: Width of each hidden layer, in order.
    dropout: Dropout probability used after every hidden activation.
  """

  def __init__(self, input_size: int, hidden_units: list[int] = [64, 32],
               dropout: float = 0.1) -> None:
    super().__init__()
    # Consecutive pairs of `widths` are the (in, out) sizes of the hidden layers.
    widths = [input_size, *hidden_units]
    modules = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      modules += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
    # Output head: project the last hidden width down to a single value.
    modules.append(nn.Linear(widths[-1], 1))
    self.net = nn.Sequential(*modules)

  def forward(self, input_features: Tensor) -> Tensor:
    """Run `input_features` through the stack and return its output."""
    return self.net(input_features)

### Weighting network

This network learns how to allocate weights between the volatility and interest rate agents.

In [None]:
class WeightingNetwork(nn.Module):
  """Two-layer network that outputs a pair of softmax weights per sample.

  Args:
    input_size: Number of input features.
    hidden_size: Width of the single hidden layer.
  """

  def __init__(self, input_size: int, hidden_size: int = 32) -> None:
    super().__init__()
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, 2)

  def forward(self, input_features: Tensor) -> Tensor:
    """Return two weights per row; the softmax over dim 1 makes them sum to 1."""
    logits = self.fc2(F.relu(self.fc1(input_features)))
    return F.softmax(logits, dim=1)

### Macroeconomic scaler

This is a network that uses the macroeconomic factors to scale the combined signal coming from the two agents. It calculates a multiplier that determines whether to have a larger or smaller leverage, meaning an amplified or dampened signal.

In [None]:
class MacroScaler(nn.Module):
  """Linear map of the macro features to a bounded leverage multiplier.

  Args:
    input_size: Number of macroeconomic input features.
  """

  def __init__(self, input_size: int) -> None:
    super().__init__()
    # Bias set to true: output = Wx + b, where b is the learned bias
    self.linear = nn.Linear(input_size, 1, bias=True)

  # `forward` must be defined at class level. It used to be nested inside
  # __init__, which made it a dead local function and left the module without
  # a forward pass (calling it raised NotImplementedError).
  def forward(self, input_features: Tensor) -> Tensor:
    """Return tanh(1 + Wx + b), one multiplier per sample.

    NOTE(review): tanh bounds the multiplier to (-1, 1), so the combined
    signal can be dampened or sign-flipped but never amplified - confirm
    this matches the intended "larger or smaller leverage" behaviour.
    """
    raw_leverage = 1 + self.linear(input_features)
    constrained_leverage = torch.tanh(raw_leverage)
    return constrained_leverage

### Mixture agent

The mixture agent combines the volatility and interest rate agents using the weighting and scaling networks defined above.

In [None]:
class MixtureAgent(nn.Module):
  """Combine the volatility and interest-rate agents into one prediction.

  The two agent outputs are blended with the weights from WeightingNetwork
  and the blend is multiplied by the MacroScaler output.

  Args:
    vol_input_size: Feature count for the volatility agent.
    interest_rate_input_size: Feature count for the interest-rate agent.
    weight_input_size: Feature count for the weighting network.
    macro_input_size: Feature count for the macro scaler.
    vol_params: Extra keyword arguments forwarded to VolAgent.
    interest_rate_params: Extra keyword arguments forwarded to
      InterestRateAgent.
  """

  # The dict defaults are only unpacked, never mutated, so sharing them
  # between instances is harmless.
  def __init__(self, vol_input_size: int, interest_rate_input_size: int,
               weight_input_size: int, macro_input_size: int,
               vol_params: dict[str, object] = {},
               interest_rate_params: dict[str, object] = {}) -> None:
    super().__init__()
    self.vol_agent = VolAgent(vol_input_size, **vol_params)
    self.interest_rate_agent = InterestRateAgent(interest_rate_input_size, **interest_rate_params)
    self.weights = WeightingNetwork(weight_input_size)
    self.macro = MacroScaler(macro_input_size)

  def forward(self, vol_input: Tensor, interest_rate_input: Tensor,
              weights_input: Tensor, macro_input: Tensor) -> Tensor:
    """Return the weighted, macro-scaled prediction, one value per sample.

    Each argument is passed unchanged to the matching sub-module. The result
    has the trailing singleton dimension removed.
    """
    predicted_vol_return = self.vol_agent(vol_input)
    predicted_interest_rate_return = self.interest_rate_agent(interest_rate_input)
    weights = self.weights(weights_input)
    macro = self.macro(macro_input)
    # Slicing with :1 / 1: keeps a column dimension so the weights broadcast
    # against the agents' column outputs.
    combined_returns = weights[:, :1] * predicted_vol_return + weights[:, 1:] * predicted_interest_rate_return
    scaled_returns = combined_returns * macro
    # Drop only the trailing singleton dimension: an argument-less squeeze()
    # would also drop the batch dimension for a batch of one and return a
    # 0-d tensor.
    return scaled_returns.squeeze(-1)