# Second stage summary

In the first stage, we established that selecting a simple portfolio built with classical methods is more reliable than attempting to select or compose a portfolio using machine learning, in particular neural networks.

During the second stage, we wanted to establish whether adding more relevant features to the training data and reducing the problem to a simple forecast (of future covariance or future returns) is a viable strategy. If this does not improve the base model, it will confirm our hypothesis that portfolio allocation should be performed based on fundamentals of portfolio theory and economics, rather than based on advanced analytics and forecasting.

This summary consists of following parts:
1. Base model performance testing (classical portfolio theory)
2. Training advanced forecasting model
  - dataset assembly
  - model architecture definition
  - training & validation of created model
3. Comparison of test results between base and advanced model

## Base model performance

In [1]:
import pandas as pd
import numpy as np

from numba import jit

from typing import Tuple

from utils.data_loader import *

from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
from pypfopt.efficient_frontier import EfficientFrontier

In [46]:
class OneYearTest(object):
    """
    Scikit-learn style iterator returning training- and test-set indices,
    where test set is one-year-long and training set consists of n_train_years
    preceding the test set year.
    """
    def __init__(self, df: pd.DataFrame, n_train_years: int=10):
        self.index_years = df.index.year
        self.unique_years = np.unique(self.index_years)
        self.n_train_years = n_train_years
        
    @property
    def train_years(self):
        return set(self.unique_years[:self.idx])
    
    @property
    def test_year(self):
        return self.unique_years[self.idx]

    def __iter__(self):
        self.idx = self.n_train_years
        return self
    
    def __next__(self):
        if self.idx < len(self.unique_years):
            train_ids = self.index_years.isin(self.train_years)
            test_ids = self.index_years == self.test_year
            self.idx += 1
            return train_ids, test_ids
        else:
            raise StopIteration

In [47]:
@jit(nopython=True)
def portfolio_performance(allocation: np.array, fund_values: np.array) -> Tuple[float, float]:
    """
    Calculates total returns and daily returns volatility during entire period of provided fund values.
    """
    allocation_value = np.sum(allocation * fund_values, axis=1) / np.sum(allocation)
    period_returns = (allocation_value[-1] - allocation_value[0]) / allocation_value[0]
    daily_returns = (allocation_value[1:] - allocation_value[:-1]) / allocation_value[:-1]
    period_volatility = np.std(daily_returns)
    return period_returns, period_volatility

In [48]:
@jit(nopython=True)
def portfolio_srri(allocation: np.array, fund_values: np.array) -> int:
    """
    Calculates SRRI based on porfolio volatility 
    during last 5 years of provided fund_values,
    as described in https://bit.ly/2RDVib9
    """
    m = 260  # days in a year, as in the SRRI paper
    T = 5*260  # days in 5 years, as in the SRRI paper
    allocation_value = np.sum(allocation * fund_values[-T:], axis=1) / np.sum(allocation)
    daily_returns = (allocation_value[1:] - allocation_value[:-1]) / allocation_value[:-1]
    scaled_volatility = np.sqrt(m * np.sum((daily_returns - np.mean(daily_returns))**2) / (T-1))
    if scaled_volatility >= 0.25:
        return 7
    elif scaled_volatility >= 0.15:
        return 6
    elif scaled_volatility >= 0.1:
        return 5
    elif scaled_volatility >= 0.05:
        return 4
    elif scaled_volatility >= 0.02:
        return 3
    elif scaled_volatility >= 0.005:
        return 2
    else:
        return 1

In [49]:
def base_model(funds_df: pd.DataFrame) -> np.array:
    mu = mean_historical_return(funds_df)
    S = CovarianceShrinkage(funds_df).ledoit_wolf()
    ef = EfficientFrontier(mu, S)
    ef.efficient_return(0.1)
    return ef.weights

### Loading data

In [51]:
funds_df = load_funds()
funds_df.head()

Unnamed: 0_level_0,AP,ARR,ARW,G,OP,ORR,ORW
Daty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-03,415.9,549.11,354.45,401.26,275.08,520.13,230.72
2000-01-04,404.41,533.89,357.14,401.42,275.08,520.02,229.63
2000-01-05,400.04,527.38,351.19,401.59,275.08,519.22,229.22
2000-01-06,410.15,522.02,347.96,401.75,275.07,519.62,228.82
2000-01-07,429.16,533.16,351.87,401.93,275.07,520.8,230.09


### Evaluating model performance

In [50]:
result_dfs = []
for train_ids, test_ids in OneYearTest(funds_df):
    train_, test_ = funds_df[train_ids], funds_df[test_ids]
    allocation = base_model(train_)
    allocation_data = {fund: allocation[idx] for idx, fund in enumerate(funds_df.columns)}
    srri = portfolio_srri(allocation, train_.values)
    returns, volatility = portfolio_performance(allocation, test_.values)
    performance_data = {'srri': srri, 'returns': returns, 'volatility': volatility}
    result_dfs.append(
        pd.DataFrame({**allocation_data, **performance_data}, index=[test_.index.year[0]])
    )
base_results_df = pd.concat(result_dfs, axis='index')
base_results_df

Unnamed: 0,AP,ARR,ARW,G,OP,ORR,ORW,srri,returns,volatility
2010,3.147186e-17,0.0,0.002972771,0.0,0.04265175,3.623375e-19,0.954375,4,0.111142,0.002708
2011,5.245311e-18,0.0,0.01766374,0.0,0.09503978,0.0,0.887296,4,0.057131,0.002736
2012,3.601152e-14,4.01575e-15,1.662624e-17,1.446091e-17,8.635049e-18,4.590105e-14,1.0,4,0.168538,0.001943
2013,0.0,7.782397e-18,0.0,0.0,0.1680585,2.314928e-18,0.831942,4,-0.047554,0.003428
2014,4.682031e-13,1.7846960000000002e-17,1.05909e-13,8.452873e-13,1.035153e-16,1.168058e-13,1.0,4,0.069262,0.002734
2015,4.880799e-13,3.841907e-13,2.010943e-18,7.502117e-13,6.925339e-13,2.721481e-14,1.0,3,0.006862,0.002654
2016,8.845414e-13,8.25874e-21,3.2582140000000002e-18,3.603049e-18,1.605328e-18,8.021075e-13,1.0,3,0.096736,0.003441
2017,8.767353e-14,1.169147e-12,7.322365e-17,1.233785e-16,5.395509e-17,3.3805330000000004e-17,1.0,3,0.097111,0.001755
2018,0.0,1.89071e-13,0.0,5.715151e-13,0.0,0.0,1.0,3,-0.047065,0.002686


## Advanced model performance

We will attempt to train a deep neural network, that will attempt to predict future returns as well as future fund value covariance.

In [93]:
from dataclasses import dataclass, asdict
import multiprocessing as mp

from tqdm import tqdm

import ta

### External data

### Feature generation

For our fund values, we calculate similar features to the ones mentioned by: *Bao, W., Yue, J., Rao, Y. (2017). A deep learning framework for financial time series
using stacked autoencoders and long-short term memory.*

In [100]:
fund_column_features = {
    'MACD': lambda col: ta.trend.macd(col, n_fast=80, n_slow=160),
    'BBL': lambda col: ta.volatility.bollinger_lband(col, n=120),
    'BBH': lambda col: ta.volatility.bollinger_hband(col, n=120),
    'EMA': lambda col: ta.trend.ema_indicator(col, n=240),
}

ohlc_column_features = {
    # TODO
}

In [101]:
features_df = pd.DataFrame(index=funds_df.index)
for key in tqdm(fund_column_features):
    for col in funds_df.columns:
        features_df[f'{col}_{key}'] = fund_column_features[key](funds_df[col])

100%|██████████| 4/4 [00:00<00:00, 66.48it/s]


### Target generation

In [81]:
@dataclass
class Target(object):
    fund_returns: np.array
    fund_covariance: np.array
    ideal_allocation: np.array
    ideal_returns: float

In [82]:
def calculate_target(funds_df: pd.DataFrame) -> Target:
    mu = mean_historical_return(funds_df)
    S = CovarianceShrinkage(funds_df).ledoit_wolf()
    ef = EfficientFrontier(mu, S)
    ef.efficient_risk(0.1)
    ideal_allocation = ef.weights
    ideal_returns = ef.portfolio_performance()[0]
    return Target(mu, S, ideal_allocation, ideal_returns)

We calculate all target data in parallel to save time:

In [83]:
%%time
YEAR_DAYS = 260

dfs = [
    funds_df[idx+1 : idx+1+YEAR_DAYS]
    for idx in range(funds_df.iloc[:-YEAR_DAYS].shape[0])
]
with mp.Pool(processes=12) as pool:
    targets = pool.map(calculate_target, dfs)

print(len(targets))

4541
CPU times: user 5.11 s, sys: 428 ms, total: 5.54 s
Wall time: 26.1 s


Then, we extract information necessary for our experiment and back it up:

In [86]:
returns_target = np.zeros_like(funds_df[:-YEAR_DAYS])
covariance_target = np.zeros(
    (len(funds_df[:-YEAR_DAYS]), funds_df.shape[1]**2)
)
sample_weights = np.zeros(len(funds_df[:-YEAR_DAYS]))

for idx, target in enumerate(targets):
    returns_target[idx] = target.fund_returns.values.ravel()
    covariance_target[idx] = target.fund_covariance.values.ravel()
    sample_weights[idx] = target.ideal_returns  # years with higher returns are more important

np.save('data/forecasting/returns_target.npy', returns_target)
np.save('data/forecasting/covariance_target.npy', covariance_target)
np.save('data/forecasting/sample_weights.npy', sample_weights)

We can also add past targets to our features:

In [105]:
returns_features = np.zeros_like(funds_df)
returns_features[:YEAR_DAYS] = np.nan
returns_features[YEAR_DAYS:] = returns_target

covariance_features = np.zeros(
    (len(funds_df), funds_df.shape[1]**2)
)
covariance_features[:YEAR_DAYS] = np.nan
covariance_features[YEAR_DAYS:] = covariance_target

weights_features = np.zeros(len(funds_df))
weights_features[:YEAR_DAYS] = np.nan
weights_features[YEAR_DAYS:] = sample_weights

### Model definition

In [114]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(42)
cuda_device = 0
assert(torch.cuda.is_available())

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim=1, num_layers=2):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            self.input_dim, 
            self.hidden_dim, 
            self.num_layers
        )
        self.linear = nn.Linear(self.hidden_dim, output_dim)

    def init_hidden(self):
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input):
        # Forward pass through LSTM layer
        # shape of lstm_out: [input_size, batch_size, hidden_dim]
        # shape of self.hidden: (a, b), where a and b both 
        # have shape (num_layers, batch_size, hidden_dim).
        lstm_out, self.hidden = self.lstm(input.view(len(input), self.batch_size, -1))
        
        # Only take the output from the final timetep
        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1))
        return y_pred.view(-1)

### Training & evaluation

In [116]:
from sklearn.preprocessing import StandardScaler

In [144]:
all_features = np.hstack([features_df.values, returns_features, covariance_features, weights_features.reshape(-1, 1)])
features_valid = np.all(~np.isnan(all_features), axis=1)
feature_scaler = StandardScaler()
all_features[features_valid] = feature_scaler.fit_transform(all_features[features_valid])

all_targets = np.hstack([returns_target, covariance_target])
targets_valid = np.all(~np.isnan(all_targets), axis=1)
target_scaler = StandardScaler()
all_targets[targets_valid] = target_scaler.fit_transform(all_targets[targets_valid])

# unlike features, targets are only generated when they are valid
assert(sum(targets_valid) == len(targets_valid))
assert(len(targets_valid) < len(features_valid))
# so last year of features cannot have any target associated with it (no data to compute the target)
all_features = all_features[:len(targets_valid)]
features_valid = features_valid[:len(targets_valid)]

# this is created for one-year-validation to have a correct index for data splitting
meta_df = pd.DataFrame({'valid': features_valid}, index=funds_df.iloc[:len(features_valid)].index)