# Stacking

### Idea
Because training base models is hard, the possible optimal solution may be to take a lot of classical models
for portfolio creation and train a 2nd-layer model on their outputs, to select an optimal output.

The challenges we need to solve in this notebook are the following:
- selecting a target variable and metric
- generating as many simple models as possible
- adding features related to the long-term trends for 2nd layer model for it to perform a "more informed" choice
- choosing whether base models should be allowed to see entire past that we have or just the most recent fraction of it

### Loading data

In [1]:
import pandas as pd

prefix = '../input/'
FUNDS_CSV = prefix+'Quant_Invest_Fundusze.csv'
STOCKS_CSV = prefix+'all_indices_close.csv'
COMMODITIES_CSV = prefix+'all_commodities_close.csv'
RATES_CSV = prefix+'policy_rates.csv'
FX_CSV = prefix+'exchange_rates.csv'


def _load_indexed(path, index_col='Daty', sep=',') -> pd.DataFrame:
    df = pd.read_csv(path, index_col=index_col, sep=sep)
    df.index = pd.to_datetime(df.index)
    return df.sort_index()


def load_funds() -> pd.DataFrame:
    return _load_indexed(FUNDS_CSV, sep=';')


def load_stocks() -> pd.DataFrame:
    return _load_indexed(STOCKS_CSV, index_col='Data')


def load_commodities() -> pd.DataFrame:
    return _load_indexed(COMMODITIES_CSV, index_col='Data')


def load_rates() -> pd.DataFrame:
    policy_rates = pd.read_csv(RATES_CSV, index_col = 0)
    policy_rates = policy_rates.pivot(index='date', columns='reference_area', values='obs_value')
    policy_rates.index = pd.to_datetime(policy_rates.index)
    return policy_rates.iloc[:(-42)]  # keep only years 2000-2018


def load_fx() -> pd.DataFrame:
    exchange_rates = pd.read_csv(FX_CSV, index_col=0)
    exchange_rates.index = pd.to_datetime(exchange_rates['Date'])
    shortnames={}
    for col in exchange_rates.columns[1:]:
        start = col.index('(')
        end = col.index(')')
        shortnames[col] = col[(start+1):end]
    return exchange_rates.rename(columns = shortnames).drop(columns=['Date'])


def load_all() -> pd.DataFrame:
    return pd.concat([
        load_funds(),
        load_stocks(),
        load_commodities(),
        load_rates(),
        load_fx()
    ], axis='columns')

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = load_all()
df = df.loc[~df['AP'].isna()]
df.head()

Unnamed: 0,AP,ARR,ARW,G,OP,ORR,ORW,a5.c,wig2,^aex,^aor,^ath,^bel,^bet,^bux,^bvp,^cac,^cry,^dax,^djc,^dji,^djt,^dju,^fmi,^ftm,^hex,^hsi,^ibe,^ice,^ipc,^ips,^jci,^klc,^kos,^mda,^moe,^mrv,^ndq,^ndx,^nkx,...,HUF,ISK,INR,IDR,IRR,ILS,JPY,KZT,KRW,KWD,LYD,MYR,MTL,MUR,MXN,NPR,NZD,NOK,OMR,PKR,PEN,PHP,PLN,QAR,RUB,SAR,SGD,SIT,ZAR,LKR,SEK,CHF,THB,TTD,TND,AED,GBP,USD,UYU,VEB
2000-01-03,415.9,549.11,354.45,401.26,275.08,520.13,230.72,1204.88,1852.9,675.44,,5794.85,3311.05,,,16930.0,5917.37,156.83,6750.76,3170.2,11357.5,2943.67,276.71,41226.0,,15330.5,17369.6,11610.0,1511.86,7077.71,1139.63,,833.89,,4127.35,,551.83,4131.15,3790.55,,...,,,0.016713,,,,,,,2.39108,1.577,0.191236,,,,,,0.09095,1.88998,,,,,0.199642,,0.194044,0.437637,,,0.010081,0.085771,0.456726,,0.115867,,0.197875,,0.726696,,
2000-01-04,404.41,533.89,357.14,401.42,275.08,520.02,229.63,1194.41,1796.6,642.25,3124.1,5588.65,3188.09,,8709.17,15851.0,5672.02,155.49,6586.95,3090.91,10997.9,2862.17,278.51,40314.0,6432.1,14184.9,17072.8,11206.6,1511.9,6675.27,1123.98,700.22,832.8,1059.04,4048.81,,522.97,3901.69,3546.2,19002.86,...,0.002908,0.010105,0.016635,,0.000415,,0.00706,,0.000638,2.38295,1.577,0.190423,1.77834,,,0.010529,,0.091494,1.88195,0.013972,,,,0.198793,,0.19322,0.436566,,0.118208,0.010044,,0.465253,0.019568,0.115445,,0.197034,1.18701,0.723608,,0.001114
2000-01-05,400.04,527.38,351.19,401.59,275.08,519.22,229.22,1192.89,1777.0,632.31,3050.9,5369.38,3057.53,,8464.34,16245.0,5479.7,154.91,6502.07,3133.61,11122.7,2867.36,289.11,39452.0,6364.0,13012.0,15846.7,10863.1,1495.92,6764.31,1126.05,678.09,815.8,986.31,3994.87,173.0,532.68,3877.54,3507.31,18542.55,...,0.002947,0.010083,0.016627,,0.000413,,0.007019,,0.000643,2.38427,1.577,0.190428,1.78598,,,0.010529,0.375454,0.091428,1.88199,0.013972,,,,0.198799,,0.193225,0.436973,,0.119047,0.010036,0.08674,0.466615,0.019422,0.11551,,0.197039,1.18624,0.723627,,0.001114
2000-01-06,410.15,522.02,347.96,401.75,275.07,519.62,228.82,,1832.1,624.21,3030.1,,3061.09,,8483.29,16107.0,5450.11,154.99,6474.92,3177.96,11253.3,2937.08,292.64,38835.0,6406.7,,15153.2,,1501.61,6751.65,1122.57,688.52,818.43,960.79,4016.15,186.26,528.47,3727.13,3340.81,18168.27,...,0.002944,0.010097,0.016646,,0.000413,,0.006942,,0.000641,2.38695,1.577,0.190642,1.79284,,,0.010541,0.376491,0.091741,1.88411,0.013988,,,,0.199022,,0.193442,0.437463,,0.118868,0.010037,,0.46865,0.019427,0.115662,,0.19726,1.19474,0.724439,,0.001115
2000-01-07,429.16,533.16,351.87,401.93,275.07,520.8,230.09,1223.61,1933.2,644.86,3044.5,5410.82,3138.55,,8694.04,16309.0,5539.61,154.75,6780.96,3242.06,11522.6,2964.72,297.78,40194.0,6484.4,13539.9,15405.6,11102.4,1495.2,7047.09,1129.52,,,948.65,4095.51,,522.12,3882.62,3529.6,18193.41,...,0.002942,0.010079,0.016711,,0.000413,,0.006905,,0.000638,,1.577,0.191346,1.79132,,,0.01058,0.373845,0.091252,1.89106,0.014039,,,,0.199756,,0.194156,0.437098,,0.120015,,,0.465233,0.01941,0.115876,,0.197989,1.19596,0.727113,,0.001118


In [4]:
df.shape

(4801, 200)

In [5]:
fund_colnames = ['AP', 'ARR', 'ARW', 'G', 'OP', 'ORR', 'ORW']

In [6]:
funds_df = df[fund_colnames]

In [7]:
funds_df.shape

(4801, 7)

### Feature and target selection

Firstly, we will try to train a classifier that selects the best performing model.

In [8]:
from typing import List, Callable, Tuple
from dataclasses import dataclass, asdict

from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
from pypfopt.efficient_frontier import EfficientFrontier

from numba import jit

from tqdm import tqdm

In [9]:
year_days = int(funds_df.groupby(funds_df.index.year).count().mean().mean())
month_days = int(year_days/12)
week_days = int(month_days/4)
print(f"Using default number of trading days in a year={year_days}, month={month_days}, week={week_days}")

max_volatility = 0.1
print(f"Using volatility threshold={max_volatility}")

Using default number of trading days in a year=252, month=21, week=5
Using volatility threshold=0.1


In [10]:
@dataclass
class Portfolio(object):
    AP: float
    ARR: float
    ARW: float
    G: float
    OP: float
    ORR: float
    ORW: float

    def as_weights(self):
        return np.array(list(asdict(self).values()))


def risk_free_rate(table, interval_len_in_days: int=1) -> int:
    """ For the purpose of this portfolio selection, we use G (cash fund returns) as a risk-free rate. """
    intervals_in_a_year = year_days / interval_len_in_days
    return np.mean((df['G'].shift(1) - df['G'].iloc[:-1]) / df['G'].iloc[:-1]) * intervals_in_a_year


def _ef_builder(table):
    """ Builder for all kinds of efficient frontier models. """
    table = table.copy().groupby(by=[table.index.year, table.index.month]).tail(n=1)
    mu = mean_historical_return(table)
    S = CovarianceShrinkage(table).ledoit_wolf()
    return EfficientFrontier(mu, S)

def _ef_meta_builder(ef):
    """ 
    Common metadata for all kinds of efficient frontier models:
    expected annualized returns, expected volatility, sharpe ratio
    """
    return np.array(ef.portfolio_performance())

def ef_max_sharpe(table, interval: int=1):
    ef = _ef_builder(table)
    weights = ef.max_sharpe(risk_free_rate(interval))
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_min_volatility(table, interval: int=1):
    ef = _ef_builder(table)
    weights = ef.min_volatility()
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_efficient_risk(table, risk_target: float, interval: int=1):
    ef = _ef_builder(table)
    weights = ef.efficient_risk(risk_target, risk_free_rate=risk_free_rate(interval))
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_efficient_return(table, target_return, interval: int=1):
    ef = _ef_builder(table)
    weights = ef.efficient_return(target_return)
    return Portfolio(**weights), _ef_meta_builder(ef)

In [11]:
def calculate_features(funds_df: pd.DataFrame) -> Tuple[np.array, List[Portfolio]]:
    """
    Returns features for the model along with the portfolios that were used for calculating these features.
    """
    monthly_table = funds_df.groupby([funds_df.index.year, funds_df.index.month]).tail(1)
    weekly_table = funds_df.groupby([funds_df.index.year, funds_df.index.week]).tail(1)
    portfolio_data = [
        ef_max_sharpe(funds_df),
        ef_max_sharpe(monthly_table, month_days),
        ef_max_sharpe(weekly_table, week_days),
        ef_min_volatility(funds_df),
        ef_min_volatility(monthly_table, month_days),
        ef_min_volatility(weekly_table, week_days),
        ef_efficient_risk(funds_df, 0.10),
        ef_efficient_risk(funds_df, 0.05),
        ef_efficient_risk(monthly_table, 0.10, month_days),
        ef_efficient_risk(monthly_table, 0.05, month_days),
        ef_efficient_risk(weekly_table, 0.10, week_days),
        ef_efficient_risk(weekly_table, 0.05, week_days),
        ef_efficient_return(funds_df, 0.04),
        ef_efficient_return(funds_df, 0.08),
        ef_efficient_return(monthly_table, 0.04, month_days),
        ef_efficient_return(monthly_table, 0.08, month_days),
        ef_efficient_return(weekly_table, 0.04, week_days),
        ef_efficient_return(weekly_table, 0.08, week_days),
    ]
    portfolios = [pd[0] for pd in portfolio_data]
    portfolio_weights = np.hstack([p.as_weights() for p in portfolios])
    portfolio_features = np.hstack([pd[1] for pd in portfolio_data])
    return np.hstack([portfolio_weights, portfolio_features]), portfolios
    

@jit(nopython=True)
def portfolio_performance(portfolio_allocation: np.array, fund_df_values: np.array) -> Tuple[float, float]:
    fund_returns = (fund_df_values[-1] - fund_df_values[0]) / fund_df_values[0]
    portfolio_returns = np.sum(portfolio_allocation*fund_returns) / np.sum(portfolio_allocation)
    # eliminating portfolios based on too high volatility
    # volatility is calculated using formula from https://bit.ly/2RDVib9
    portfolio_value = np.sum(portfolio_allocation*fund_df_values, axis=1)
    portfolio_weekly = portfolio_value[::week_days]
    portfolio_weekly_returns = (portfolio_weekly[:-1] - portfolio_weekly[1:]) / portfolio_weekly[1:]
    portfolio_volatility = np.sum((portfolio_weekly_returns-np.mean(portfolio_weekly_returns))**2) / week_days
    if portfolio_volatility > max_volatility:
        print("Exceeded max volatility:", portfolio_volatility)
        return -np.inf
    else:
        return portfolio_returns

In [12]:
%%time
features, pfs = calculate_features(funds_df)
print(features.shape, len(pfs), pfs[0])

(180,) 18 Portfolio(AP=0.0, ARR=0.024181468182686638, ARW=8.672517891848946e-18, G=0.6310321560035437, OP=0.18473105504718992, ORR=0.13726016386815026, ORW=0.022795156898429723)
CPU times: user 968 ms, sys: 164 ms, total: 1.13 s
Wall time: 1.13 s


In [13]:
portfolio_performance(np.array([0,0.5,0.5,0,0,0,0]), funds_df.values)

Exceeded max volatility: 0.1115490849185532


-inf

In [14]:
funds_df.shape

(4801, 7)

In [15]:
def compute_X_y(
        funds_df: pd.DataFrame,
        n_features: int,
        n_portfolios: int=18,
        min_test_idx: int=3*year_days, 
        test_len: int=year_days,
        max_train_len: int=None,
    ):
    max_test_idx = len(funds_df)
    n_samples = max_test_idx - min_test_idx - test_len
    X = np.zeros((n_samples, n_features))
    all_performances = np.zeros((n_samples, n_portfolios))
    y = np.zeros(n_samples).astype(np.int)
    sample_weights = np.zeros(n_samples)
    for idx in tqdm(range(n_samples)):
        test_starting_idx = min_test_idx + idx
        if max_train_len is None:
            present_data = funds_df.iloc[:test_starting_idx]
        else:
            present_data = funds_df.iloc[max(0, test_starting_idx-max_train_len):test_starting_idx]
        test_data = funds_df.iloc[test_starting_idx:test_starting_idx+test_len]
        X[idx], portfolios = calculate_features(present_data)
        portfolio_performances = np.array([portfolio_performance(p.as_weights(), test_data.values) for p in portfolios])
        all_performances[idx] = portfolio_performances
        y[idx] = np.argmax(portfolio_performances)
        sample_weights[idx] = 1 + np.max(portfolio_performances) - np.mean(portfolio_performances[portfolio_performances > -np.inf])
    return X, y, all_performances, sample_weights

## 1st run: entire timespan available

In [16]:
min_test_idx = 3*year_days

n_portfolios = 18
n_funds = 7
n_portfolio_features = 3

In [17]:
X, y, all_performances, sample_weights = compute_X_y(funds_df, n_features=180, min_test_idx=min_test_idx)
X.shape, y.shape, X.dtype, y.dtype

100%|██████████| 3793/3793 [1:02:41<00:00,  1.08it/s]


((3793, 180), (3793,), dtype('float64'), dtype('int64'))

In [18]:
assert(n_portfolios*(n_funds+n_portfolio_features) == X.shape[1])
# base-models were fitted on a rolling-window dataset, so we don't have the targets for its beginning:
funds_df.shape, X.shape

((4801, 7), (3793, 180))

In [19]:
# some utilities for converting numpy arrays back to indexed dataframes for simpler analysis:
import re

def generator_to_name(gen: str) -> str:
    name = re.sub('funds_df', 'daily', gen)
    name = re.sub('_table', '', name)
    name = re.sub('\w+_days', '', name)
    name = re.sub('\)+', '', name)
    name = re.sub('[\(, ]+', '_', name)
    if name[-1] == '_':
        return name[:-1]
    else:
        return name

portfolio_names = list(map(generator_to_name, [
    'ef_max_sharpe(funds_df)',
    'ef_max_sharpe(monthly_table, month_days)',
    'ef_max_sharpe(weekly_table, week_days)',
    'ef_min_volatility(funds_df)',
    'ef_min_volatility(monthly_table, month_days)',
    'ef_min_volatility(weekly_table, week_days)',
    'ef_efficient_risk(funds_df, 0.10)',
    'ef_efficient_risk(funds_df, 0.05)',
    'ef_efficient_risk(monthly_table, 0.10, month_days)',
    'ef_efficient_risk(monthly_table, 0.05, month_days)',
    'ef_efficient_risk(weekly_table, 0.10, week_days)',
    'ef_efficient_risk(weekly_table, 0.05, week_days)',
    'ef_efficient_return(funds_df, 0.04)',
    'ef_efficient_return(funds_df, 0.08)',
    'ef_efficient_return(monthly_table, 0.04, month_days)',
    'ef_efficient_return(monthly_table, 0.08, month_days)',
    'ef_efficient_return(weekly_table, 0.04, week_days)',
    'ef_efficient_return(weekly_table, 0.08, week_days)',
]))

allocation_colnames = [
    f'{basemodel}_{fundname}' 
    for basemodel in portfolio_names
    for fundname in funds_df.columns
]
print(len(allocation_colnames))
performance_colnames = [
    f'{basemodel}_{metric}' 
    for basemodel in portfolio_names
    for metric in ['e_returns', 'e_volatility', 'sharpe']
]
print(len(performance_colnames))
assert(len(performance_colnames)+len(allocation_colnames) == X.shape[1])

126
54


In [20]:
portfolio_allocations = pd.DataFrame(
    X[:,:n_portfolios*n_funds], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=allocation_colnames
)
portfolio_features = pd.DataFrame(
    X[:,n_portfolios*n_funds:], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=performance_colnames
)
portfolio_test_performances = pd.DataFrame(
    all_performances,
    index=funds_df.index[min_test_idx:min_test_idx+all_performances.shape[0]],
    columns=portfolio_names
)
optimal_portfolios = pd.Series(
    y, 
    index=funds_df.index[min_test_idx:min_test_idx+y.shape[0]]
).map(lambda i: portfolio_names[i])

In [21]:
filename = "18-funds-data-" + str(pd.datetime.now().date())
print(filename)
portfolio_allocations.to_csv(f"{filename}-allocations.csv", index=True)
portfolio_features.to_csv(f"{filename}-features.csv", index=True)
portfolio_test_performances.to_csv(f"{filename}-test-performances.csv", index=True)
optimal_portfolios.to_csv(f"{filename}-optimal-portfolios.csv", index=True)

18-funds-data-2019-03-16


## 2nd run: model input contains no more than the last 5 years of data

In [22]:
min_test_idx = 3*year_days
max_train_len = 5*year_days

In [23]:
X, y, all_performances, sample_weights = compute_X_y(
    funds_df, 
    n_features=180, 
    min_test_idx=min_test_idx,
    max_train_len=max_train_len
)
X.shape, y.shape, X.dtype, y.dtype

100%|██████████| 3793/3793 [1:11:18<00:00,  1.15it/s]


((3793, 180), (3793,), dtype('float64'), dtype('int64'))

In [24]:
portfolio_allocations = pd.DataFrame(
    X[:,:n_portfolios*n_funds], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=allocation_colnames
)
portfolio_features = pd.DataFrame(
    X[:,n_portfolios*n_funds:], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=performance_colnames
)
portfolio_test_performances = pd.DataFrame(
    all_performances,
    index=funds_df.index[min_test_idx:min_test_idx+all_performances.shape[0]],
    columns=portfolio_names
)
optimal_portfolios = pd.Series(
    y, 
    index=funds_df.index[min_test_idx:min_test_idx+y.shape[0]]
).map(lambda i: portfolio_names[i])

In [25]:
filename = f"18-funds-data-clipped-{max_train_len}-{pd.datetime.now().date()}"
print(filename)
portfolio_allocations.to_csv(f"{filename}-allocations.csv", index=True)
portfolio_features.to_csv(f"{filename}-features.csv", index=True)
portfolio_test_performances.to_csv(f"{filename}-test-performances.csv", index=True)
optimal_portfolios.to_csv(f"{filename}-optimal-portfolios.csv", index=True)

18-funds-data-clipped-1260-2019-03-17
