# Stacking

### Idea
Because training base models is hard, a possibly better solution is to take a lot of classical portfolio-creation models
and train a 2nd-layer model on their outputs to select the optimal one.

The challenges we need to solve in this notebook are the following:
- selecting a target variable and metric
- generating as many simple models as possible
- adding features related to long-term trends for the 2nd-layer model, so that it can make a "more informed" choice
- choosing whether base models should be allowed to see the entire past that we have, or just the most recent fraction of it

### Loading data

In [None]:
import pandas as pd

# Directory that holds every input CSV; each path below is built relative to it.
prefix = '../input/'
FUNDS_CSV = f'{prefix}Quant_Invest_Fundusze.csv'
STOCKS_CSV = f'{prefix}all_indices_close.csv'
COMMODITIES_CSV = f'{prefix}all_commodities_close.csv'
RATES_CSV = f'{prefix}policy_rates.csv'
FX_CSV = f'{prefix}exchange_rates.csv'


def _load_indexed(path, index_col='Daty', sep=',') -> pd.DataFrame:
    """Read a CSV, parse the `index_col` column as a datetime index and return the rows sorted by it."""
    frame = pd.read_csv(path, sep=sep, index_col=index_col)
    frame.index = pd.to_datetime(frame.index)
    frame.sort_index(inplace=True)
    return frame


def load_funds() -> pd.DataFrame:
    """Load the funds CSV: semicolon-separated, indexed by its 'Daty' column."""
    return _load_indexed(FUNDS_CSV, index_col='Daty', sep=';')


def load_stocks() -> pd.DataFrame:
    """Load the stock indices CSV: comma-separated, indexed by its 'Data' column."""
    return _load_indexed(STOCKS_CSV, index_col='Data', sep=',')


def load_commodities() -> pd.DataFrame:
    """Load the commodities CSV: comma-separated, indexed by its 'Data' column."""
    return _load_indexed(COMMODITIES_CSV, index_col='Data', sep=',')


def load_rates() -> pd.DataFrame:
    """
    Load policy rates as a wide table: one row per 'date', one column per 'reference_area',
    cells taken from 'obs_value'. The last 42 rows are dropped.
    """
    long_format = pd.read_csv(RATES_CSV, index_col=0)
    wide = long_format.pivot(index='date', columns='reference_area', values='obs_value')
    wide.index = pd.to_datetime(wide.index)
    # keep only years 2000-2018
    return wide.iloc[:-42]


def load_fx() -> pd.DataFrame:
    """
    Load exchange rates indexed by the parsed 'Date' column.
    Every column after the first one is renamed to the text found between
    its first '(' and its first ')'; the 'Date' column itself is dropped.
    """
    fx = pd.read_csv(FX_CSV, index_col=0)
    fx.index = pd.to_datetime(fx['Date'])
    short_codes = {
        label: label[label.index('(') + 1:label.index(')')]
        for label in fx.columns[1:]
    }
    return fx.rename(columns=short_codes).drop(columns=['Date'])


def load_all() -> pd.DataFrame:
    """Load every data source and place them side by side (column-wise concat on the index)."""
    loaders = (load_funds, load_stocks, load_commodities, load_rates, load_fx)
    return pd.concat([load() for load in loaders], axis='columns')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load everything, then keep only the rows in which the 'AP' column has a value.
df = load_all()
df = df.loc[df['AP'].notna()]
df.head()

In [None]:
df.shape

In [None]:
# Names of the fund columns; they match the fields of the Portfolio dataclass.
fund_colnames = ['AP', 'ARR', 'ARW', 'G', 'OP', 'ORR', 'ORW']

In [None]:
# Restrict the combined table to the fund columns only.
funds_df = df[fund_colnames]

In [None]:
funds_df.shape

### Feature and target selection

Firstly, we will try to train a classifier that selects the best performing model.

In [None]:
from typing import List, Callable, Tuple
from dataclasses import dataclass, asdict

from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
from pypfopt.efficient_frontier import EfficientFrontier

from numba import jit

from tqdm import tqdm

In [None]:
# Non-missing rows per calendar year and fund column; averaged over years, then over columns.
rows_per_year = funds_df.groupby(funds_df.index.year).count()
year_days = int(rows_per_year.mean().mean())
month_days = int(year_days / 12)
week_days = int(month_days / 4)
print(f"Using default number of trading days in a year={year_days}, month={month_days}, week={week_days}")

# Threshold read by portfolio_performance when rejecting portfolios.
max_volatility = 0.1
print(f"Using volatility threshold={max_volatility}")

In [None]:
@dataclass
class Portfolio:
    """Allocation across the seven funds, one float field per fund."""
    AP: float
    ARR: float
    ARW: float
    G: float
    OP: float
    ORR: float
    ORW: float

    def as_weights(self):
        """Return the field values as a 1-D NumPy array, in field declaration order."""
        field_values = asdict(self).values()
        return np.array([*field_values])


def risk_free_rate(table, interval_len_in_days: int=1) -> float:
    """
    For the purpose of this portfolio selection, we use G (cash fund returns) as a risk-free rate.

    table: frame with a 'G' column whose consecutive rows are `interval_len_in_days` trading days apart.
    interval_len_in_days: spacing of the rows of `table`, used to scale the mean return to a year.
    Returns the mean row-to-row return of G multiplied by the number of intervals in a year.
    """
    intervals_in_a_year = year_days / interval_len_in_days
    cash = table['G']
    # (current - previous) / previous; the first row has no predecessor and yields NaN, which mean() skips
    period_returns = cash.diff() / cash.shift(1)
    return float(period_returns.mean() * intervals_in_a_year)


def _ef_builder(table):
    """ Builder for all kinds of efficient frontier models. """
    # NOTE(review): whatever sampling `table` has, only the last row of each calendar month is kept here,
    # and the pypfopt estimators are called with their default settings - confirm that this is intended.
    table = table.copy().groupby(by=[table.index.year, table.index.month]).tail(n=1)
    mu = mean_historical_return(table)
    S = CovarianceShrinkage(table).ledoit_wolf()
    return EfficientFrontier(mu, S)

def _ef_meta_builder(ef):
    """ 
    Common metadata for all kinds of efficient frontier models:
    expected annualized returns, expected volatility, sharpe ratio
    """
    # NOTE(review): portfolio_performance() is called without arguments, so the reported Sharpe ratio
    # presumably uses the library's default risk-free rate rather than risk_free_rate() - confirm.
    return np.array(ef.portfolio_performance())

def ef_max_sharpe(table, interval: int=1):
    """ Max-Sharpe allocation for `table`; returns (Portfolio, metadata array). """
    ef = _ef_builder(table)
    # pass the table itself plus its row spacing, so the rate is computed from this data and annualized correctly
    weights = ef.max_sharpe(risk_free_rate(table, interval))
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_min_volatility(table, interval: int=1):
    """ Minimum-volatility allocation for `table`; returns (Portfolio, metadata array). `interval` is unused. """
    ef = _ef_builder(table)
    weights = ef.min_volatility()
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_efficient_risk(table, risk_target: float, interval: int=1):
    """ Allocation for the given `risk_target`; returns (Portfolio, metadata array). """
    ef = _ef_builder(table)
    weights = ef.efficient_risk(risk_target, risk_free_rate=risk_free_rate(table, interval))
    return Portfolio(**weights), _ef_meta_builder(ef)

def ef_efficient_return(table, target_return, interval: int=1):
    """ Allocation reaching `target_return`; returns (Portfolio, metadata array). `interval` is unused. """
    frontier = _ef_builder(table)
    allocation = Portfolio(**frontier.efficient_return(target_return))
    return allocation, _ef_meta_builder(frontier)

In [None]:
def calculate_features(funds_df: pd.DataFrame) -> Tuple[np.array, List[Portfolio]]:
    """
    Returns features for the model along with the portfolios that were used for calculating these features.

    Every base model is fitted on `funds_df` as given, on its last row per month and on its last row per week.
    The feature vector is all portfolio weights (model by model) followed by all model metadata.
    """
    monthly_table = funds_df.groupby([funds_df.index.year, funds_df.index.month]).tail(1)
    # Group by weekly periods rather than (calendar year, ISO week number): late-December days that fall
    # into ISO week 1 would otherwise share a group with the first week of January of the same calendar year.
    # This also avoids the `DatetimeIndex.week` accessor, which newer pandas no longer provides.
    weekly_table = funds_df.groupby(funds_df.index.to_period('W')).tail(1)
    portfolio_data = [
        ef_max_sharpe(funds_df),
        ef_max_sharpe(monthly_table, month_days),
        ef_max_sharpe(weekly_table, week_days),
        ef_min_volatility(funds_df),
        ef_min_volatility(monthly_table, month_days),
        ef_min_volatility(weekly_table, week_days),
        ef_efficient_risk(funds_df, 0.10),
        ef_efficient_risk(funds_df, 0.05),
        ef_efficient_risk(monthly_table, 0.10, month_days),
        ef_efficient_risk(monthly_table, 0.05, month_days),
        ef_efficient_risk(weekly_table, 0.10, week_days),
        ef_efficient_risk(weekly_table, 0.05, week_days),
        ef_efficient_return(funds_df, 0.04),
        ef_efficient_return(funds_df, 0.08),
        ef_efficient_return(monthly_table, 0.04, month_days),
        ef_efficient_return(monthly_table, 0.08, month_days),
        ef_efficient_return(weekly_table, 0.04, week_days),
        ef_efficient_return(weekly_table, 0.08, week_days),
    ]
    # each entry is a (Portfolio, metadata) pair; unpack without reusing the name of the pandas alias
    portfolios = [portfolio for portfolio, _ in portfolio_data]
    portfolio_weights = np.hstack([p.as_weights() for p in portfolios])
    portfolio_features = np.hstack([metadata for _, metadata in portfolio_data])
    return np.hstack([portfolio_weights, portfolio_features]), portfolios
    

@jit(nopython=True)
def portfolio_performance(portfolio_allocation: np.array, fund_df_values: np.array) -> float:
    """
    Score one allocation on a window of fund values.

    portfolio_allocation: one weight per fund column.
    fund_df_values: 2-D array, rows are consecutive observations and columns are funds.
    Returns the allocation-weighted first-to-last return of the funds, or -inf when the
    variance of the row-to-row returns of the weighted value series exceeds `max_volatility`.
    """
    # relative change of every fund between the first and the last row of the window
    fund_returns = (fund_df_values[-1] - fund_df_values[0]) / fund_df_values[0]
    # dividing by the sum of weights normalizes allocations that do not sum to 1
    portfolio_returns = np.sum(portfolio_allocation*fund_returns) / np.sum(portfolio_allocation)
    # eliminating portfolios based on too high volatility
    portfolio_period_values = np.sum(portfolio_allocation*fund_df_values[:-1], axis=1) / np.sum(portfolio_allocation)
    portfolio_period_shifted = np.sum(portfolio_allocation*fund_df_values[1:], axis=1) / np.sum(portfolio_allocation)
    portfolio_daily_returns = (portfolio_period_shifted - portfolio_period_values) / portfolio_period_values
    # NOTE(review): this is the variance (np.var) of the returns, not their standard deviation,
    # and it is not annualized - confirm that comparing it with `max_volatility` is intended.
    portfolio_volatility = np.var(portfolio_daily_returns)
    if portfolio_volatility > max_volatility:
        print("Exceeded max volatility:", portfolio_volatility)
        return -np.inf
    else:
        return portfolio_returns

In [None]:
%%time
features, pfs = calculate_features(funds_df)
print(features.shape, len(pfs), pfs[0])

In [None]:
portfolio_performance(np.array([0,0,0,0,0,0,1]), funds_df.values)

In [None]:
funds_df.shape

In [None]:
def compute_X_y(
        funds_df: pd.DataFrame,
        n_features: int,
        n_portfolios: int=18,
        min_test_idx: int=3*year_days, 
        test_len: int=year_days,
        max_train_len: int=None,
    ):
    """
    Build the rolling-window dataset for the 2nd-layer model.

    For every sample, the rows before the test start (optionally only the last `max_train_len` of them)
    are fed to calculate_features, and each resulting portfolio is scored on the following `test_len` rows.

    funds_df: fund table, rows ordered in time.
    n_features: length of the feature vector returned by calculate_features.
    n_portfolios: number of portfolios returned by calculate_features.
    min_test_idx: row position at which the first test window starts.
    test_len: number of rows in each test window.
    max_train_len: if given, maximum number of most recent rows visible to the base models.

    Returns (X, y, all_performances, sample_weights): features per sample, index of the best performing
    portfolio, performance of every portfolio, and 1 + (best performance - mean of the finite performances).
    """
    max_test_idx = len(funds_df)
    n_samples = max_test_idx - min_test_idx - test_len
    X = np.zeros((n_samples, n_features))
    all_performances = np.zeros((n_samples, n_portfolios))
    # builtin int instead of the `np.int` alias, which current NumPy releases no longer provide
    y = np.zeros(n_samples, dtype=int)
    sample_weights = np.zeros(n_samples)
    for idx in tqdm(range(n_samples)):
        test_starting_idx = min_test_idx + idx
        if max_train_len is None:
            train_start = 0
        else:
            train_start = max(0, test_starting_idx - max_train_len)
        present_data = funds_df.iloc[train_start:test_starting_idx]
        test_data = funds_df.iloc[test_starting_idx:test_starting_idx + test_len]
        X[idx], portfolios = calculate_features(present_data)
        test_values = test_data.values
        portfolio_performances = np.array([portfolio_performance(p.as_weights(), test_values) for p in portfolios])
        all_performances[idx] = portfolio_performances
        y[idx] = np.argmax(portfolio_performances)
        # rejected portfolios are scored -inf and must not take part in the mean
        finite_performances = portfolio_performances[portfolio_performances > -np.inf]
        sample_weights[idx] = 1 + np.max(portfolio_performances) - np.mean(finite_performances)
    return X, y, all_performances, sample_weights

## 1st run: entire timespan available

In [None]:
# row position at which the first test window starts
min_test_idx = 3*year_days

# number of base-model portfolios produced by calculate_features
n_portfolios = 18
# number of fund columns, i.e. weights per portfolio
n_funds = 7
# metadata values per portfolio: expected returns, expected volatility, sharpe ratio
n_portfolio_features = 3

In [None]:
# 18 portfolios * (7 weights + 3 metadata values) = 180 features per sample
X, y, all_performances, sample_weights = compute_X_y(
    funds_df,
    n_features=n_portfolios * (n_funds + n_portfolio_features),
    min_test_idx=min_test_idx,
)
X.shape, y.shape, X.dtype, y.dtype

In [None]:
assert(n_portfolios*(n_funds+n_portfolio_features) == X.shape[1])
# base-models were fitted on a rolling-window dataset, so we don't have the targets for its beginning:
funds_df.shape, X.shape

In [None]:
# some utilities for converting numpy arrays back to indexed dataframes for simpler analysis:
import re

def generator_to_name(gen: str) -> str:
    """
    Turn the source text of a base-model call into a column-friendly name,
    e.g. 'ef_max_sharpe(monthly_table, month_days)' -> 'ef_max_sharpe_monthly'.

    'funds_df' becomes 'daily', '_table' and any '<word>_days' argument are removed, closing
    parentheses are dropped and runs of '(', ',' and ' ' collapse into a single '_'.
    One trailing '_' is stripped. An empty result is returned as-is.
    """
    name = gen.replace('funds_df', 'daily').replace('_table', '')
    # raw strings: '\w', '\)' and '\(' are invalid escapes in ordinary string literals
    name = re.sub(r'\w+_days', '', name)
    name = re.sub(r'\)+', '', name)
    name = re.sub(r'[\(, ]+', '_', name)
    # endswith() is safe on an empty string, unlike indexing with [-1]
    return name[:-1] if name.endswith('_') else name

# One readable name per base model, listed in the same order as calculate_features builds them.
portfolio_names = [
    generator_to_name(call_text)
    for call_text in [
        'ef_max_sharpe(funds_df)',
        'ef_max_sharpe(monthly_table, month_days)',
        'ef_max_sharpe(weekly_table, week_days)',
        'ef_min_volatility(funds_df)',
        'ef_min_volatility(monthly_table, month_days)',
        'ef_min_volatility(weekly_table, week_days)',
        'ef_efficient_risk(funds_df, 0.10)',
        'ef_efficient_risk(funds_df, 0.05)',
        'ef_efficient_risk(monthly_table, 0.10, month_days)',
        'ef_efficient_risk(monthly_table, 0.05, month_days)',
        'ef_efficient_risk(weekly_table, 0.10, week_days)',
        'ef_efficient_risk(weekly_table, 0.05, week_days)',
        'ef_efficient_return(funds_df, 0.04)',
        'ef_efficient_return(funds_df, 0.08)',
        'ef_efficient_return(monthly_table, 0.04, month_days)',
        'ef_efficient_return(monthly_table, 0.08, month_days)',
        'ef_efficient_return(weekly_table, 0.04, week_days)',
        'ef_efficient_return(weekly_table, 0.08, week_days)',
    ]
]

# Column names for the weight part of X: every base model crossed with every fund column.
allocation_colnames = [
    f'{model_name}_{fund}'
    for model_name in portfolio_names
    for fund in funds_df.columns
]
print(len(allocation_colnames))
# Column names for the metadata part of X: every base model crossed with its three metrics.
performance_colnames = [
    f'{model_name}_{metric_name}'
    for model_name in portfolio_names
    for metric_name in ['e_returns', 'e_volatility', 'sharpe']
]
print(len(performance_colnames))
assert len(performance_colnames) + len(allocation_colnames) == X.shape[1]

In [None]:
# Rows of X / all_performances / y are samples; sample i is labelled with the index entry
# at position min_test_idx + i, i.e. the row at which its test window starts.

# first n_portfolios*n_funds columns of X: the weights of every base-model portfolio
portfolio_allocations = pd.DataFrame(
    X[:,:n_portfolios*n_funds], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=allocation_colnames
)
# remaining columns of X: the metadata of every base model
portfolio_features = pd.DataFrame(
    X[:,n_portfolios*n_funds:], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=performance_colnames
)
# performance of every portfolio on the test window of each sample
portfolio_test_performances = pd.DataFrame(
    all_performances,
    index=funds_df.index[min_test_idx:min_test_idx+all_performances.shape[0]],
    columns=portfolio_names
)
# name of the best performing portfolio for each sample (y holds its position in portfolio_names)
optimal_portfolios = pd.Series(
    y, 
    index=funds_df.index[min_test_idx:min_test_idx+y.shape[0]]
).map(lambda i: portfolio_names[i])

In [None]:
# Export the first run; every file name carries today's date.
# pd.Timestamp.now() replaces the `pd.datetime` alias, which current pandas releases no longer provide.
filename = "18-funds-data-" + str(pd.Timestamp.now().date())
print(filename)
portfolio_allocations.to_csv(f"{filename}-allocations.csv", index=True)
portfolio_features.to_csv(f"{filename}-features.csv", index=True)
portfolio_test_performances.to_csv(f"{filename}-test-performances.csv", index=True)
optimal_portfolios.to_csv(f"{filename}-optimal-portfolios.csv", index=True)

## 2nd run: model input contains no more than the last 5 years of data

In [None]:
# row position at which the first test window starts
min_test_idx = 3*year_days
# base models see at most this many of the most recent rows
max_train_len = 5*year_days

In [None]:
# Same dataset as in the 1st run, but the base models only see the last `max_train_len` rows.
# 18 portfolios * (7 weights + 3 metadata values) = 180 features per sample
X, y, all_performances, sample_weights = compute_X_y(
    funds_df,
    n_features=n_portfolios * (n_funds + n_portfolio_features),
    max_train_len=max_train_len,
    min_test_idx=min_test_idx,
)
X.shape, y.shape, X.dtype, y.dtype

In [None]:
# Rows of X / all_performances / y are samples; sample i is labelled with the index entry
# at position min_test_idx + i, i.e. the row at which its test window starts.

# first n_portfolios*n_funds columns of X: the weights of every base-model portfolio
portfolio_allocations = pd.DataFrame(
    X[:,:n_portfolios*n_funds], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=allocation_colnames
)
# remaining columns of X: the metadata of every base model
portfolio_features = pd.DataFrame(
    X[:,n_portfolios*n_funds:], 
    index=funds_df.index[min_test_idx:min_test_idx+X.shape[0]],
    columns=performance_colnames
)
# performance of every portfolio on the test window of each sample
portfolio_test_performances = pd.DataFrame(
    all_performances,
    index=funds_df.index[min_test_idx:min_test_idx+all_performances.shape[0]],
    columns=portfolio_names
)
# name of the best performing portfolio for each sample (y holds its position in portfolio_names)
optimal_portfolios = pd.Series(
    y, 
    index=funds_df.index[min_test_idx:min_test_idx+y.shape[0]]
).map(lambda i: portfolio_names[i])

In [None]:
# Export the second run; the file name carries the training-window length and today's date.
# pd.Timestamp.now() replaces the `pd.datetime` alias, which current pandas releases no longer provide.
filename = f"18-funds-data-clipped-{max_train_len}-{pd.Timestamp.now().date()}"
print(filename)
portfolio_allocations.to_csv(f"{filename}-allocations.csv", index=True)
portfolio_features.to_csv(f"{filename}-features.csv", index=True)
portfolio_test_performances.to_csv(f"{filename}-test-performances.csv", index=True)
optimal_portfolios.to_csv(f"{filename}-optimal-portfolios.csv", index=True)