# Simple Model using Learning-to-Rank and optimization

Input: single principal component based on 6 basic indicators 

Output: relevance based on the ranking of Sharpe ratios for the next period

## Data Pre-processing

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [24]:
import os
# NOTE(review): the path is relative, so re-running this cell changes directory
# again, relative to the directory the previous run switched to.
# expanduser() only expands a leading "~", so it leaves '../data' unchanged.
os.chdir(os.path.expanduser('../data'))

# Load environment variables (python-dotenv looks for a .env file).
from dotenv import load_dotenv
load_dotenv()

# Imported after the chdir above - presumably DataProvider lives in ../data; TODO confirm.
from DataProvider import DataProvider

In [25]:
# Project-local data access object used by every later cell.
data_provider = DataProvider()

# Two results are unpacked: a per-country frame and a benchmark series.
# Both are later passed through np.log(...).diff(), so presumably price levels - TODO confirm.
df_countries, benchmark = data_provider.get_etf_data()
# NOTE(review): acwi_weights is not referenced again in the visible part of the notebook.
acwi_weights = data_provider.get_acwi_weights()

In [26]:
# Skip the first 12 rebalance dates (about one year, assuming monthly dates) so that
# the first modelled period already has enough history behind it.
all_rebalance_days = data_provider.get_days_to_recalculate()
days_to_recalculate = pd.DatetimeIndex(all_rebalance_days.iloc[12:])
days_to_recalculate

DatetimeIndex(['2000-01-04', '2000-02-04', '2000-03-06', '2000-04-04',
               '2000-05-04', '2000-06-06', '2000-07-05', '2000-08-04',
               '2000-09-06', '2000-10-04',
               ...
               '2023-03-03', '2023-04-06', '2023-05-04', '2023-06-06',
               '2023-07-06', '2023-08-03', '2023-09-06', '2023-10-05',
               '2023-11-03', '2023-12-05'],
              dtype='datetime64[ns]', name='Days to rebalance', length=288, freq=None)

In [29]:
# Preview the model input for the first rebalance date only.
indicators = data_provider.calculate_principal_component_from_indicators(
        days_to_recalculate[0], periods=6)
# Transposed so that each row holds 6 values (one per requested period);
# presumably one row per country - TODO confirm against DataProvider.
indicators.T.values

array([[-0.75414967, -0.63148041, -0.68286332, -0.56792239, -0.57738397,
        -0.57691109],
       [-1.37506754, -1.36820842, -1.03389077, -1.13819681, -1.27770767,
        -1.66832854],
       [-0.94391362, -1.12277125, -0.81951497, -0.77896941, -0.97939907,
        -0.88658798],
       [-0.74243555, -0.67863144, -0.63840573, -0.49487319, -0.63207811,
        -0.7273617 ],
       [-1.03896833, -1.13225612, -0.92937268, -0.92076676, -0.89687453,
        -0.91243863],
       [-1.04410309, -1.04850716, -0.82580965, -0.85899611, -0.91826449,
        -0.93418662],
       [-0.98600242, -0.82084437, -1.0142055 , -1.04793988, -1.01108072,
        -0.84102264],
       [-0.8606228 , -0.85611744, -0.74943561, -0.64448667, -0.6643045 ,
        -0.66450215],
       [-0.84852124, -0.81512883, -0.452554  , -0.56783007, -0.72382879,
        -0.68927492],
       [-0.98119477, -1.24935114, -0.98647472, -0.67033475, -0.89899009,
        -1.04748732],
       [-1.97814178, -2.18338445, -2.33245775, -2.

In [31]:
# Feature matrix: for every rebalance date (in chronological order) take the rows of
# the transposed indicator frame, so rows are grouped date by date.
x = np.array([
    feature_row
    for date in days_to_recalculate
    for feature_row in data_provider.calculate_principal_component_from_indicators(
        date, periods=6).T.values
])
x.shape

(7776, 6)

For the target, I calculate the Sharpe ratio of each country for the next period, sort the countries by that value, and assign a relevance from 0 (lowest Sharpe ratio) to 26 (highest Sharpe ratio).

In [56]:
# Worked example for the first period: rows between the first two rebalance dates
# (label-based .loc slicing includes both endpoints).
data_period = df_countries.loc[days_to_recalculate[0]:days_to_recalculate[1]]

# Log returns between consecutive rows; diff() leaves a NaN first row, which dropna() removes.
returns_period = np.log(data_period).diff().dropna()
# Per-country mean return divided by its standard deviation over the period
# (no risk-free rate is subtracted and nothing is annualized here).
sharpe_ratio = returns_period.mean()/returns_period.std()
sharpe_ratio

United States     0.048396
Japan             0.043928
United Kingdom   -0.328966
Canada            0.280395
France            0.178218
Switzerland      -0.173104
Germany           0.209775
Australia        -0.060210
Netherlands      -0.095462
Sweden            0.173481
Hong Kong        -0.064298
Spain            -0.072430
Italy             0.100664
Singapore        -0.213353
Denmark          -0.083756
Finland           0.137049
Belgium          -0.219865
Norway           -0.176771
China            -0.223044
Taiwan            0.304264
India             0.196962
Korea            -0.126472
Brazil            0.319706
Russia            0.197139
South Africa     -0.055532
Mexico            0.145966
Malaysia          0.272365
dtype: float64

In [72]:
# Order the countries from lowest to highest Sharpe ratio; a country's position in
# this ordering (0 = lowest) is what the following cells use as its relevance label.
sorted_idx = np.argsort(sharpe_ratio.values)
sorted_sharpe = sharpe_ratio.iloc[sorted_idx]
sorted_sharpe

United Kingdom   -0.328966
China            -0.223044
Belgium          -0.219865
Singapore        -0.213353
Norway           -0.176771
Switzerland      -0.173104
Korea            -0.126472
Netherlands      -0.095462
Denmark          -0.083756
Spain            -0.072430
Hong Kong        -0.064298
Australia        -0.060210
South Africa     -0.055532
Japan             0.043928
United States     0.048396
Italy             0.100664
Finland           0.137049
Mexico            0.145966
Sweden            0.173481
France            0.178218
India             0.196962
Russia            0.197139
Germany           0.209775
Malaysia          0.272365
Canada            0.280395
Taiwan            0.304264
Brazil            0.319706
dtype: float64

In [68]:
# Print, in selected_countries order, each country's position in the ascending
# Sharpe ordering (0 = lowest Sharpe ratio).
for country in data_provider.selected_countries:
    print(country, sorted_sharpe.index.get_loc(country))

United States 14
Japan 13
United Kingdom 0
Canada 24
France 19
Switzerland 5
Germany 22
Australia 11
Netherlands 7
Sweden 18
Hong Kong 10
Spain 9
Italy 15
Singapore 3
Denmark 8
Finland 16
Belgium 2
Norway 4
China 1
Taiwan 25
India 20
Korea 6
Brazil 26
Russia 21
South Africa 12
Mexico 17
Malaysia 23


In [73]:
# Target vector: for every rebalance date, the relevance of each country for the
# period that starts on that date. Relevance is the country's position in the
# ascending Sharpe-ratio ordering (0 = lowest Sharpe ratio).
y = []
n_dates = len(days_to_recalculate)
for i in range(n_dates):
    period_start = days_to_recalculate[i]
    # The last rebalance date has no successor, so its period runs to the end of
    # the data (an end label of None leaves the .loc slice open-ended).
    # NOTE(review): that final period may be shorter than the others - confirm
    # it contains enough rows for a meaningful standard deviation.
    period_end = days_to_recalculate[i + 1] if i + 1 < n_dates else None
    data_period = df_countries.loc[period_start:period_end]

    # Log returns between consecutive rows; the NaN first row is dropped.
    returns_period = np.log(data_period).diff().dropna()
    sharpe_ratio = returns_period.mean() / returns_period.std()

    sorted_idx = np.argsort(sharpe_ratio.values)
    sorted_sharpe = sharpe_ratio.iloc[sorted_idx]

    # Emit labels in selected_countries order so they line up with the rows of x.
    y.extend(sorted_sharpe.index.get_loc(country)
             for country in data_provider.selected_countries)

y = np.array(y)
y.shape

(7776,)

## Splitting data into 80% train and 20% test

In [74]:
# x holds 27 rows (one per country) for every rebalance date, so the split is made
# on whole dates: the first 80% of the dates go to training.
dates_split = int(0.8 * (x.shape[0] / 27))
test_split = dates_split * 27

In [75]:
# Train
dates_for_training = days_to_recalculate[:dates_split]
x_train, y_train = x[:test_split], y[:test_split]

# Query ids: one id per training date, repeated for each of its 27 country rows
qid_train = np.repeat(np.arange(dates_for_training.shape[0]), 27)

print(dates_for_training)
print(x_train.shape)
print(y_train.shape)
print(qid_train.shape)

DatetimeIndex(['2000-01-04', '2000-02-04', '2000-03-06', '2000-04-04',
               '2000-05-04', '2000-06-06', '2000-07-05', '2000-08-04',
               '2000-09-06', '2000-10-04',
               ...
               '2018-05-04', '2018-06-05', '2018-07-06', '2018-08-03',
               '2018-09-05', '2018-10-04', '2018-11-06', '2018-12-06',
               '2019-01-10', '2019-02-08'],
              dtype='datetime64[ns]', name='Days to rebalance', length=230, freq=None)
(6210, 6)
(6210,)
(6210,)


In [76]:
# Test
dates_for_test = days_to_recalculate[dates_split:]
x_test, y_test = x[test_split:], y[test_split:]

# Query ids: one id per test date, repeated for each of its 27 country rows
qid_test = np.repeat(np.arange(dates_for_test.shape[0]), 27)

print(dates_for_test)
print(x_test.shape)
print(y_test.shape)
print(qid_test.shape)

DatetimeIndex(['2019-03-05', '2019-04-04', '2019-05-07', '2019-06-05',
               '2019-07-05', '2019-08-06', '2019-09-04', '2019-10-04',
               '2019-11-05', '2019-12-04', '2020-01-09', '2020-02-05',
               '2020-03-04', '2020-04-03', '2020-05-06', '2020-06-03',
               '2020-07-07', '2020-08-05', '2020-09-03', '2020-10-07',
               '2020-11-04', '2020-12-03', '2021-01-06', '2021-02-03',
               '2021-03-03', '2021-04-08', '2021-05-05', '2021-06-03',
               '2021-07-07', '2021-08-04', '2021-09-03', '2021-10-06',
               '2021-11-03', '2021-12-03', '2022-01-11', '2022-02-08',
               '2022-03-04', '2022-04-06', '2022-05-06', '2022-06-07',
               '2022-07-06', '2022-08-03', '2022-09-07', '2022-10-06',
               '2022-11-03', '2022-12-06', '2023-01-09', '2023-02-03',
               '2023-03-03', '2023-04-06', '2023-05-04', '2023-06-06',
               '2023-07-06', '2023-08-03', '2023-09-06', '2023-10-05',
      

## Learn to Rank with XGB Ranker

https://xgboost.readthedocs.io/en/latest/tutorials/learning_to_rank.html

In [80]:
import xgboost as xgb

In [81]:
# General booster / sampling settings.
# NOTE(review): device="cuda" is hard-coded, so this cell presumably needs a
# CUDA-capable XGBoost build and GPU - confirm before running elsewhere.
booster_params = dict(
    n_estimators=512,
    tree_method="hist",
    device="cuda",
    learning_rate=0.01,
    reg_lambda=1.5,
    subsample=0.8,
    sampling_method="gradient_based",
)

# Learning-to-rank specific settings.
# NOTE(review): bias estimation (lambdarank_unbiased) is described in the XGBoost
# tutorial for click data; the labels here are Sharpe-based ranks - confirm it is intended.
ltr_params = dict(
    objective="rank:ndcg",
    # Enable bias estimation
    lambdarank_unbiased=True,
    # Normalization (1 / (norm + 1))
    lambdarank_bias_norm=1,
    # Focus on the top 12 documents
    lambdarank_num_pair_per_sample=12,
    lambdarank_pair_method="topk",
    ndcg_exp_gain=True,
    # NOTE(review): fit() below receives no eval_set, so these metrics are
    # presumably never evaluated or printed - TODO confirm.
    eval_metric=["ndcg@1", "ndcg@3", "ndcg@5", "ndcg@10"],
)

ranker = xgb.XGBRanker(**booster_params, **ltr_params)

# Each qid groups the 27 country rows that belong to one rebalance date.
ranker.fit(x_train, y_train, qid=qid_train, verbose=True)

In [118]:
# Score every (date, country) row of the test set.
scores = ranker.predict(x_test)
# One row per test date, 27 columns: x_test is laid out date by date with 27 country rows each.
reshaped_scores = scores.reshape(-1, 27)

In [121]:
# Rank the countries for the first test date by predicted score.
# The ordering is descending, so position 0 is the highest-scored country
# (the opposite direction to the training labels in y, where 0 is the lowest Sharpe ratio).
first_date_scores = reshaped_scores[0]
sorted_scores_idx = np.argsort(first_date_scores)[::-1]
scored_countries = pd.Series(data=first_date_scores, index=data_provider.selected_countries)
sorted_countries = scored_countries.iloc[sorted_scores_idx]

rank_positions = [
    sorted_countries.index.get_loc(country)
    for country in data_provider.selected_countries
]

ranked_countries = pd.Series(data=rank_positions, index=data_provider.selected_countries)
ranked_countries

United States      9
Japan             17
United Kingdom     5
Canada            25
France            20
Switzerland       12
Germany            8
Australia         18
Netherlands       19
Sweden            24
Hong Kong         13
Spain              3
Italy              1
Singapore          6
Denmark            0
Finland           26
Belgium           22
Norway            23
China             10
Taiwan            21
India             11
Korea             15
Brazil             7
Russia            16
South Africa       2
Mexico            14
Malaysia           4
dtype: int64

## Backtesting

In [None]:
from helper_functions import calculate_returns_for_model, calculate_metrics

In [None]:
# Benchmark performance over the test window, starting at the first test date.
test_start = dates_for_test[0]
benchmark_test = benchmark[test_start:]
benchmark_returns = np.log(benchmark_test).diff().dropna()

# NOTE(review): benchmark_returns are log returns but are compounded here as if they
# were simple returns; confirm this matches what calculate_returns_for_model does
# before changing either side.
cum_benchmark_returns = (1 + benchmark_returns).cumprod() - 1

# Re-add the start date (dropped with the NaN from diff) with a cumulative return
# of 0, then restore chronological order.
cum_benchmark_returns.loc[test_start] = 0
cum_benchmark_returns = cum_benchmark_returns.sort_index()

In [None]:
selected_countries = df_countries.columns

# Rebalance dates for the backtest: the test dates followed by the last available
# date. The closing date is inserted at the end of the index (position
# len(dates_for_test)); a fixed position would land in the middle of the test
# dates and break chronological order.
days_to_rebalance = dates_for_test.insert(
    len(dates_for_test), df_countries.index.values[-1])

# Rows from the first test date onwards, and their log returns between consecutive
# rows (the undefined first row is set to 0).
df_prices_test = df_countries[dates_for_test[0]:]
df_returns_test = np.log(df_prices_test).diff().fillna(0)

In [None]:
# Backtest the fitted ranker on the test dates. `ranker` is the only model fitted in
# this notebook. The *_random_forest result names are kept because the plotting and
# metrics cells read them.
# NOTE(review): calculate_returns_for_model is not visible here - confirm that it
# accepts a ranking model whose predict() returns one score per (date, country) row.
total_returns_random_forest, cum_total_returns_random_forest = calculate_returns_for_model(
    ranker, x_test, dates_for_test, df_returns_test, selected_countries)

In [None]:
# Cumulative returns of the model versus the benchmark over the test window.
# The model plotted here is the XGBoost ranker fitted above, so the legend names it
# as such (the variable name still carries the historical *_random_forest suffix).
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(cum_total_returns_random_forest, label='Model XGB Ranker')
ax.plot(cum_benchmark_returns, label='Benchmark')
ax.set(title='Returns comparison', xlabel='Date', ylabel='Cumulative return')
ax.legend()
plt.show()

In [None]:
# Empty results table with one column per performance metric.
# calculate_metrics is called for its effect on df_results (its return value is
# not used here) - presumably it adds one labelled row per call; TODO confirm.
df_results = pd.DataFrame(columns=['Annual Returns', 
                                   'Annual Volatility',
                                   'Sharpe Ratio',
                                   'Sortino Ratio',
                                   'Max Drawdown',
                                   'Max Time Under Water',
                                   'Calmar Ratio',
                                   'Information Ratio'])

calculate_metrics(benchmark_returns, df_results, 'Benchmark')
# The evaluated model is the XGBoost ranker, so the row is labelled accordingly
# (the variable name still carries the historical *_random_forest suffix).
calculate_metrics(total_returns_random_forest, df_results, 'Model XGB Ranker', benchmark_returns)

df_results