# Machine Learning Asset Allocation

### Loading Libraries

In [1]:
# Randomness
import random

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas import Timestamp

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline

# Date & Time
from datetime import datetime, timedelta

# Typing
from typing import Tuple, List, Dict, Union, Optional, Any, Generator

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection._split import _BaseKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

# Scientific Statistical Python
import scipy.cluster.hierarchy as sch
from scipy.stats import jarque_bera
from scipy.stats import rv_continuous, kstest, norm

#### Inverse-Variance Portfolio

In [2]:
def get_ivp(cov: np.ndarray, **kargs) -> np.ndarray:
    """Compute inverse-variance portfolio weights.

    Each asset is weighted by the reciprocal of its variance (the diagonal
    of ``cov``); the weights are then rescaled so they sum to one.

    Args:
        cov: Square covariance matrix.
        **kargs: Accepted and ignored, so the function can be called with
            extra keyword arguments (e.g. ``corr=``).

    Returns:
        1-D array of weights that sums to one.
    """
    inverse_variances = 1.0 / np.diag(cov)
    return inverse_variances / inverse_variances.sum()

#### Variance per Cluster

In [3]:
def get_cluster_var(cov: np.ndarray, cItems: np.ndarray) -> float:
    """Return the variance of a cluster under inverse-variance weights.

    NOTE: although annotated as ``np.ndarray``, ``cov`` is sliced with
    ``.loc`` below, so it must be a label-indexed object such as a pandas
    DataFrame.

    Args:
        cov: Covariance matrix indexed by asset label on both axes.
        cItems: Labels of the assets that form the cluster.

    Returns:
        The scalar ``w' V w`` where ``V`` is the cluster's covariance
        sub-matrix and ``w`` its inverse-variance weights.
    """
    # Covariance restricted to the cluster's members.
    sub_cov = cov.loc[cItems, cItems]
    # Column vector of inverse-variance weights for the cluster.
    weights = get_ivp(sub_cov).reshape(-1, 1)
    row_times_cov = np.dot(weights.T, sub_cov)
    quadratic_form = np.dot(row_times_cov, weights)
    return quadratic_form[0, 0]

#### Quasi-Diagonalization

In [4]:
def get_quasi_diag(link: np.ndarray) -> list:
    """Order the original items so that clustered items sit next to each other.

    Starting from the last (top-level) merge of the linkage matrix, every
    cluster id is repeatedly replaced by its two constituents until only
    original item ids remain.

    Args:
        link: Linkage matrix with one row per merge. Columns 0 and 1 hold
            the ids of the merged members and column 3 the number of
            original items in the new cluster (the layout returned by
            ``scipy.cluster.hierarchy.linkage``).

    Returns:
        List of original item ids in quasi-diagonal order.
    """
    link = link.astype(int)
    sortIx = pd.Series([link[-1, 0], link[-1, 1]])
    # The last merge contains every item, so its count is the item total.
    numItems = link[-1, 3]
    # Ids >= numItems denote clusters that still have to be expanded.
    while sortIx.max() >= numItems:
        # Spread entries over even positions to leave a gap after each one.
        sortIx.index = range(0, sortIx.shape[0] * 2, 2)
        clusters = sortIx[sortIx >= numItems]
        i = clusters.index
        j = clusters.values - numItems  # rows of `link` describing them
        sortIx.loc[i] = link[j, 0]  # first member takes the cluster's slot
        second = pd.Series(link[j, 1], index=i + 1)  # second fills the gap
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        sortIx = pd.concat([sortIx, second]).sort_index()
        sortIx.index = range(sortIx.shape[0])  # back to contiguous positions
    return sortIx.tolist()

#### Hierarchical Risk Parity

In [5]:
def get_rec_bipart(cov: pd.DataFrame, sortIx: list) -> pd.Series:
    """Allocate weights by recursive bisection of the ordered item list.

    The ordered list is split in half repeatedly. At each split the weight
    of the two halves is rescaled in inverse proportion to their cluster
    variances (as computed by ``get_cluster_var``).

    Args:
        cov: Covariance matrix indexed by item label on both axes.
        sortIx: Item labels in quasi-diagonal order.

    Returns:
        Series of weights indexed by the labels in ``sortIx``.
    """
    # Start from float weights: an integer Series would have to be upcast
    # on the first float assignment, which recent pandas versions deprecate.
    w = pd.Series(1.0, index=sortIx)
    cItems = [sortIx]  # all items start in a single cluster
    while len(cItems) > 0:
        # Bisect every cluster with more than one item; singletons drop out.
        cItems = [
            cluster[start:stop]
            for cluster in cItems
            for start, stop in (
                (0, len(cluster) // 2),
                (len(cluster) // 2, len(cluster)),
            )
            if len(cluster) > 1
        ]
        # Halves are stored in consecutive pairs: (left, right).
        for i in range(0, len(cItems), 2):
            cItems0 = cItems[i]
            cItems1 = cItems[i + 1]
            cVar0 = get_cluster_var(cov, cItems0)
            cVar1 = get_cluster_var(cov, cItems1)
            # The lower-variance half receives the larger share.
            alpha = 1 - cVar0 / (cVar0 + cVar1)
            w.loc[cItems0] *= alpha
            w.loc[cItems1] *= 1 - alpha
    return w

#### Correlation-Based Distance Matrix

In [6]:
def correl_dist(corr: np.ndarray) -> np.ndarray:
    """Convert correlations into distances via ``sqrt((1 - corr) / 2)``.

    Args:
        corr: Correlation matrix (any object supporting element-wise
            arithmetic).

    Returns:
        Object of the same shape holding the element-wise distances; a
        correlation of 1 maps to 0 and a correlation of -1 maps to 1.
    """
    half_complement = (1 - corr) / 2.0
    return half_complement ** 0.5

#### Correlation Matrix Heat Maps

In [7]:
def plot_corr_matrix(corr: np.ndarray, labels: list = None, size: tuple = (9, 9)) -> None:
    """Draw a heat map of ``corr`` and show it.

    Args:
        corr: Square matrix to plot.
        labels: Tick labels for both axes; ``None`` is treated as an empty
            list.
        size: Figure size passed to ``plt.subplots``.
    """
    fig, ax = plt.subplots(figsize=size)
    tick_labels = [] if labels is None else list(labels)
    # heatmap draws on the current axes and returns them.
    ax = sns.heatmap(corr)
    # Ticks at 0.5, 1.5, ... i.e. the centre of each heat-map cell.
    tick_positions = np.arange(0.5, corr.shape[0] + 0.5)
    ax.set_yticks(tick_positions, tick_labels)
    ax.set_xticks(tick_positions, tick_labels)
    plt.show()

#### Data Generation

In [8]:
def generate_data(nObs: int, size0: int, size1: int, sigma1: float) -> Tuple[pd.DataFrame, list]:
    """Generate a reproducible data set with correlated columns.

    Both random generators are seeded with 42. ``size0`` independent
    standard-normal columns are drawn, then ``size1`` further columns are
    built as randomly chosen copies of them plus Gaussian noise.

    Args:
        nObs: Number of rows.
        size0: Number of independent columns.
        size1: Number of derived (correlated) columns.
        sigma1: Standard deviation of the noise added to derived columns.

    Returns:
        Tuple ``(x, cols)``: ``x`` is a DataFrame with columns labelled
        ``1 .. size0 + size1``; ``cols`` lists, for each derived column, the
        zero-based position of the independent column it was copied from.
    """
    np.random.seed(42)
    random.seed(42)
    independent = np.random.normal(0, 1, size=(nObs, size0))
    cols = [random.randint(0, size0 - 1) for _ in range(size1)]
    noise = np.random.normal(0, sigma1, size=(nObs, len(cols)))
    derived = independent[:, cols] + noise
    combined = np.concatenate((independent, derived), axis=1)
    # Column labels are 1-based.
    frame = pd.DataFrame(combined, columns=range(1, combined.shape[1] + 1))
    return frame, cols

#### Running Simulation

In [9]:
def run_simulation() -> None:
    """Run the HRP example end to end on generated data.

    Generates the data, prints which column each derived column was copied
    from, plots the correlation matrix before and after quasi-diagonal
    reordering, and prints the resulting HRP weights.
    """
    n_obs = 10000
    n_independent = 5
    n_derived = 5
    noise_sigma = 0.25

    data, source_cols = generate_data(n_obs, n_independent, n_derived, noise_sigma)
    # Pairs of (source column label, derived column label), both 1-based.
    pairs = []
    for offset, source in enumerate(source_cols, 1):
        pairs.append((source + 1, n_independent + offset))
    print(pairs)

    cov = data.cov()
    corr = data.corr()
    plot_corr_matrix(corr, labels=corr.columns, size=(8, 6.5))

    # Cluster on the correlation-based distance matrix.
    link = sch.linkage(correl_dist(corr), 'single')
    positions = get_quasi_diag(link)
    ordered_labels = corr.index[positions].tolist()  # positions -> labels
    reordered = corr.loc[ordered_labels, ordered_labels]
    plot_corr_matrix(reordered, labels=reordered.columns, size=(8, 6.5))

    weights = get_rec_bipart(cov, ordered_labels)
    print(weights)
#### Random Shocks

In [10]:
def generate_data_mc(nObs: int, sLength: int, size0: int, size1: int, mu0: float, sigma0: float, sigma1F: float) -> Tuple[np.ndarray, list]:
    """Generate one data set with correlated columns and injected shocks.

    ``size0`` independent normal columns are drawn, followed by ``size1``
    columns built as randomly chosen copies of them plus noise. Two pairs
    of rows, drawn from ``[sLength, nObs - 1)``, are then overwritten with
    the values -0.5 and 2.

    Args:
        nObs: Number of rows.
        sLength: Lowest row index eligible for a shock.
        size0: Number of independent columns.
        size1: Number of derived (correlated) columns.
        mu0: Mean of the independent columns.
        sigma0: Standard deviation of the independent columns.
        sigma1F: Noise standard deviation as a multiple of ``sigma0``.

    Returns:
        Tuple ``(x, cols)``: ``x`` is the ``(nObs, size0 + size1)`` array and
        ``cols`` lists the source column position of each derived column.
    """
    independent = np.random.normal(mu0, sigma0, size=(nObs, size0))
    cols = [random.randint(0, size0 - 1) for _ in range(size1)]
    noise = np.random.normal(0, sigma0 * sigma1F, size=(nObs, len(cols)))
    x = np.concatenate((independent, independent[:, cols] + noise), axis=1)

    shock = np.array([-0.5, 2])

    # Shock written to two columns at once: the source of the first derived
    # column and that derived column itself (position `size0`).
    shock_rows = np.random.randint(sLength, nObs - 1, size=2)
    x[np.ix_(shock_rows, [cols[0], size0])] = np.column_stack((shock, shock))

    # Shock written to a single column: the source of the last derived column.
    shock_rows = np.random.randint(sLength, nObs - 1, size=2)
    x[shock_rows, cols[-1]] = shock
    return x, cols

#### Hierarchical Risk Parity Portfolio

In [11]:
def get_hrp(cov: np.ndarray, corr: np.ndarray) -> pd.Series:
    """Build a Hierarchical Risk Parity allocation.

    Steps: correlation distance, single-linkage clustering, quasi-diagonal
    ordering, recursive bisection.

    Args:
        cov: Covariance matrix.
        corr: Correlation matrix matching ``cov``.

    Returns:
        Series of weights sorted by index.
    """
    corr_frame = pd.DataFrame(corr)
    cov_frame = pd.DataFrame(cov)
    link = sch.linkage(correl_dist(corr_frame), 'single')
    positions = get_quasi_diag(link)
    ordered_labels = corr_frame.index[positions].tolist()  # positions -> labels
    weights = get_rec_bipart(cov_frame, ordered_labels)
    return weights.sort_index()

#### Monte Carlo Hierarchical Portfolio

In [13]:
def hrp_mc(
    numIters: int = 100, nObs: int = 520, size0: int = 5, size1: int = 5, mu0: float = 0,
    sigma0: float = 1e-2, sigma1F: float = 0.25, sLength: int = 260, rebal: int = 22
) -> None:
    """Compare IVP and HRP out-of-sample in a Monte Carlo experiment.

    For each iteration a data set is generated, weights are re-estimated
    every ``rebal`` rows from the preceding ``sLength`` rows, and the
    resulting out-of-sample returns are compounded into one total return
    per method. The standard deviation and variance of those total returns
    across iterations, and the variance relative to HRP, are printed.

    Args:
        numIters: Number of Monte Carlo iterations.
        nObs: Rows per generated data set.
        size0: Number of independent columns.
        size1: Number of derived (correlated) columns.
        mu0: Mean of the independent columns.
        sigma0: Standard deviation of the independent columns.
        sigma1F: Noise standard deviation as a multiple of ``sigma0``.
        sLength: Number of rows in each in-sample estimation window.
        rebal: Number of rows between rebalances.
    """
    methods = [get_ivp, get_hrp]
    # One list of total returns per method, one entry per iteration.
    stats = {func.__name__: [] for func in methods}
    pointers = range(sLength, nObs, rebal)

    numIter = 0
    while numIter < numIters:
        x, _ = generate_data_mc(nObs, sLength, size0, size1, mu0, sigma0, sigma1F)
        # Out-of-sample return chunks per method, concatenated afterwards
        # (Series.append was removed in pandas 2.0).
        r = {func.__name__: [] for func in methods}

        for pointer in pointers:
            # In-sample window: estimate covariance and correlation.
            x_ = x[pointer - sLength: pointer]
            cov_, corr_ = np.cov(x_, rowvar=0), np.corrcoef(x_, rowvar=0)

            # Out-of-sample window: apply the weights until the next rebalance.
            x_ = x[pointer: pointer + rebal]
            for func in methods:
                w_ = func(cov=cov_, corr=corr_)  # callback
                r[func.__name__].append(pd.Series(np.dot(x_, w_)))

        for func in methods:
            r_ = pd.concat(r[func.__name__], ignore_index=True)
            p_ = (1 + r_).cumprod()
            stats[func.__name__].append(p_.iloc[-1] - 1)
        numIter += 1

    stats = pd.DataFrame.from_dict(stats, orient='columns', dtype=float)
    df0, df1 = stats.std(), stats.var()
    print(pd.concat([df0, df1, df1 / df1['get_hrp'] - 1], axis=1))