# Supported Vector Regression

In [86]:
import numpy as np
from scipy.optimize import minimize
from tqdm import tqdm
import pandas as pd
import os
from datetime import datetime
import functions
from functions import *
from evaluate import *
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [274]:
# ticker_list = ['REE', 'SAM', 'HAP', 'GMD', 'GIL', 'TMS', 'SAV', 'DHA', 'MHC', 'HAS'] # 10 stocks with the most observations
ticker_list = ['REE', 'SAM', 'HAP'] # 3 stocks with the most observations
limits = {
    'hose':0.07,
    'hnx':0.1,
    'upcom':0.15
}
# Read and merge into 1 dataset

if "stock_data.csv" in os.listdir("data"):
    merged_df = pd.read_csv(
        os.path.join("data", "stock_data.csv"),
        index_col=None
    ).assign(
        date = lambda df : pd.to_datetime(df["date"])
    )
else:
    # Read and merge data
    hnx = pd.read_csv(os.path.join("data", "CafeF.HNX.Upto31.07.2025.csv")).assign(
        floor = "hnx"
    )
    hsx = pd.read_csv(os.path.join("data", "CafeF.HSX.Upto31.07.2025.csv")).assign(
        floor = "hose"
    )
    upcom = pd.read_csv(os.path.join("data", "CafeF.UPCOM.Upto31.07.2025.csv")).assign(
        floor = "upcom"
    )
    indexes = pd.read_csv(os.path.join("data", "CafeF.INDEX.Upto06.08.2025.csv")).assign(
        floor = "index"
    )

    # Rename columns
    hnx, hsx, upcom, indexes = [
        df.rename(columns={
            "<Ticker>":"ticker",
            "<DTYYYYMMDD>":"date",
            "<Open>":"open",
            "<High>":"high",
            "<Low>":"low",
            "<Close>":"close",
            "<Volume>":"volume"
        }) for df in [hnx, hsx, upcom, indexes]
    ]
        
    # Merge and clean data
    # UPCOM has missing tickers for some reason
    merged_df = pd.concat(
        [hnx, hsx, upcom, indexes],
        axis=0
    ).reset_index(drop=True).dropna(subset="ticker")\
    .assign(
        date=lambda df : df["date"].astype(str).apply(lambda x: datetime.strptime(x, "%Y%m%d").date())
    )
    merged_df.to_csv(
        os.path.join("data", "stock_data.csv"),
        index=False
    ) # Save merged data to save time in future runs


data = merged_df.sort_values(["ticker", "date"]).assign(
    returns = lambda df : df.groupby("ticker")["close"].pct_change()
)

data = data.loc[data["ticker"].str.len()==3] # Eliminate ETF, and indeces

data["limit"] = data["floor"].map(limits)
outliers = data.loc[data["returns"].abs() > data["limit"]]
clean_df = data.drop(outliers.index) # Remove outliers
print(f"% of observations removed: {round((len(outliers)/len(data))*100, 2)}%")

% of observations removed: 1.05%


In [273]:
def parkinson_variance(high, low):
    """
    Parkinson variance estimator with high and low prices
    var_{Pt} = [ln(H_t/L_t)^2]/(4*ln2)
    """
    return (np.log(high/low)**2) / (4*np.log(2))

def range_based_covariance_matrix(data:pd.DataFrame) -> pd.DataFrame:
    """
    Compute range-based covariance matrix for multiple assets
    Args:
        data: TxM dataframe in MultiIndex format (asset, price_type). Example: ('HPG', 'high), ('HPG', 'low'),...
    Returns:
        Range-based covariance matrix for multiple assets
    """

    tickers = data.columns.get_level_values(0).unique(0)
    n_assets = len(tickers)
    cov_matrices = {}

    for date, row in data.iterrows():
        variances = {}
        for asset in tickers:
            high_price, low_price = row[asset, 'high'], row[asset, 'low']
            variances[asset] = parkinson_variance(high_price, low_price)

        cov_matrix = pd.DataFrame(
            np.zeros((n_assets, n_assets)),
            index=tickers,
            columns=tickers
        ) # Initialize covariance matrix

        # Fill diagonal with variances estimated with Parkinson
        for asset in tickers:
            cov_matrix.loc[asset, asset] = variances[asset]

        # Off-diagonals
        for i, asset_i in enumerate(tickers):
            for j, asset_j in enumerate(tickers):
                if j>i: # Only get the upper triangular
                    high_sum = row[asset_i, "high"] + row[(asset_j, 'high')]
                    low_sum = row[asset_i, "low"] + row[asset_j, 'low']
                    var_sum = parkinson_variance(high_sum, low_sum)

                    cov = 0.5 * (var_sum - variances[asset_i] - variances[asset_j])
                    cov_matrix.loc[asset_i, asset_j] = cov
                    cov_matrix.loc[asset_j, asset_i] = cov
                
        cov_matrices[date] = cov_matrix
    
    return cov_matrices

def cholesky_decomposition(
    G: np.ndarray,
    tol=1e-12,
    jitter_start=1e-12,
    jitter_max=1e-3
):
    # Symmetrize
    Gs = 0.5 * (G + G.T)

    try:
        upper_triang = np.linalg.cholesky(Gs, upper=True)
        # return upper_triang
    except np.linalg.LinAlgError:
        # Eigenvalue correction
        w, Q = np.linalg.eigh(Gs)
        w_clipped = np.maximum(w, tol)
        G_corr = (Q * w_clipped) @ Q.T

        try:
            upper_triang = np.linalg.cholesky(G_corr, upper=True)
            # return upper_triang
        except np.linalg.LinAlgError:
            # Final fallback: escalating diagonal jitter
            jitter = jitter_start
            I = np.eye(G.shape[0])
            while jitter <= jitter_max:
                try:
                    upper_triang = np.linalg.cholesky(G_corr + jitter * I, upper=True)
                    # return upper_triang
                except np.linalg.LinAlgError:
                    jitter *= 10.0
            raise np.linalg.LinAlgError("Cholesky failed: matrix far from positive definite even after eigenvalue clipping and jitter")
    
    return upper_triang
# Step 3: For each entry of the cholesky factor, construct and train the autoregressive SVR model

def get_cholesky_series(chol_factors):
    dates = list(chol_factors.keys())
    P0 = chol_factors[dates[0]] # The first upper triangular matrix
    assets = P0.columns
    series_dict = {}
    n_assets = len(assets)

    for i in range(n_assets):
        for j in range(i, n_assets):
            series_dict[(i, j)] = pd.Series(
                [chol_factors[d].iloc[i, j] for d in dates],
                index=dates
            )
    
    return series_dict

def fit_SVR(series, lags=15):
    """
    Fit SVR to Cholesky entry
    Returns:
        Fitted series
    """
    y = series.values
    X = np.column_stack([np.roll(y, k) for k in range(1, lags + 1)])
    X, y = X[lags:], y[lags:]
    model = make_pipeline(
        StandardScaler(),
        SVR(kernel='rbf', C=1.0, epsilon=0.01)
    )
    model.fit(X, y)

    return model

def forecast_svr(model, hist, steps=1, lags=15):
    """
    Forecast with SVR
    Args:
        model: Fitted SVR model
        hist: Historical data
        steps: Forecast steps
        lags: Days to input into training
    Returns Cholesky entries
    """
    
    preds = []
    h = hist.copy()
    for _ in range(steps):
        x = h[-lags:].reshape(1, -1)
        pred = model.predict(x)[0]
        preds.append(pred)
        h = np.append(h, pred)
    
    return preds

def forecast_covariance(chol_factors, horizon=20, lags=20):
    series_dict = get_cholesky_series(chol_factors)
    models = {
        k: fit_SVR(v, lags=lags) for k, v in series_dict.items()
    }
    forecasts = {
        k: forecast_svr(models[k], series_dict[k].values
    , steps=horizon, lags=lags) for k in series_dict.keys()
    }
    n_assets = len(chol_factors[next(iter(chol_factors))]) # number of assets
    pred_covs = []
    for step in range(horizon):
        # Build forecasted P_t
        P_fc = np.zeros((n_assets, n_assets)) # Initialize matrix
        for (i, j), vals in forecasts.items():
            P_fc[i, j] = vals[step]
        
        # Covariance forecast
        G_fc = P_fc.T @ P_fc
        pred_covs.append(G_fc)
    
    return pred_covs

def svr_model_forecast(
        train_data, 
        horizon=20, 
        lags=30
    ):

    cov_matrices = range_based_covariance_matrix(train_data)

    # Step 2: The matrices are decomposed using Cholesky decomposition in the form G_t = P_t' P_t
    # This is to ensure the covariance matrix is always positive definite
    chol_factors = {}
    for date, cov in cov_matrices.items():
        upper_triang = cholesky_decomposition(cov.values)
        chol_factors[date] = pd.DataFrame(
            upper_triang,
            index=cov.index,
            columns=cov.columns
        )

    # Step 3: Predict covariances
    pred_covs = forecast_covariance(
        chol_factors=chol_factors,
        horizon=horizon,
        lags=lags # Experiment to find the best lags
    )

    return pred_covs

In [275]:

pivoted_data = clean_df.pivot_table(
    columns="ticker", 
    values=["open", "high", "low", "close", "returns"], 
    index="date"
)
pivoted_data.columns = pivoted_data.columns.swaplevel(0, 1)
pivoted_data = pivoted_data.sort_index(axis=1, level=0)
pivoted_data = pivoted_data.loc[:, pivoted_data.columns.get_level_values(0).isin(ticker_list)]
pivoted_data = pivoted_data.dropna() # Drop NA
train_df, test_df = split_train_test(pivoted_data)

test_df_returns = test_df.loc[:, test_df.columns.get_level_values(1) == "returns"]
test_df_hl = test_df.loc[:, test_df.columns.get_level_values(1).isin(["high", 'low'])]
train_df_hl = train_df.loc[:, train_df.columns.get_level_values(1).isin(["high", 'low'])]
train_df_returns = train_df.loc[:, test_df.columns.get_level_values(1) == "returns"]


In [None]:
# Step 1: Calculate NxN range-based covariance matrices of returns for the whole time series. The range-based variances of the returns are the diagonal entries of these matrices.
# To estimate the range-based covariance matrices, use estimator of the covariance of the returns, and parkinson estimator of the variance.
cov_matrices = range_based_covariance_matrix(train_df)
# Step 2: The matrices are decomposed using Cholesky decomposition in the form G_t = P_t' P_t
# This is to ensure the covariance matrix is always positive definite

chol_factors = {}
for date, cov in cov_matrices.items():
    upper_triang = cholesky_decomposition(cov.values)
    chol_factors[date] = pd.DataFrame(
        upper_triang,
        index=cov.index,
        columns=cov.columns
    )

# PARAMETERS ====================
horizon=20
lags = 30
#================================

act_covs = []
dates = []

# Step 3: Predict covariances
pred_covs = forecast_covariance(
    chol_factors=chol_factors,
    horizon=horizon,
    lags=lags # Experiment to find the best lags
)


In [None]:

class SVRCovarianceForecaster:
    def __init__(self, tol=1e-12, jitter_start=1e-12, jitter_max=1e-3):
        self.tol = tol
        self.jitter_start = jitter_start
        self.jitter_max = jitter_max

    @staticmethod
    def parkinson_variance(high, low):
        """
        Parkinson variance estimator with high and low prices
        var_{Pt} = [ln(H_t/L_t)^2]/(4*ln2)
        """
        return (np.log(high/low)**2) / (4*np.log(2))

    def range_based_covariance_matrix(self, data: pd.DataFrame):
        """
        Compute range-based covariance matrix for multiple assets
        Args:
            data: TxM dataframe in MultiIndex format (asset, price_type).
        Returns:
            Range-based covariance matrix for multiple assets
        """
        tickers = data.columns.get_level_values(0).unique(0)
        n_assets = len(tickers)
        cov_matrices = {}

        for date, row in data.iterrows():
            variances = {}
            for asset in tickers:
                high_price, low_price = row[asset, 'high'], row[asset, 'low']
                variances[asset] = self.parkinson_variance(high_price, low_price)

            cov_matrix = pd.DataFrame(
                np.zeros((n_assets, n_assets)),
                index=tickers,
                columns=tickers
            )

            for asset in tickers:
                cov_matrix.loc[asset, asset] = variances[asset]

            for i, asset_i in enumerate(tickers):
                for j, asset_j in enumerate(tickers):
                    if j > i:
                        high_sum = row[asset_i, "high"] + row[(asset_j, 'high')]
                        low_sum = row[asset_i, "low"] + row[asset_j, 'low']
                        var_sum = self.parkinson_variance(high_sum, low_sum)
                        cov = 0.5 * (var_sum - variances[asset_i] - variances[asset_j])
                        cov_matrix.loc[asset_i, asset_j] = cov
                        cov_matrix.loc[asset_j, asset_i] = cov

            cov_matrices[date] = cov_matrix

        return cov_matrices

    def cholesky_decomposition(self, G: np.ndarray):
        # Symmetrize
        Gs = 0.5 * (G + G.T)
        try:
            upper_triang = np.linalg.cholesky(Gs, upper=True)
        except np.linalg.LinAlgError:
            w, Q = np.linalg.eigh(Gs)
            w_clipped = np.maximum(w, self.tol)
            G_corr = (Q * w_clipped) @ Q.T
            try:
                upper_triang = np.linalg.cholesky(G_corr, upper=True)
            except np.linalg.LinAlgError:
                jitter = self.jitter_start
                I = np.eye(G.shape[0])
                while jitter <= self.jitter_max:
                    try:
                        upper_triang = np.linalg.cholesky(G_corr + jitter * I, upper=True)
                        break
                    except np.linalg.LinAlgError:
                        jitter *= 10.0
                else:
                    raise np.linalg.LinAlgError("Cholesky failed: matrix far from positive definite even after eigenvalue clipping and jitter")
        return upper_triang

    @staticmethod
    def get_cholesky_series(chol_factors):
        dates = list(chol_factors.keys())
        P0 = chol_factors[dates[0]]
        assets = P0.columns
        series_dict = {}
        n_assets = len(assets)
        for i in range(n_assets):
            for j in range(i, n_assets):
                series_dict[(i, j)] = pd.Series(
                    [chol_factors[d].iloc[i, j] for d in dates],
                    index=dates
                )
        return series_dict

    @staticmethod
    def fit_SVR(series, lags=15):
        y = series.values
        X = np.column_stack([np.roll(y, k) for k in range(1, lags + 1)])
        X, y = X[lags:], y[lags:]
        model = make_pipeline(
            StandardScaler(),
            SVR(kernel='rbf', C=1.0, epsilon=0.01)
        )
        model.fit(X, y)
        return model

    @staticmethod
    def forecast_svr(model, hist, steps=1, lags=15):
        preds = []
        h = hist.copy()
        for _ in range(steps):
            x = h[-lags:].reshape(1, -1)
            pred = model.predict(x)[0]
            preds.append(pred)
            h = np.append(h, pred)
        return preds

    def forecast_covariance(self, chol_factors, horizon=20, lags=20):
        series_dict = self.get_cholesky_series(chol_factors)
        models = {
            k: self.fit_SVR(v, lags=lags) for k, v in series_dict.items()
        }
        forecasts = {
            k: self.forecast_svr(models[k], series_dict[k].values, steps=horizon, lags=lags)
            for k in series_dict.keys()
        }
        n_assets = len(chol_factors[next(iter(chol_factors))])
        pred_covs = []
        for step in range(horizon):
            P_fc = np.zeros((n_assets, n_assets))
            for (i, j), vals in forecasts.items():
                P_fc[i, j] = vals[step]
            G_fc = P_fc.T @ P_fc
            pred_covs.append(G_fc)
        return pred_covs


In [None]:

# Get actual covariance
act_covs = []
pred_covs = []
dates = []

for i in tqdm(range(len(test_df_returns) - horizon + 1)):
    window = test_df_returns.iloc[i : (i+horizon)]
    act_cov_matrix = np.cov(test_df_returns.T, bias=True)
    act_covs.append(act_cov_matrix)
    dates.append(window.index[-1])


In [265]:

train_df_returns.columns = train_df_returns.columns.get_level_values(0)
svr_pred_covs, svr_port_returns, svr_port_vars, svr_act_covs = [], [], [], []


for start in tqdm(range(0, len(test_df_hl) - horizon + 1, horizon)):
    train_data = pd.concat([
        train_df_hl, test_df_hl.iloc[:start]
    ])
    cov_list = svr_model_forecast(
        train_data=train_data,
        horizon=horizon,
        lags=lags
    )
    agg_covariance = sum(cov_list)

    # Get MVP weights
    mvp_weights, weights_dict = minimum_variance_portfolio(
        agg_covariance, train_df_returns
    )

    horizon_return = test_df_returns[start:start+horizon]

    port_return = np.array(horizon_return) @ mvp_weights
    svr_port_returns.append(port_return.sum())
    
    # Actual covariance
    act_covariance = np.cov(horizon_return.T)
    act_var = mvp_weights.T @ act_covariance @ mvp_weights
    
    svr_port_vars.append(act_var)
    svr_act_covs.append(act_covariance)
    svr_pred_covs.append(agg_covariance)

100%|██████████| 59/59 [08:39<00:00,  8.81s/it]


In [269]:
svr_act_covs = np.array(svr_act_covs)
svr_pred_covs = np.array(svr_pred_covs)

svr_results = pd.DataFrame({
    "realized_return":svr_port_returns,
    "realized_variance":svr_port_vars
})

svr_sr = svr_results["realized_variance"].mean()/svr_results["realized_return"].std()

svr_frob = np.mean([
    frobenius_loss(H_pred, H_true) for H_pred, H_true in zip(svr_pred_covs, svr_act_covs)
])
svr_stein = np.mean([
    stein_loss(H_pred, H_true) for H_pred, H_true in zip(svr_pred_covs, svr_act_covs)
])
svr_corr_loss = np.mean([
    correlation_loss(H_pred, H_true) for H_pred, H_true in zip(svr_pred_covs, svr_act_covs)
])
svr_port_aligned = np.mean([
    portfolio_aligned_loss(H_pred, H_true, mvp_weights) for H_pred, H_true in zip(svr_pred_covs, svr_act_covs)
])

print(f"""
SVR MODEL

- Sharpe Ratio = {svr_sr}
- Frobenius loss = {svr_frob} 
- Correlation loss = {svr_corr_loss}
- Portfolio aligned loss = {svr_port_aligned}
""")


SVR MODEL

- Sharpe Ratio = 0.0029304699431166635
- Frobenius loss = 0.0006345773102282686 
- Correlation loss = 3.6804051333419423
- Portfolio aligned loss = 1.254350940183958e-07

