# Support Vector Regression

In [1]:
import numpy as np
from scipy.optimize import minimize
from tqdm import tqdm
import pandas as pd
import os
from datetime import datetime
import functions
from functions import *
from evaluate import *

In [3]:
# Tickers analysed in this notebook: the 3 stocks with the most observations.
# Wider alternative (10 stocks with the most observations):
# ticker_list = ['REE', 'SAM', 'HAP', 'GMD', 'GIL', 'TMS', 'SAV', 'DHA', 'MHC', 'HAS']
ticker_list = ["REE", "SAM", "HAP"]

# Per-floor threshold on absolute returns, keyed by the `floor` label.
limits = dict(hose=0.07, hnx=0.1, upcom=0.15)
# Read and merge into 1 dataset

# Load the merged price table from the cached CSV when it exists; otherwise
# build it from the raw per-exchange exports and cache it for future runs.
# Both branches produce a datetime64 `date` column so downstream code sees the
# same dtype whether or not the cache was present.
merged_path = os.path.join("data", "stock_data.csv")

if os.path.isfile(merged_path):
    merged_df = pd.read_csv(
        merged_path,
        index_col=None
    ).assign(
        date = lambda df : pd.to_datetime(df["date"])
    )
else:
    # Read the raw exports and tag every row with the floor it came from
    hnx = pd.read_csv(os.path.join("data", "CafeF.HNX.Upto31.07.2025.csv")).assign(
        floor = "hnx"
    )
    hsx = pd.read_csv(os.path.join("data", "CafeF.HSX.Upto31.07.2025.csv")).assign(
        floor = "hose"
    )
    upcom = pd.read_csv(os.path.join("data", "CafeF.UPCOM.Upto31.07.2025.csv")).assign(
        floor = "upcom"
    )
    indexes = pd.read_csv(os.path.join("data", "CafeF.INDEX.Upto06.08.2025.csv")).assign(
        floor = "index"
    )

    # Rename the raw "<...>" headers to short lowercase column names
    column_names = {
        "<Ticker>":"ticker",
        "<DTYYYYMMDD>":"date",
        "<Open>":"open",
        "<High>":"high",
        "<Low>":"low",
        "<Close>":"close",
        "<Volume>":"volume"
    }
    hnx, hsx, upcom, indexes = [
        df.rename(columns=column_names) for df in [hnx, hsx, upcom, indexes]
    ]

    # Merge and clean data
    # UPCOM has missing tickers for some reason, so rows without one are dropped
    merged_df = pd.concat(
        [hnx, hsx, upcom, indexes],
        axis=0
    ).reset_index(drop=True).dropna(subset=["ticker"])\
    .assign(
        # Parse YYYYMMDD in one vectorised call instead of a per-row strptime
        date=lambda df : pd.to_datetime(df["date"].astype(str), format="%Y%m%d")
    )
    merged_df.to_csv(
        merged_path,
        index=False
    ) # Save merged data to save time in future runs


# Simple returns per ticker, computed from consecutive closes after ordering
# each ticker's rows by date.
data = merged_df.sort_values(["ticker", "date"]).assign(
    returns = lambda df : df.groupby("ticker")["close"].pct_change()
)

# Keep 3-character tickers only (intended to eliminate ETFs and indices).
# .copy() makes the filtered frame independent, so the column assignment below
# does not write onto a slice of the previous frame (SettingWithCopyWarning).
data = data.loc[data["ticker"].str.len()==3].copy()

# Floors without an entry in `limits` map to NaN and are never flagged,
# because a comparison against NaN is False.
data["limit"] = data["floor"].map(limits)
outliers = data.loc[data["returns"].abs() > data["limit"]]
clean_df = data.drop(outliers.index) # Remove outliers
print(f"% of observations removed: {round((len(outliers)/len(data))*100, 2)}%")

% of observations removed: 1.05%


In [14]:
# Reshape to one row per date with (ticker, price_type) columns, restricted to
# the tickers in `ticker_list`.
# NOTE(review): this pivots `data`, not the outlier-filtered `clean_df` —
# confirm that is intended.
price_fields = ["open", "high", "low", "close"]
wide_prices = data.pivot_table(index="date", columns="ticker", values=price_fields)

# pivot_table puts the price field on the outer column level; move the ticker
# outward and sort so each ticker's fields sit together.
wide_prices = wide_prices.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)

is_selected = wide_prices.columns.get_level_values(0).isin(ticker_list)
pivoted_data = wide_prices.loc[:, is_selected]

In [None]:
def parkinson_variance(high, low):
    """
    Estimate variance from the high-low price range (Parkinson estimator).

    var_{P,t} = ln(H_t / L_t)^2 / (4 * ln 2)

    Args:
        high: High price(s); any value supporting `/` and `np.log`.
        low: Low price(s), compatible with `high` for elementwise division.
    Returns:
        The squared log range divided by 4 * ln(2).
    """
    log_range = np.log(high / low)
    scale = 4 * np.log(2)
    return log_range ** 2 / scale

def range_based_covariance_matrix(data:pd.DataFrame) -> dict:
    """
    Compute one range-based covariance matrix per row of `data`.

    Diagonal entries are the Parkinson variances of each asset. Off-diagonal
    entries come from var(i + j) = var_i + var_j + 2 * cov_ij, where var(i + j)
    is the Parkinson variance of the summed highs and summed lows:
        cov_ij = 0.5 * (var_P(H_i + H_j, L_i + L_j) - var_i - var_j)

    Args:
        data: Dataframe with one row per date and MultiIndex columns
            (asset, price_type). Example: ('HPG', 'high'), ('HPG', 'low'),...
            Every asset needs a 'high' and a 'low' column.
    Returns:
        Dict mapping each index label of `data` to an MxM DataFrame indexed
        and labelled by asset. If an index label repeats, the last row wins.
    Raises:
        KeyError: if an asset has no 'high' or 'low' column.
    """

    tickers = data.columns.get_level_values(0).unique()

    # Pull highs and lows out once as (rows x assets) arrays, in `tickers` order
    highs = data[[(asset, "high") for asset in tickers]].to_numpy(dtype=float)
    lows = data[[(asset, "low") for asset in tickers]].to_numpy(dtype=float)
    variances = parkinson_variance(highs, lows)

    diag_idx = np.diag_indices(len(tickers))
    cov_matrices = {}

    for date, high, low, var in zip(data.index, highs, lows, variances):
        # Parkinson variance of every pairwise sum of assets, via broadcasting
        pair_var = parkinson_variance(
            high[:, None] + high[None, :],
            low[:, None] + low[None, :]
        )
        cov = 0.5 * (pair_var - var[:, None] - var[None, :])

        # Keep the strict upper triangle and mirror it so the matrix is exactly
        # symmetric, then put the Parkinson variances on the diagonal
        upper = np.triu(cov, k=1)
        cov = upper + upper.T
        cov[diag_idx] = var

        cov_matrices[date] = pd.DataFrame(cov, index=tickers, columns=tickers)

    return cov_matrices


In [16]:
# Build the per-date range-based covariance matrices for the selected tickers.
cov_matrices = range_based_covariance_matrix(pivoted_data)