In [5]:
import pandas as pd
from datetime import datetime
import os
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from functions import *
from scipy.optimize import minimize
from MODELS import BEKK_GARCH_MODEL

In [6]:
# Tickers to model: the 3 stocks with the most observations.
# Alternative universe (10 stocks with the most observations), kept for reference:
# ticker_list = ['REE', 'SAM', 'HAP', 'GMD', 'GIL', 'TMS', 'SAV', 'DHA', 'MHC', 'HAS']
ticker_list = ["REE", "SAM", "HAP"]

# Threshold on the absolute return, keyed by the `floor` label of each row;
# rows whose absolute return exceeds it are treated as outliers downstream.
limits = dict(hose=0.07, hnx=0.1, upcom=0.15)

In [4]:
# Read and merge into 1 dataset.
# The merged frame is cached at data/stock_data.csv: when that file exists it
# is loaded directly, otherwise it is rebuilt from the raw CafeF exports and
# written back so future runs are faster.

cache_path = os.path.join("data", "stock_data.csv")

if os.path.isfile(cache_path):
    merged_df = pd.read_csv(
        cache_path,
        index_col=None
    ).assign(
        date = lambda df : pd.to_datetime(df["date"])
    )
else:
    # Raw export per trading floor; the key becomes the row's `floor` label.
    raw_files = {
        "hnx": "CafeF.HNX.Upto31.07.2025.csv",
        "hose": "CafeF.HSX.Upto31.07.2025.csv",
        "upcom": "CafeF.UPCOM.Upto31.07.2025.csv",
        "index": "CafeF.INDEX.Upto06.08.2025.csv",
    }

    # Raw column headers -> names used in the rest of the notebook.
    column_names = {
        "<Ticker>":"ticker",
        "<DTYYYYMMDD>":"date",
        "<Open>":"open",
        "<High>":"high",
        "<Low>":"low",
        "<Close>":"close",
        "<Volume>":"volume"
    }

    # Read, label and rename each export in one pass.
    frames = [
        pd.read_csv(os.path.join("data", file_name))
        .assign(floor=floor)
        .rename(columns=column_names)
        for floor, file_name in raw_files.items()
    ]

    # Merge and clean data
    # UPCOM has missing tickers for some reason, so rows without a ticker are dropped.
    merged_df = (
        pd.concat(frames, axis=0)
        .reset_index(drop=True)
        .dropna(subset="ticker")
        .assign(
            # Parse YYYYMMDD into datetime64 so this branch produces the same
            # `date` dtype as the cached branch above (it used to produce
            # Python `date` objects, so the dtype depended on cache state).
            date=lambda df : pd.to_datetime(df["date"].astype(str), format="%Y%m%d")
        )
    )
    merged_df.to_csv(
        cache_path,
        index=False
    ) # Save merged data to save time in future runs


# Data cleaning and merging

# Per-ticker simple returns and log returns (in percent). Rows are sorted by
# ticker and date first, so each change is taken against the previous row of
# the same ticker.
data = merged_df[["date", "ticker", "floor", "close"]].sort_values(["ticker", "date"]).assign(
    returns = lambda df : df.groupby("ticker")["close"].pct_change(),
    log_returns_pct = lambda df : np.log(df["close"] / df.groupby("ticker")["close"].shift(1))*100
)

# Keep 3-character tickers only (eliminates ETFs and indices) and attach the
# floor-specific return threshold. `.assign` builds a new frame, so the column
# is never written onto a filtered slice (avoids SettingWithCopyWarning).
data = data.loc[data["ticker"].str.len()==3].assign(
    limit = lambda df : df["floor"].map(limits)
)

# Rows whose absolute return exceeds the threshold of their floor are outliers.
outliers = data.loc[data["returns"].abs() > data["limit"]]
clean_df = data.drop(outliers.index) # Remove outliers
print(f"% of observations removed: {round((len(outliers)/len(data))*100, 2)}%")

# NOTE: try out different samples of stocks
# Reshape to one row per date and one column per ticker (values are simple
# returns), keep the selected tickers, and drop every date on which any of
# them has no return.
pivoted_df = (
    clean_df
    .pivot_table(index="date", columns="ticker", values="returns")
    .loc[:, ticker_list]
    .dropna()
)

summary_stats = pivoted_df.describe()
display(summary_stats)

train_df, test_df = split_train_test(pivoted_df)

% of observations removed: 1.05%


ticker,REE,SAM,HAP
count,5951.0,5951.0,5951.0
mean,0.001091,0.000699,0.00077
std,0.021411,0.023733,0.024894
min,-0.069971,-0.069999,-0.069963
25%,-0.009689,-0.01162,-0.012434
50%,0.0,0.0,0.0
75%,0.011761,0.012037,0.012855
max,0.069962,0.069919,0.069927


In [8]:
# Demean the training returns: subtract each column's mean from that column.
column_means = train_df.mean()
df = train_df - column_means


In [9]:
# Number of rows in the demeaned training sample.
df.shape[0]

4760

In [None]:
# Fit the BEKK-GARCH model on the demeaned training returns, then forecast
# with horizon=20 from the fitted C, A, B matrices.
# NOTE(review): in the recorded output of the following cell, the matrices
# after the first one are not symmetric (element [0, 1] differs from [1, 0]).
# A covariance matrix should be symmetric -- verify bekk_forecast.
demeaned_returns = df.values
C, A, B, bekk = BEKK_GARCH_MODEL.fit_bekk(demeaned_returns)
cov_matrix_forecast = BEKK_GARCH_MODEL.bekk_forecast(
    C,
    A,
    B,
    demeaned_returns,
    horizon=20,
)

In [None]:
# Inspect the forecast returned by bekk_forecast (rendered as the cell output).
cov_matrix_forecast

[array([[7.99805514e-04, 2.80750292e-05, 5.77779794e-04],
        [2.80750292e-05, 8.74376714e-06, 1.68047352e-05],
        [5.77779794e-04, 1.68047352e-05, 4.57560021e-04]]),
 array([[6.22249332e-04, 1.35037203e-05, 4.65947806e-04],
        [1.34734909e-04, 1.46403123e-05, 9.07234618e-05],
        [1.02095633e-03, 1.88753480e-05, 8.11158577e-04]]),
 array([[5.99968417e-04, 5.45932257e-06, 4.65482525e-04],
        [1.97015419e-04, 1.68363004e-05, 1.36653839e-04],
        [1.65544072e-03, 1.96495701e-05, 1.32216631e-03]]),
 array([[7.02171572e-04, 8.12721818e-07, 5.58251552e-04],
        [2.54506482e-04, 1.73013093e-05, 1.82368824e-04],
        [2.60079889e-03, 1.94974978e-05, 2.08640607e-03]]),
 array([[ 9.35099464e-04, -2.08143391e-06,  7.52851146e-04],
        [ 3.33845066e-04,  1.69944662e-05,  2.47344964e-04],
        [ 4.03141149e-03,  1.84930597e-05,  3.24463612e-03]]),
 array([[ 1.33592402e-03, -4.11913413e-06,  1.08125804e-03],
        [ 4.59125297e-04,  1.63635780e-05,  3.5010

In [None]:
# Minimum-variance weights computed from one entry of the forecast list.
# Index 11 is hard-coded -- TODO confirm which forecast step is intended.
forecast_index = 11
selected_cov_matrix = cov_matrix_forecast[forecast_index]
minimum_variance_portfolio(selected_cov_matrix, train_df)

{'REE': 7.16093850883226e-15, 'SAM': 0.9999999999999969, 'HAP': 0.0}