# Imports

In [25]:
import json
import locale

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from stock_modules.stock_transform import (create_batch_xy,
                                           create_transformer_onehot_xy)

from stock_modules.stock_ml import create_transformer_model

from tensorflow import keras
from keras.utils import plot_model

# Constants

In [26]:
# Text encoding used when opening the ticker JSON file and the price CSV.
# NOTE(review): this is the platform's preferred encoding, so it can differ between machines.
ENCODING = locale.getpreferredencoding()
# CSV that pd.read_csv loads into DATAFRAME; it must contain a "date" column.
DF_PATH = "HEL_12-10-21to08-11-23.csv"
# Not referenced in the cells shown here — presumably where training histories are saved; TODO confirm.
HISTORY_ARRAY_PATH = "./histories_arr.npy"
# Not referenced in the cells shown here — presumably where the trained model is saved; TODO confirm.
MODEL_PATH = "./model.h5"
# JSON file holding the list of tickers to follow.
SELECTED_TICKERS_PATH = "./TICKERS_TO_FOLLOW.json"

# Fraction of the rows (taken from the end of the data) reserved for testing.
TEST_FRAC = 0.2
# True: min-max scale the prices and build batches with create_batch_xy.
# False: difference the prices and build batches with create_transformer_onehot_xy.
PREDICT_PRICES = False

# Window-length parameter passed to the batch builders; the model is built with m = MHOURS + 1.
MHOURS = 12

# Data Import

In [27]:
# Load the list of tickers to follow. The context manager closes the file
# handle as soon as the JSON has been parsed.
with open(SELECTED_TICKERS_PATH, "r", encoding=ENCODING) as tickers_file:
    SELECTED_TICKERS = json.load(tickers_file)

# Read the price table and index it by its "date" column.
DATAFRAME = pd.read_csv(DF_PATH, encoding=ENCODING).set_index("date")
HAS_TIMEDELTA = "Time Delta" in DATAFRAME.columns

# Maps a 0-based position to a ticker label. Positions count only the columns
# that are in SELECTED_TICKERS, in the order they appear in DATAFRAME.
IND_CONVERSION = dict(enumerate(
    ticker for ticker in DATAFRAME.columns if ticker in SELECTED_TICKERS
))

print("Selected tickers: \n", SELECTED_TICKERS)
print("Dataframe columns: \n", DATAFRAME.columns)
print("Dataframe shape: ", DATAFRAME.shape)
print("Dataframe head: \n", DATAFRAME.head(2))
print(f"Index conversion: \n {IND_CONVERSION}")

Selected tickers: 
 ['ALBBV.HE', 'CGCBV.HE', 'EQV1V.HE', 'KNEBV.HE', 'ORNBV.HE', 'OLVAS.HE', 'DETEC.HE', 'PON1V.HE', 'ORNAV.HE', 'VALMT.HE', 'NESTE.HE', 'HUH1V.HE', 'REG1V.HE', 'VAIAS.HE']
Dataframe columns: 
 Index(['REG1V.HE', 'NESTE.HE', 'ORNBV.HE', 'KNEBV.HE', 'OLVAS.HE', 'HUH1V.HE',
       'DETEC.HE', 'ORNAV.HE', 'CGCBV.HE', 'VAIAS.HE', 'ALBBV.HE', 'VALMT.HE',
       'EQV1V.HE', 'PON1V.HE'],
      dtype='object')
Dataframe shape:  (4389, 14)
Dataframe head: 
                       REG1V.HE   NESTE.HE   ORNBV.HE   KNEBV.HE   OLVAS.HE  \
date                                                                         
2021-10-12 07:00:00  55.950001  41.820000  35.689999  60.220001  53.099998   
2021-10-12 08:00:00  55.799999  41.720001  35.630001  60.419998  53.299999   

                      HUH1V.HE  DETEC.HE   ORNAV.HE   CGCBV.HE   VAIAS.HE  \
date                                                                        
2021-10-12 07:00:00  38.529999      23.0  38.049999  43.139999  

# Data Treatment

In [28]:
# Row position where the test set starts; the rows before it are the training rows.
test_begin_idx = int(DATAFRAME.shape[0] * (1 - TEST_FRAC))

if PREDICT_PRICES:
    # Fit the scaler on the training rows only, then apply it to the whole table.
    scaler = MinMaxScaler()
    scaler.fit(DATAFRAME.iloc[:test_begin_idx, :])
    transformed_df = pd.DataFrame(scaler.transform(DATAFRAME), columns=DATAFRAME.columns, index=DATAFRAME.index)
    transformed_np_arr = transformed_df.to_numpy()

    def inverse_transform(df):
        """Undo the min-max scaling applied by ``scaler``.

        Args:
            df: A pandas DataFrame or a NumPy array of scaled values.

        Returns:
            The unscaled values in the same container type as the input; a
            DataFrame keeps its columns and index.

        Raises:
            TypeError: If ``df`` is neither a DataFrame nor an ndarray.
        """
        if isinstance(df, pd.DataFrame):
            return pd.DataFrame(scaler.inverse_transform(df), columns=df.columns, index=df.index)
        if isinstance(df, np.ndarray):
            return scaler.inverse_transform(df)
        # Fail loudly rather than handing None back for an unsupported input.
        raise TypeError(
            f"inverse_transform expects a pandas DataFrame or a numpy ndarray, got {type(df).__name__}"
        )

# If we are predicting the up/down, we create a dataframe where we subtract the previous value from the current value
else:
    # Do not diff the Time Delta column
    df = DATAFRAME.copy()
    if HAS_TIMEDELTA:
        td_col = df["Time Delta"]
        df = df.drop("Time Delta", axis=1)
    transformed_df = df.diff()
    # The first row of a diff is NaN, so copy the second row into it
    transformed_df.iloc[0, :] = transformed_df.iloc[1, :]
    # Add the undiffed Time Delta column back as the first column
    if HAS_TIMEDELTA:
        transformed_df.insert(0, "Time Delta", td_col)
    transformed_np_arr = transformed_df.to_numpy()

    def inverse_transform(df):
        """Return ``df`` unchanged; in this mode no scaling was applied, so there is nothing to undo."""
        return df

print("Transformed df: \n", transformed_df.head(2))
print("Transformed df shape: ", transformed_df.shape)

Transformed df: 
                      REG1V.HE  NESTE.HE  ORNBV.HE  KNEBV.HE  OLVAS.HE  \
date                                                                    
2021-10-12 07:00:00 -0.150002 -0.099998 -0.059998  0.199997  0.200001   
2021-10-12 08:00:00 -0.150002 -0.099998 -0.059998  0.199997  0.200001   

                     HUH1V.HE  DETEC.HE  ORNAV.HE  CGCBV.HE  VAIAS.HE  \
date                                                                    
2021-10-12 07:00:00  0.030003       0.0       0.0  0.360001 -0.200001   
2021-10-12 08:00:00  0.030003       0.0       0.0  0.360001 -0.200001   

                     ALBBV.HE  VALMT.HE  EQV1V.HE  PON1V.HE  
date                                                         
2021-10-12 07:00:00  0.099998  0.139999       0.1  0.049999  
2021-10-12 08:00:00  0.099998  0.139999       0.1  0.049999  
Transformed df shape:  (4389, 14)


# Batch Generation

In [29]:
if PREDICT_PRICES:
    OUTPUT_SCALE = (0,1)
    X, Y = create_batch_xy(MHOURS, transformed_np_arr, overlap=True, y_updown=False, diff_data=True, output_scale=OUTPUT_SCALE)
    # This builder returns no timestamp array. Keep the name defined so the
    # code below runs in both modes instead of raising NameError.
    X_MARK = None
else:
    # NOTE(review): the meaning of the final 0.01 argument is defined inside
    # create_transformer_onehot_xy — confirm there and consider naming it in
    # the constants cell.
    X, X_MARK, Y = create_transformer_onehot_xy(MHOURS,
                        transformed_np_arr,
                        DATAFRAME.to_numpy(),
                        DATAFRAME.index.to_numpy(),
                        0.01)

# The batch arrays hold MHOURS fewer samples than DATAFRAME has rows, so the
# row-based split point is shifted back by MHOURS.
# NOTE(review): presumably this keeps every training window clear of the test
# rows — confirm against the window layout used by the batch builders.
split_idx = test_begin_idx - MHOURS
print("X shape:", X.shape)
if X_MARK is not None:
    print("X timestamps shape: ", X_MARK.shape)
print("Y shape:", Y.shape)

# Slice on the leading (sample) axis only, so the split does not depend on how
# many trailing dimensions each builder produces.
x_train, x_test = X[:split_idx], X[split_idx:]
y_train, y_test = Y[:split_idx], Y[split_idx:]
if X_MARK is None:
    x_mark_train = x_mark_test = None
else:
    x_mark_train, x_mark_test = X_MARK[:split_idx], X_MARK[split_idx:]

<class 'pandas.core.indexes.base.Index'>
X shape: (4377, 13, 14)
X timestamps shape:  (4377, 13, 4)
Y shape: (4377, 14, 3)


# Model Construction

In [30]:
# Build the model. With the data shown in this notebook the arguments line up
# with the batch arrays: m = MHOURS + 1 is the second axis of X, n is its last
# axis, and output_dim = 3 is the last axis of Y.
model = create_transformer_model(m = MHOURS+1,
                                 n = len(SELECTED_TICKERS),
                                 output_dim = 3)
# Save a diagram of the architecture; the ./figures directory must already exist.
plot_model(model, to_file="./figures/model_plot.png",
           show_shapes=True, show_layer_names = True)

model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.CategoricalCrossentropy(),
              metrics=[keras.metrics.CategoricalCrossentropy(),
                       keras.metrics.CategoricalAccuracy()])

# The same (values, timestamps) pair is passed twice.
# NOTE(review): presumably these are the encoder and decoder inputs — confirm
# against create_transformer_model.
# Keras takes the validation_split fraction from the end of the training arrays.
# Keep the returned History so the per-epoch metrics stay available and the
# cell does not end on a bare object repr.
history = model.fit(x = (x_train,x_mark_train,x_train,x_mark_train),
                    y = y_train,
                    batch_size=32,
                    epochs=10,
                    validation_split=0.25)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe094428390>