In [None]:
# Imports
import os, math
# Print the full path of every file found under /kaggle/input so the dataset location can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

import tensorflow as tf 

In [None]:
def format_Dataframes(data_path:str=None, type_file:str="csv") -> pd.DataFrame:

    '''
    Read the raw stock export and return it with fixed column names and numeric price/volume columns.

    The first data row of the file is treated as a secondary header row and is dropped;
    the remaining columns are renamed to the 14 fixed Vietnamese column names below.

    data_path: Dataset path.
    type_file: File type of the dataset; "csv" or "xlsx".

    Returns the formatted DataFrame, or None (after printing a message) when the path
    does not exist or the file type is not supported.
    Raises ValueError (from pandas) when the file does not have exactly 14 columns.
    '''

    if data_path is None or not os.path.exists(data_path):
        print("The path of dataset does not exist. Please check again !!")
        return None

    if type_file == "csv":
        df = pd.read_csv(data_path)
    elif type_file == "xlsx":
        df = pd.read_excel(data_path)
    else:
        # Stop here instead of falling through and failing on a missing frame.
        print("Opening this file type is not supported !!")
        return None

    column_names = ["Tên", "Ngày", 'Đóng cửa', 'Điều chỉnh', "Thay đổi", "Thay đổi 1", "%", 
    'Khối lượng (Khớp lệnh)', 'Giá trị (Khớp lệnh)', 'Khối lượng (Thỏa thuận)', 'Giá trị (Thỏa thuận)', 
    'Mở cửa', 'Cao nhất', 'Thấp nhất']

    # Columns that are left as read; every other column is coerced to a number.
    text_columns = {"Tên", "Ngày", 'Điều chỉnh', "Thay đổi", "Thay đổi 1", "%"}

    # Drop the secondary header row. copy() gives an independent frame so the column
    # assignments below do not write into a slice of the frame that was read.
    df = df.iloc[1:].reset_index(drop=True).copy()
    df.columns = column_names

    for name in df.columns:
        if name not in text_columns:
            # Unparseable cells become NaN rather than raising.
            df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

In [None]:
### Load Dataset
# Read the FPT stock Excel export through format_Dataframes() and preview the first 10 rows.
data_raw = format_Dataframes("/kaggle/input/fpt-stock/FPT_stock.xlsx",'xlsx')
data_raw.head(10)

### Preprocess Data

In [None]:
def preprocessing_dataframe(dataFrame: pd.DataFrame, fillna: str="mean", scale: str="std") -> pd.DataFrame:

    '''
    Fill missing values, scale the numeric columns and return a frame indexed and sorted by "Ngày".

    dataFrame: A data frame produced by format_Dataframes(). It is NOT modified; all work
               happens on a copy, so the calling cell can be re-run safely.
    fillna: How to treat NaN values: "zero" fills with 0, "mean" fills with the column mean,
            any other value drops rows that contain NaN.
    scale: "std" / "standard" uses StandardScaler, "minmax" / "min_max" uses MinMaxScaler.

    Returns a new DataFrame holding the scaled numeric columns, indexed by "Ngày" and sorted by it.
    Raises ValueError when `scale` is not one of the supported names.

    NOTE(review): the scaler is fitted on the whole frame, i.e. before any train/test split
    performed by the caller.
    '''

    # drop() without inplace returns a new frame, leaving the caller's data intact.
    features = dataFrame.drop(columns=['Điều chỉnh', 'Thay đổi', 'Thay đổi 1', '%'])

    numeric_columns = features.select_dtypes(include=['float', 'int']).columns
    if fillna == "zero":
        features[numeric_columns] = features[numeric_columns].fillna(0)
    elif fillna == "mean":
        features[numeric_columns] = features[numeric_columns].fillna(features[numeric_columns].mean())
    else:
        features = features.dropna()

    # Re-number the rows so the date column and the scaled values line up in the concat below.
    features = features.reset_index(drop=True)
    days = features["Ngày"]
    features = features.drop(columns=["Ngày", "Tên"])

    if scale in ("std", "standard"):
        scaler = StandardScaler()
    elif scale in ("minmax", "min_max"):
        scaler = MinMaxScaler()
    else:
        raise ValueError(
            "Unsupported scale %r; expected 'std'/'standard' or 'minmax'/'min_max'." % (scale,)
        )

    scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
    result = pd.concat([days, scaled], axis=1)
    result = result.sort_values(by="Ngày").set_index("Ngày")
    return result

In [None]:
# Run preprocessing with its default arguments (fillna="mean", scale="std") and preview the result.
df_process = preprocessing_dataframe(data_raw)
df_process.head(10)

### Data pipeline for Transformers
Here, I am creating a model for univariate (auto-regressive) multistep forecasting, i.e. using the past n values of a feature 'T' to predict the next k values of that same feature.

In [None]:
# Sliding Windows for Transformers

# Step parameters
n_steps_in = 512   # number of past time lags fed to the encoder
n_steps_out = 24   # number of future time lags predicted by the decoder
n_features = 1     # univariate: a single column is used
batch_size = df_process.shape[0] # Only one batch containing all samples

# Split data into train and test (first 75% of rows for training).
# Only the first column of df_process is used (expected to be 'Đóng cửa' given the column order set in format_Dataframes).
train_size = int(len(df_process) * 0.75)
train_df = df_process.iloc[:train_size, 0]
test_df = df_process.iloc[train_size - n_steps_in:, 0].reset_index(drop = True) # Start n_steps_in rows early so the first test window has a full encoder input.

# I am creating sliding windows where n_steps_in time lags will form one input and predict n_steps_out time lags as output
# In this experiment, I will be using teacher forcing strategy i.e. provide previous ground truth value at current time step of decoder input.

# It requires two things:
# 1) Inputs - Encoder Input (512 time lags) - in my case this is train_X and test_X,
#           - Decoder Input (24 time lags - where first lag is last input lag (acting as a start token) followed by next 23 time lags)
# 2) Labels - Decoder Output (24 time lags) - in my case this is train_Y and test_Y
# I need to create decoder inputs - which are train_Y_p and test_Y_p
# The three slices of each series are offset so that window i of X, Y and Y_p refer to the same forecast origin.
# Training windows use the default stride of 1; test windows use stride n_steps_out so their targets do not overlap.
train_X = tf.keras.utils.timeseries_dataset_from_array(train_df[:-n_steps_out], None, sequence_length = n_steps_in, batch_size = batch_size)
train_Y = tf.keras.utils.timeseries_dataset_from_array(train_df[n_steps_in:], None, sequence_length = n_steps_out, batch_size = batch_size)
train_Y_p = tf.keras.utils.timeseries_dataset_from_array(train_df[n_steps_in-1:-1], None, sequence_length = n_steps_out, batch_size = batch_size)
test_X = tf.keras.utils.timeseries_dataset_from_array(test_df[:-n_steps_out], None, sequence_length = n_steps_in, batch_size = batch_size, sequence_stride = n_steps_out)
test_Y = tf.keras.utils.timeseries_dataset_from_array(test_df[n_steps_in:], None, sequence_length = n_steps_out, batch_size = batch_size, sequence_stride = n_steps_out)
test_Y_p = tf.keras.utils.timeseries_dataset_from_array(test_df[n_steps_in-1:-1], None, sequence_length = n_steps_out, batch_size = batch_size, sequence_stride = n_steps_out)

# timeseries_dataset_from_array will return batch generator, I need some post processing so I will extract tensor from batches
# Each loop keeps the last batch it sees; batch_size is the full row count, so a single batch is expected.
for b in train_X: train_X = b
for b in train_Y: train_Y = b
for b in train_Y_p: train_Y_p = b
for b in test_X: test_X = b
for b in test_Y: test_Y = b
for b in test_Y_p: test_Y_p = b

# Renaming and typecasting variables; expand_dims appends a trailing feature axis.
train_encoder_inputs = tf.expand_dims(tf.cast(train_X, dtype = tf.float32), axis = -1)
train_decoder_inputs = tf.expand_dims(tf.cast(train_Y_p, dtype = tf.float32), axis = -1) 
train_decoder_outputs = tf.expand_dims(tf.cast(train_Y, dtype = tf.float32), axis = -1)
test_encoder_inputs = tf.expand_dims(tf.cast(test_X, dtype = tf.float32), axis = -1)
test_decoder_inputs = tf.expand_dims(tf.cast(test_Y_p, dtype = tf.float32), axis = -1)
test_decoder_outputs = tf.expand_dims(tf.cast(test_Y, dtype = tf.float32), axis = -1)

# Verify shapes
print("Train Encoder Input: ",train_encoder_inputs.shape)
print("Train Decoder Input: ",train_decoder_inputs.shape)
print("Train Decoder Output: ",train_decoder_outputs.shape)
print("Test Encoder Input: ",test_encoder_inputs.shape)
print("Test Decoder Input: ",test_decoder_inputs.shape)
print("Test Decoder Output: ",test_decoder_outputs.shape)

In [None]:
# Verify Decoder Inputs & Outputs - the decoder input is the decoder output shifted by one time lag
# (its first value is the last encoder lag, and it stops one lag before the final target).
print(train_decoder_inputs[0])
print(train_decoder_outputs[0])
print(test_decoder_inputs[0])
print(test_decoder_outputs[0])

## Transformer Model

#### Defining Components

#### Positional Encoding
All input embeddings are added to positional encodings before being passed into the first encoder layer. In the next experiment, I will try Time2Vec instead of positional encoding to encode the inputs.

In [None]:
# Helper function to calculate positional encoding
def positional_encoding(length, depth):
    """
    Build a sinusoidal positional-encoding table.

    length: number of positions to encode, i.e. the maximum number of time lags in one sequence.
    depth: dimension of the embedding (d_model), i.e. the number of features used to represent
           a single time lag. Even and odd values are both supported.

    Returns a float32 tensor of shape (length, depth): the sine components come first,
    followed by the cosine components.

    This code is adapted from tensorflow's official documentation.
    """
    # Round up so an odd depth still gets enough frequencies; the surplus cosine
    # column is trimmed at the end. For an even depth this equals depth / 2.
    half_depth = (depth + 1) // 2

    positions = np.arange(length)[:, np.newaxis]                  # (seq, 1)
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth    # (1, half_depth)

    angle_rates = 1 / (10000 ** depths)         # (1, half_depth)
    angle_rads = positions * angle_rates        # (seq, half_depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis = -1)

    # Keep exactly `depth` columns so the table can be added to a d_model-wide embedding.
    pos_encoding = pos_encoding[:, :depth]

    return tf.cast(pos_encoding, dtype = tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """
    Project each time lag to d_model features with a Dense layer and add a fixed sinusoidal
    positional encoding.

    This code is taken from tensorflow's official documentation and modified according to requirements.
    Instead of Embedding layer, I am using Dense layer.
    """
    def __init__(self, d_model, max_length = 512):
        """
        d_model: width of the embedding produced for every time lag.
        max_length: number of positions in the precomputed encoding table; inputs passed to
                    call() must not be longer than this. Defaults to 512.

        Removed vocab_size parameter as it was used in Embedding layer and I have replaced that with Dense.
        """
        super().__init__()
        self.d_model = d_model
        self.max_length = max_length
        self.embedding = tf.keras.layers.Dense(units = d_model)
        self.pos_encoding = positional_encoding(length = max_length, depth = d_model)

    def compute_mask(self, *args, **kwargs):
        # Delegate mask computation to the wrapped Dense layer.
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """
        x: tensor whose axis 1 is the sequence axis, e.g. (batch, seq_len, n_features).
        Returns the embedded input with the first seq_len rows of the encoding table added.
        """
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [None]:
# Test PositionalEmbedding layer
embed_enc = PositionalEmbedding(d_model = 64)
embed_dec = PositionalEmbedding(d_model = 64)

# Slice with [0:1] (not [0]) to keep the batch axis: the layer reads the sequence length
# from axis 1, so a sample without a batch axis would only receive the encoding of position 0.
ec = embed_enc(train_encoder_inputs[0:1])
dc = embed_dec(train_decoder_inputs[0:1])

print(ec.shape)
print(dc.shape)

#### Time2Vec Layer
Replacement for positional embedding. The aim of Time2Vec is to learn a feature representation (embedding) for time series data.

In [None]:
class Time2Vec(tf.keras.layers.Layer):
    """
    Learnable time embedding: one linear component plus (d_model - 1) sine components
    per time lag, concatenated on the last axis.

    The weights P, w and p are sized by the sequence length seen in build(), so an instance
    is tied to one sequence length.
    NOTE(review): the output width equals d_model only when the input has a single feature
    on its last axis — confirm before using multivariate inputs.
    """
    def __init__(self, d_model, **kwargs):
        """
        d_model: total width of the produced embedding.
        kwargs: forwarded to tf.keras.layers.Layer.
        """
        # Initialise the Keras base class before attaching attributes to the layer.
        super(Time2Vec, self).__init__(**kwargs)
        self.d_model = d_model
        self.output_dim = d_model - 1
    
    def build(self, input_shape):
        # Frequencies of the periodic part: (n_features, d_model - 1).
        self.W = self.add_weight(name = 'W',
                      shape = (input_shape[-1], self.output_dim),
                      initializer = 'uniform',
                      trainable = True)
        # Phase shifts of the periodic part, one row per time lag: (seq_len, d_model - 1).
        self.P = self.add_weight(name = 'P',
                      shape = (input_shape[1], self.output_dim),
                      initializer = 'uniform',
                      trainable = True)
        # Slope of the linear part, one per time lag: (seq_len, 1).
        self.w = self.add_weight(name = 'w',
                      shape = (input_shape[1], 1),
                      initializer = 'uniform',
                      trainable = True)
        # Offset of the linear part, one per time lag: (seq_len, 1).
        self.p = self.add_weight(name = 'p',
                      shape = (input_shape[1], 1),
                      initializer = 'uniform',
                      trainable = True)
        super(Time2Vec, self).build(input_shape)
    
    def call(self, x):
        # Linear (non-periodic) component.
        original = self.w * x + self.p
        # Periodic components.
        sin_trans = tf.math.sin(tf.tensordot(x, self.W, axes = 1) + self.P)
        
        return tf.concat([sin_trans, original], -1)

    def get_config(self):
        # Include the constructor argument so the layer can be re-created from its config.
        config = super(Time2Vec, self).get_config()
        config.update({"d_model": self.d_model})
        return config

In [None]:
# Testing Time2Vec Layer
# Separate instances are needed because each one builds weights sized by its input's sequence length.
embed_enc = Time2Vec(d_model = 4)
embed_dec = Time2Vec(d_model = 4)

# Embed the first 10 encoder and decoder samples; `ec` and `dc` are reused by the test cells below.
ec = embed_enc(train_encoder_inputs[0:10])
dc = embed_dec(train_decoder_inputs[0:10])

print(ec.shape)
print(dc.shape)

#### Base Attention Layer
Acts as super class for all attention layers in encoder and decoder.

In [None]:
class BaseAttention(tf.keras.layers.Layer):
    """
    Common building blocks for the attention layers: a MultiHeadAttention layer,
    a LayerNormalization layer and an Add layer. Subclasses supply call().

    This code is taken from tensorflow's official documentation.
    """
    def __init__(self, **kwargs):
        # All keyword arguments (e.g. num_heads, key_dim, dropout) go to MultiHeadAttention;
        # none are passed to the Keras base Layer.
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

#### Cross Attention
Encoder-Decoder Attention in the Decoder block.

In [None]:
class CrossAttention(BaseAttention):
    """
    Attention in which `x` provides the queries and `context` provides keys and values,
    followed by a residual connection and layer normalization.

    This code is taken from tensorflow's official documentation.
    """
    def call(self, x, context):
        # x: query sequence; context: sequence attended over (used as both key and value).
        attn_output, attn_scores = self.mha(
            query = x,
            key = context,
            value = context,
            return_attention_scores = True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        # Residual connection, then normalization.
        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

In [None]:
# Testing CrossAttention layer
# `ec` and `dc` are the embedded encoder/decoder samples produced by an earlier test cell.
sample_ca = CrossAttention(num_heads = 2, key_dim = 8)

print(ec.shape)
print(dc.shape)
print(sample_ca(dc, ec).shape) # Pass x first (decoder input), then context (output received from  last encoder layer)

#### Global Self Attention Layer
Attention layer used in encoder layers.

In [None]:
class GlobalSelfAttention(BaseAttention):
    """
    Unmasked self-attention: `x` is used as query, key and value, followed by a
    residual connection and layer normalization.

    This code is taken from tensorflow's official documentation.
    """
    def call(self, x):
        attn_output = self.mha(
            query = x,
            value = x,
            key = x)
        # Residual connection, then normalization.
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [None]:
# Testing GlobalSelfAttention layer
# The output shape is expected to match the shape of the input `ec`.
sample_gsa = GlobalSelfAttention(num_heads = 3, key_dim = 8)

print(ec.shape)
print(sample_gsa(ec).shape)

#### Causal Self Attention Layer
Self-attention layer used in the decoder. It is applied to the initial decoder inputs or to the outputs of the previous decoder layer, and it masks future time lags so that each position can only attend to earlier ones.

In [None]:
class CausalSelfAttention(BaseAttention):
    """
    Self-attention with a causal mask (use_causal_mask = True), followed by a
    residual connection and layer normalization.

    This code is taken from tensorflow's official documentation.
    """
    def call(self, x):
        attn_output = self.mha(
            query = x,
            value = x,
            key = x,
            use_causal_mask = True)
        # Residual connection, then normalization.
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [None]:
# Testing CausalSelfAttention layer
# The output shape is expected to match the shape of the input `dc`.
sample_csa = CausalSelfAttention(num_heads = 2, key_dim = 8)

print(dc.shape)
print(sample_csa(dc).shape)

#### Feed Forward Layer
Used in both encoder and decoder layers.

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """
    Position-wise feed-forward block: Dense(dff, relu) -> Dense(d_model) -> Dropout,
    wrapped in a residual connection and layer normalization.

    This code is taken from tensorflow's official documentation.
    """
    def __init__(self, d_model, dff, dropout_rate = 0.1):
        # d_model: output width, which must match the input's last axis for the residual add.
        # dff: width of the hidden Dense layer.
        # dropout_rate: rate of the Dropout applied after the second Dense layer.
        super().__init__()
        self.seq = tf.keras.Sequential([
              tf.keras.layers.Dense(dff, activation = 'relu'),
              tf.keras.layers.Dense(d_model),
              tf.keras.layers.Dropout(dropout_rate)
            ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        # Residual connection around the feed-forward stack, then normalization.
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x) 
        return x

In [None]:
# Testing FeedForward layer
# d_model must equal the last dimension of `ec` so the residual addition inside the layer works.
sample_ff = FeedForward(d_model = 4, dff = 2048)

print(ec.shape)
print(sample_ff(ec).shape)

#### Encoder Layer
A single encoder layer.

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    One encoder block: global self-attention followed by a feed-forward block.

    This code is taken from tensorflow's official documentation.
    """
    def __init__(self,*, d_model, num_heads, dff, dropout_rate = 0.1):
        """
        d_model: model width; also used as key_dim of the attention heads.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward block.
        dropout_rate: dropout rate used by both the attention and the feed-forward block.
        """
        super().__init__()
        self.self_attention = GlobalSelfAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
        # Forward dropout_rate so the feed-forward block does not silently fall back to its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [None]:
# Testing EncoderLayer layer
# d_model must equal the last dimension of `ec`; the output shape is expected to match the input shape.
sample_el = EncoderLayer(d_model = 4, num_heads = 2, dff = 2048)

print(ec.shape)
print(sample_el(ec).shape)

#### The Encoder
Encoder consisting of multiple encoder layers and the positional embedding layer

In [None]:
class Encoder(tf.keras.layers.Layer):
    """
    Full encoder: Time2Vec embedding, dropout, then `num_layers` EncoderLayer blocks.

    This code is taken from tensorflow's official documentation and modified according to requirements.
    """
    def __init__(self, *, num_layers, d_model, num_heads, dff, dropout_rate = 0.1):
        """
        num_layers: number of stacked EncoderLayer blocks.
        d_model: model width, passed to Time2Vec and to every EncoderLayer.
        num_heads: attention heads per EncoderLayer.
        dff: hidden width of each feed-forward block.
        dropout_rate: rate of the dropout after the embedding; also passed to every EncoderLayer.

        Removed vocab_size parameter.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.pos_embedding = Time2Vec(d_model = d_model)
        self.enc_layers = [EncoderLayer(d_model = d_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        # NOTE(review): `x` appears to be the raw series, shape (batch, seq_len, n_features), not token IDs — confirm.
        x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

        # Add dropout.
        x = self.dropout(x)
        
        # Pass the embedded sequence through each encoder block in order.
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x  # Shape `(batch_size, seq_len, d_model)`.

In [None]:
# Testing Encoder 
# Runs the first 10 training samples through a small 2-layer encoder.
sample_ec = Encoder(num_layers = 2, d_model = 4, num_heads = 4, dff = 2048)

print(train_encoder_inputs[0:10].shape)
print(sample_ec(train_encoder_inputs[0:10], training = False).shape) # Expected output shape: (batch_size, n_steps_in, d_model).

#### Decoder Layer
A single decoder layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    One decoder block: causal self-attention, cross-attention over the encoder output,
    then a feed-forward block.

    This code is taken from tensorflow's official documentation.
    """
    def __init__(self, *, d_model, num_heads, dff, dropout_rate = 0.1):
        """
        d_model: model width; also used as key_dim of the attention heads.
        num_heads: number of attention heads in each attention sub-layer.
        dff: hidden width of the feed-forward block.
        dropout_rate: dropout rate used by both attention sub-layers and the feed-forward block.
        """
        super(DecoderLayer, self).__init__()
        self.causal_self_attention = CausalSelfAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
        self.cross_attention = CrossAttention(num_heads = num_heads, key_dim = d_model, dropout = dropout_rate)
        # Forward dropout_rate so the feed-forward block does not silently fall back to its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, context):
        # x: decoder sequence; context: encoder output attended over by the cross-attention.
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

In [None]:
# Testing DecoderLayer layer
# `dc` is the embedded decoder sample produced by an earlier test cell; `ec_o` is the encoder output used as context.
sample_dl = DecoderLayer(d_model = 4, num_heads = 3, dff = 2048)

ec_o = sample_ec(train_encoder_inputs[0:10], training = False)
print(ec_o.shape)
print(dc.shape)

print(sample_dl(dc, ec_o).shape) # Shape (batch_size, n_steps_out, d_model) or same as decoder inputs.

#### The Decoder
Decoder consisting of multiple decoder layers and positional embedding layer.

In [None]:
class Decoder(tf.keras.layers.Layer):
    """
    Full decoder: PositionalEmbedding, dropout, then `num_layers` DecoderLayer blocks.
    Unlike the Encoder, which embeds with Time2Vec, this uses PositionalEmbedding.

    This code is taken from tensorflow's official documentation and modified according to requirements.
    """
    def __init__(self, *, num_layers, d_model, num_heads, dff, dropout_rate = 0.1):
        """
        num_layers: number of stacked DecoderLayer blocks.
        d_model: model width, passed to PositionalEmbedding and to every DecoderLayer.
        num_heads: attention heads per attention sub-layer.
        dff: hidden width of each feed-forward block.
        dropout_rate: rate of the dropout after the embedding; also passed to every DecoderLayer.

        Removed vocab_size parameter.
        """
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.pos_embedding = PositionalEmbedding(d_model = d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dec_layers = [DecoderLayer(d_model = d_model, num_heads = num_heads,dff = dff, dropout_rate = dropout_rate) for _ in range(num_layers)]
        # Filled in by call() with the cross-attention scores of the last decoder block.
        self.last_attn_scores = None

    def call(self, x, context):
        # NOTE(review): `x` appears to be the raw decoder series, shape (batch, target_seq_len, n_features), not token IDs — confirm.
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        x = self.dropout(x)

        # Every decoder block attends over the same encoder output `context`.
        for i in range(self.num_layers):
            x  = self.dec_layers[i](x, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of x is (batch_size, target_seq_len, d_model).
        return x

In [None]:
# Testing Decoder
# Runs the first 10 decoder inputs through a small 2-layer decoder, using the sample encoder's output as context.
sample_dc = Decoder(num_layers = 2, d_model = 4, num_heads = 3, dff = 2048)

ec_o = sample_ec(train_encoder_inputs[0:10], training = False)
print(ec_o.shape)

print(train_decoder_inputs[0:10].shape)
print(sample_dc(train_decoder_inputs[0:10], ec_o).shape) # Expected shape: (batch_size, n_steps_out, d_model)

#### The Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Encoder-decoder model for forecasting: Encoder -> Decoder -> Dense(n_features).

    This code is taken from tensorflow's official documentation and modified according to requirements.
    """
    def __init__(self, *, num_layers, d_model, num_heads, dff, n_features, n_steps_out, dropout_rate = 0.1):
        """
        num_layers, d_model, num_heads, dff, dropout_rate: passed unchanged to both Encoder and Decoder.
        n_features: number of units of the final Dense layer, i.e. values predicted per time lag.
        n_steps_out: accepted but not used anywhere in this class as written.

        Removed input_vocab_size and output_vocab_size parameters, added n_features parameter for final dense layer and n_steps_out for inference loop.
        """
        super().__init__()
        self.encoder = Encoder(num_layers = num_layers, d_model = d_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
        self.decoder = Decoder(num_layers = num_layers, d_model = d_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
        self.final_layer = tf.keras.layers.Dense(n_features)

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        # `inputs` is a pair: (encoder input, decoder input).
        context, x  = inputs

        context = self.encoder(context)  # (batch_size, context_len, d_model)

        x = self.decoder(x, context)  # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(x)  # (batch_size, target_len, n_features)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            del logits._keras_mask
        except AttributeError:
            # No mask was attached to the output; nothing to remove.
            pass

        # Return the final output (the attention weights are not returned).
        return logits

In [None]:
# Testing the Transformer
# Small model run on the first 10 samples to check the output shape.
trf = Transformer(num_layers = 2, d_model = 4, num_heads = 3, dff = 1024, n_features = n_features, n_steps_out = n_steps_out)

# Inputs are passed as one tuple: (encoder inputs, decoder inputs).
inp = (train_encoder_inputs[0:10], train_decoder_inputs[0:10])
trf_op = trf(inp)
trf_op.shape

### Train Transforecaster

In [None]:
# Custom LR Scheduler as per Attention is all you need paper
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Warm-up learning-rate schedule from "Attention Is All You Need":

        lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    The rate grows linearly for `warmup_steps` steps and then decays with the inverse
    square root of the step.

    This code is taken from tensorflow's official documentation.
    """
    def __init__(self, d_model, warmup_steps = 760):
        """
        d_model: model width used to scale the learning rate.
        warmup_steps: number of steps over which the learning rate increases.
        """
        super().__init__()
        # Keep the plain Python value so get_config() can serialise it.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)                       # decay branch
        arg2 = step * (self.warmup_steps ** -1.5)        # warm-up branch
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        # Required by Keras to serialise an optimizer (or model) that uses this schedule.
        return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}

In [None]:
# Define Hyperparameters
num_layers = 4       # encoder blocks and decoder blocks each
d_model = 128        # model width
num_heads = 7        # attention heads per attention layer
dff = 1024           # hidden width of the feed-forward blocks
dropout_rate = 0.4

# Define Model
transforecaster = Transformer(
    num_layers = num_layers, 
    d_model = d_model, 
    num_heads = num_heads, 
    dff = dff, 
    n_features = n_features, 
    n_steps_out = n_steps_out,
    dropout_rate = dropout_rate
)

# Test and check no. of parameters
# Calling the model once on a sample batch builds its weights so summary() can report them.
inp = (train_encoder_inputs[0:128], train_decoder_inputs[0:128])
print("Decoder O/P Shape: ",transforecaster(inp).shape, "\n")
transforecaster.summary()

In [None]:
# Adam optimizer driven by the warm-up schedule defined above.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9)

# Plot the schedule over the first 8000 training steps.
plt.plot(learning_rate(tf.range(8000, dtype = tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')

In [None]:
# Compile and Train
# Loss is MSE; MAE and Huber are tracked as additional metrics.
transforecaster.compile(
    loss='mean_squared_error',
    optimizer=optimizer,
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(name='mae'),
        tf.keras.losses.Huber(name='huber_loss')
    ]
)

# Teacher forcing: the decoder input is the ground-truth target shifted by one time lag.
transforecaster.fit(
    x=(train_encoder_inputs, train_decoder_inputs),
    y=train_decoder_outputs,
    batch_size=64,
    epochs=100
)

In [None]:
# Evaluate model
# NOTE(review): test_decoder_inputs are built from ground-truth test values shifted by one lag,
# so these scores are teacher-forced rather than autoregressive multi-step forecasts — confirm this is intended.
results = transforecaster.evaluate(
    x=(test_encoder_inputs, test_decoder_inputs),
    y=test_decoder_outputs,
    batch_size=64
)

# results holds the loss first, then the metrics in the order given to compile().
print("Test Loss (MSE):", results[0])
print("Test MAE:", results[1])
print("Test Huber Loss:", results[2])