In [385]:
# Run as is
import numpy as np
import pandas as pd
import sqlite3
import talipp as tp

# pd.set_option('display.max_rows', 20)  # Set to None to display all rows
# pd.set_option('display.max_columns', None)  # Set to None to display all columns
# pd.set_option('display.max_columns', 15)

In [386]:
def db_preprocessing():

    # Connect to the SQLite database
    connection = sqlite3.connect('etf_data.db')

    # List of table names
    etf_list = ['spy', 'tlt', 'hyg', 'lqd', 'vnq']

    df = pd.read_sql_query(f"SELECT * FROM {etf_list[0]}", connection)

    # # Loop through the list and display data for each table
    # for table_name in etf_list:
    #     df = pd.read_sql_query(f"SELECT * FROM {table_name}", connection)
    #     print(f"Data for table: {table_name}")
    #     # print(df)

    # Close the connection
    connection.close()

    # Rename last column to close
    df.rename(columns={df.columns[4]: 'close'}, inplace=True)

    # Assuming 'date' is the name of the column containing the dates in your DataFrame 'df'
    df = df.sort_values(by='date', ascending=True)

    # Reindex the sorted DataFrame
    df = df.reset_index(drop=True)

    return df

df = db_preprocessing()
# df

In [387]:
from talipp.ohlcv import OHLCV
from talipp.indicators import AccuDist, ADX, ALMA, AO, Aroon, ATR, BB, BOP, CCI, ChaikinOsc, ChandeKrollStop, CHOP, \
    CoppockCurve, DEMA, DonchianChannels, DPO, EMA, EMV, ForceIndex, HMA, Ichimoku, KAMA, KeltnerChannels, KST, KVO, \
    MACD, MassIndex, MeanDev, OBV, PivotsHL, ROC, RSI, ParabolicSAR, SFX, SMA, SMMA, SOBV, StdDev, Stoch, StochRSI, \
    SuperTrend, TEMA, TRIX, TSI, TTM, UO, VTX, VWAP, VWMA, WMA

def indicators(df):

    close = [float(row['close']) for index,row in df.iterrows()]

    # Assuming you have a DataFrame named 'df' with columns 'open', 'high', 'low', 'close', and 'volume'
    ohlcv = [OHLCV(float(row['open']), float(row['high']), float(row['low']), float(row['close']), float(row['volume']))
             for index, row in df.iterrows()]
    # close
    # ohlcv

    # Define the indicators
    bop = BOP(ohlcv)
    sobv = SOBV(7, ohlcv)
    stoch = Stoch(14, 3, ohlcv)
    force_index = ForceIndex(13, ohlcv)
    macd = MACD(12, 26, 9, close)

    # Create new columns in the DataFrame
    for i in range(5685):  # Assuming the length of the indicators matches the DataFrame length (5865 rows)
        df.loc[i, 'BOP'] = bop[i]
        df.loc[i, 'SOBV'] = sobv[i]
        df.loc[i, 'Stoch'] = stoch[i].k
        df.loc[i, 'ForceIndex'] = force_index[i]
        df.loc[i, 'MACD'] = macd[i].histogram

    return df

df = indicators(df)
# df

In [388]:
    # print(f'AccuDist: {AccuDist(ohlcv)[-1]}')
    # print(f'ADX: {ADX(14, 14, ohlcv)[-1]}')
    # print(f'ALMA: {ALMA(9, 0.85, 6, close)[-1]}')
    # print(f'AO: {AO(5, 34, ohlcv)[-1]}')
    # print(f'Aroon: {Aroon(14, ohlcv)[-1]}')
    # print(f'ATR: {ATR(14, ohlcv)[-1]}')
    # print(f'BB: {BB(20, 2, close)[-1]}')
    # print(f'BOP: {BOP(ohlcv)[-1]}')
    # print(f'CCI: {CCI(20, ohlcv)[-1]}')
    # print(f'ChaikinOsc: {ChaikinOsc(3, 10, ohlcv)[-1]}')
    # print(f'ChandeKrollStop: {ChandeKrollStop(10, 2, 9, ohlcv)[-1]}')
    # print(f'CHOP: {CHOP(14, ohlcv)[-1]}')
    # print(f'CoppockCurve: {CoppockCurve(11, 14, 10, close)[-1]}')
    # print(f'DEMA: {DEMA(20, close)[-1]}')
    # print(f'DonchianChannels: {DonchianChannels(20, ohlcv)[-1]}')
    # print(f'DPO: {DPO(20, close)[-1]}')
    # print(f'EMA: {EMA(20, close)[-1]}')
    # print(f'EMV: {EMV(14, 10000, ohlcv)[-1]}')
    # print(f'ForceIndex: {ForceIndex(13, ohlcv)[-1]}')
    # print(f'HMA: {HMA(9, close)[-1]}')
    # print(f'Ichimoku: {Ichimoku(26, 9, 52, 52, 26, ohlcv)[-1]}')
    # print(f'KAMA: {KAMA(14, 2, 30, close)[-1]}')
    # print(f'KeltnerChannels: {KeltnerChannels(20, 26, 1, 1, ohlcv)[-1]}')
    # print(f'KST: {KST(10, 10, 15, 10, 20, 10, 30, 15, 9, close)[-1]}')
    # print(f'KVO: {KVO(34, 55, ohlcv)[-1]}')
    # print(f'MACD: {MACD(12, 26, 9, close)[-1]}')
    # print(f'MassIndex: {MassIndex(9, 9, 10, ohlcv)[-1]}')
    # print(f'MeanDev: {MeanDev(10, close)[-1]}')
    # print(f'OBV: {OBV(ohlcv)[-1]}')
    # print(f'Pivots: {PivotsHL(15, 15, ohlcv)[-4:]}')
    # print(f'ROC: {ROC(9, close)[-1]}')
    # print(f'RSI: {RSI(14, close)[-1]}')
    # print(f"SAR: {ParabolicSAR(0.02, 0.02, 0.2, ohlcv)[-20:]}")
    # print(f'SFX: {SFX(12, 12, 3, ohlcv)[-1]}')
    # print(f'SMA: {SMA(20, close)[-1]}')
    # print(f'SMMA: {SMMA(7, close)[-1]}')
    # print(f'SOBV: {SOBV(7, ohlcv)[-1]}')
    # print(f'StdDev: {StdDev(7, close)[-1]}')
    # print(f'Stoch: {Stoch(14, 3, ohlcv)[-1]}')
    # print(f'StochRSI: {StochRSI(14, 14, 3, 3, close)[-1]}')
    # print(f'SuperTrend: {SuperTrend(10, 3, ohlcv)[-20:]}')
    # print(f'TEMA: {TEMA(20, close)[-1]}')
    # print(f'TRIX: {TRIX(18, close)[-1]}')
    # print(f'TSI: {TSI(13, 25, close)[-1]}')
    # print(f'TTM: {TTM(20, input_values = ohlcv)[-20:]}')
    # print(f'UO: {UO(7, 14, 28, ohlcv)[-1]}')
    # print(f'VTX: {VTX(14, ohlcv)[-1]}')
    # print(f'VWAP: {VWAP(ohlcv)[-1]}')
    # print(f'VWMA: {VWMA(20, ohlcv)[-1]}')
    # print(f'WMA: {WMA(9, close)[-1]}')

In [389]:
def features(df):

    # Assuming 'close' is the name of the column containing the close prices in your DataFrame 'df'
    # Calculate the percentage change 5 rows below the current row
    df['percentage_change'] = df['close'].shift(-5) / df['close'] - 1
    df['percentage_change'] = df['percentage_change'].shift(5)

    # Create the new column based on the conditions
    conditions = [
        (df['percentage_change'] >= 0.01),
        (df['percentage_change'] <= -0.01)
    ]

    values = [2, 0]

    # Default value (in between)
    df['label'] = np.select(conditions, values, default=1)

    # Drop the 'percentage_change' column if you don't need it anymore
    # df = df.drop(columns=['percentage_change'])

    # Print the DataFrame with the new column
    # df

    def create_lagged_columns(df, column_name):
        for i in range(1, 11):
            new_column_name = f'{column_name}_lag_{i}'
            df[new_column_name] = df[column_name].shift(i)

    list_of_indicators = ['BOP', 'SOBV', 'Stoch', 'ForceIndex', 'MACD']
    for indicator in list_of_indicators:
        create_lagged_columns(df, indicator)


    list_of_indicators = ['BOP', 'SOBV', 'Stoch', 'ForceIndex', 'MACD']
    for indicator in list_of_indicators:
        create_lagged_columns(df, indicator)

    nan_counts = df.isna().sum()
    # nan_counts

    # Drop na
    df = df.dropna()
    # df

    return df

df = features(df)
# df

In [516]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def train_test(df, stack_size=8, step_size=1):

    df3 = df[['SOBV', 'Stoch', 'ForceIndex', 'MACD']]
    df4 = df['label']

    df3 = df3[:1440]
    df3 = df3.reset_index(drop=True)

    # Apply MinMaxScaler to the DataFrame
    scaler = MinMaxScaler()
    df3_scaled = scaler.fit_transform(df3)

    # Initialize an empty list to store the new arrays
    new_arrays = []

    # Iterate over the original DataFrame to create the new arrays
    for i in range(0, len(df3_scaled) - stack_size + 1, step_size):
        # Get a subset of arrays to stack
        arrays_subset = df3_scaled[i:i + stack_size, :]
        # Append the subset to the list of new arrays
        new_arrays.append(arrays_subset)

    # Convert the list of new arrays into a new 3D array
    df3_reshaped = np.stack(new_arrays)

    X = df3_reshaped.copy()
    y = df4[7:1440].values
    print(f"X.shape: {X.shape},\n y.shape: {y.shape},\n y: {y}")
    # count = 0
    # if count <=1: 
    #     print(f"X: {X}")
        # count += 1
    # Reshape y to a 2D array (required by OneHotEncoder)
    y = y.reshape(-1, 1)

    # Create an instance of the OneHotEncoder
    encoder = OneHotEncoder(sparse_output=False)

    # Fit and transform the data to one-hot encoded representation
    y_onehot = encoder.fit_transform(y)
    print(f"y_onehot.shape: {y_onehot.shape},\n y.shape: {y.shape},\n y_onehot[0]: {y_onehot[0]}")
    return X, y_onehot

class OrderbookDataset(Dataset):
    def __init__(self, train, target):
        self.train = train
        self.target = target
    def __len__(self):
        return self.train.shape[0]
    def __getitem__(self, idx):
        train = self.train[idx]
        target = self.target[idx]
        train = torch.tensor(train, dtype=torch.float32, requires_grad=True)
        target = torch.tensor(target, dtype=torch.float32)
        return train, target

# CD(X_train, y_train).__getitem__(1)
X, y = train_test(df)
data = OrderbookDataset(X, y)
X, y = data.__getitem__(1)
print(f"X: {X},\n y: {y}")
dl = DataLoader(data, batch_size=16, shuffle=False) #, num_workers=1, pin_memory=False

X.shape: (1433, 8, 4),
 y.shape: (1433,),
 y: [2 1 0 ... 1 1 2]
y_onehot.shape: (1433, 3),
 y.shape: (1433, 1),
 y_onehot[0]: [0. 0. 1.]
X: tensor([[0.1885, 0.3431, 0.3843, 0.6152],
        [0.1888, 0.0685, 0.3858, 0.5828],
        [0.1891, 0.1453, 0.4137, 0.6379],
        [0.1887, 0.2870, 0.3655, 0.6534],
        [0.1883, 0.0628, 0.3699, 0.6277],
        [0.1886, 0.0192, 0.4201, 0.5690],
        [0.1888, 0.2880, 0.4340, 0.5915],
        [0.1884, 0.4017, 0.4417, 0.7280]], requires_grad=True),
 y: tensor([0., 1., 0.])


In [492]:
# To check if dataloader is working
for d in dl:
    print(d)
    print(d[0].shape)
    # d[0] is your input
    # d[1] is your label
    break

[tensor([[[0.1873, 0.4184, 0.4108, 0.6577],
         [0.1885, 0.3431, 0.3843, 0.6152],
         [0.1888, 0.0685, 0.3858, 0.5828],
         [0.1891, 0.1453, 0.4137, 0.6379],
         [0.1887, 0.2870, 0.3655, 0.6534],
         [0.1883, 0.0628, 0.3699, 0.6277],
         [0.1886, 0.0192, 0.4201, 0.5690],
         [0.1888, 0.2880, 0.4340, 0.5915]],

        [[0.1885, 0.3431, 0.3843, 0.6152],
         [0.1888, 0.0685, 0.3858, 0.5828],
         [0.1891, 0.1453, 0.4137, 0.6379],
         [0.1887, 0.2870, 0.3655, 0.6534],
         [0.1883, 0.0628, 0.3699, 0.6277],
         [0.1886, 0.0192, 0.4201, 0.5690],
         [0.1888, 0.2880, 0.4340, 0.5915],
         [0.1884, 0.4017, 0.4417, 0.7280]],

        [[0.1888, 0.0685, 0.3858, 0.5828],
         [0.1891, 0.1453, 0.4137, 0.6379],
         [0.1887, 0.2870, 0.3655, 0.6534],
         [0.1883, 0.0628, 0.3699, 0.6277],
         [0.1886, 0.0192, 0.4201, 0.5690],
         [0.1888, 0.2880, 0.4340, 0.5915],
         [0.1884, 0.4017, 0.4417, 0.7280],
      

In [517]:
from pathlib import Path

def get_config():
    # Return a dictionary containing various configuration parameters for the Transformer model
    return {
        "batch_size": 8, # process 6 days at a time
        "num_epochs": 20, 
        "lr": 10**-4, 
        "seq_len": 8, # max sequence length - 24 hours - 1 day
        "d_model": 4, # embedding vector size if it was a word
        "h": 2, # number of heads in multi-head attention 
        "N": 4, # number of encoder and decoder layers
        "dropout": 0.1,
        "d_ff": 128, # feed forward layer size
        "num_classes": 3, # number of classes
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": None,
        # "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }

def get_weights_file_path(config, epoch: str):
    # Build the file path for saving the model weights for a given epoch
    model_folder = config['model_folder']
    model_basename = config['model_basename']
    model_filename = f"{model_basename}{epoch}.pt"
    return str(Path('.') /model_folder / model_filename)

In [519]:
import math
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

class LayerNormalization(nn.Module):

    def __init__(self, eps: float = 10**-6) -> None: # eps is for numerical stability
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1)) # Multiplied. alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(1)) # Added. bias is a learnable parameter

    def forward(self, x):
        # x = x.float()
        
        # x: (Batch_size, seq_len, hidden_size)
        # Keep the dimension from broadcasting (mean cancels the dimension when applied so dimension so flag needed)
        mean = x.mean(-1, keepdim=True) # (Batch_size, seq_len, 1)
        std = x.std(-1, keepdim=True)
        # eps is to prevent dividing by zero when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias
        # Does not change the diemension of input tensor

class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1) -> None:
        super().__init__()
        # Create the first linear layer (fully connected) with input size d_model and output size d_ff.
        self.linear_1 = nn.Linear(d_model, d_ff)  # W1 & B1 (bias is True by default)
        
        # Create a dropout layer with dropout probability specified by the dropout argument.
        self.dropout = nn.Dropout(dropout)
        
        # Create the second linear layer (fully connected) with input size d_ff and output size d_model.
        self.linear_2 = nn.Linear(d_ff, d_model)  # W2 & B2 (bias is True by default)

    def forward(self, x):
        # x: Input tensor representing a batch of hourly market features. -> (Batch_size, seq_len, d_model)
        
        # Apply the first linear layer to the input tensor. -> (Batch_size, seq_len, d_ff).
        output_linear_1 = self.linear_1(x)

        # Apply the ReLU activation function to the output of the first linear layer.
        # The ReLU activation function returns the element-wise maximum of 0 and the input tensor.
        # The output shape remains the same as (Batch_size, seq_len, d_ff).
        output_relu = torch.relu(output_linear_1)
        # try LeakyReLU, GELU, SiLU

        # Apply dropout to the output of the ReLU activation function.
        # Dropout sets a fraction of elements in the input tensor to zero randomly to improve regularization during training.
        # The output shape remains the same as (Batch_size, seq_len, d_ff).
        output_dropout = self.dropout(output_relu)

        # Apply the second linear layer to the output of the dropout layer.
        # The output shape is (Batch_size, seq_len, d_model), which matches the input shape.
        output_linear_2 = self.linear_2(output_dropout)

        # The final output of the FeedForwardBlock is the output of the second linear layer.
        # print(f"FeedForwardBlock shape: {output_linear_2.shape}")
        return output_linear_2

class SingleHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size = 1

        self.w_q = nn.Linear(d_model, d_model)  # Wq
        self.w_k = nn.Linear(d_model, d_model)  # Wk
        self.w_v = nn.Linear(d_model, d_model)  # Wv
        self.w_o = nn.Linear(d_model, d_model)  # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask=None, dropout=nn.Dropout):
        d_k = query.shape[-1]  # Get the last dimension of the query tensor

        # Applying the attention formula from the paper (scaled dot-product attention)
        # (Batch_size, seq_len, d_k) -> (Batch_size, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        # Apply masking if provided to handle padded tokens or future tokens
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
            # The large negative number becomes close to zero in the softmax function

        # Apply the softmax activation function along the last dimension to get attention weights
        # No change in shape: (Batch_size, seq_len, seq_len)
        attention_weights = torch.softmax(attention_scores, dim=-1)

        # Apply dropout to the attention weights if dropout is provided
        if dropout is not None:
            # m = dropout(0.1)
            attention_weights = dropout(attention_weights)

        # Compute the weighted sum of the value tensor using the attention weights
        # (attention_weights @ value) -> (Batch_size, seq_len, d_k) @ (Batch_size, seq_len, d_k)
        # attention is for visualization
        return (attention_weights @ value), attention_weights

    def forward(self, q, k, v, mask=None):
        # Apply linear transformations to get the query, key, and value tensors
        # (Batch_size, seq_len, d_model) -> (Batch_size, seq_len, d_model)
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Calculate the attention scores using the static method attention()
        # x is the weighted sum of the value tensor using attention weights
        # (Batch_size, seq_len, d_k), self.attention_scores has the attention weights
        x, self.attention_scores = SingleHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Apply the output linear transformation and return the output tensor
        # (Batch_size, seq_len, d_model) -> (Batch_size, seq_len, d_model)
        return self.w_o(x)

class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size
        self.h = h  # Number of heads

        # d_model must be divisible by h because we will split d_model into h heads
        assert d_model % h == 0, "d_model must be divisible by h"  # Check for validity

        self.d_k = d_model // h  # Dimension of vector seen by each head

        # Linear transformations for query, key, value, and output projections for each head
        self.w_q = nn.Linear(d_model, d_model)  # Wq
        self.w_k = nn.Linear(d_model, d_model)  # Wk
        self.w_v = nn.Linear(d_model, d_model)  # Wv
        self.w_o = nn.Linear(d_model, d_model)  # Wo
        self.dropout = nn.Dropout(dropout)

        # print(f"w_q: {self.w_q.shape}, w_k: {self.w_k.shape}, w_v: {self.w_v.shape}, w_o: {self.w_o.shape}")


    @staticmethod
    def attention(query, key, value, mask=None, dropout=nn.Dropout):
        d_k = query.shape[-1]  # Get the last dimension of the query tensor

        # Applying the attention formula from the paper (scaled dot-product attention)
        # (Batch_size, h, seq_len, d_k) -> (Batch_size, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        # print(f"attention_scores: {attention_scores.shape}")
        # order of execution:
        # 1. transpose(-2, -1) -> (Batch_size, h, seq_len, d_k) -> (Batch_size, h, d_k, seq_len)
        # 2. query @ key -> (Batch_size, h, seq_len, seq_len)
        # 3. / math.sqrt(d_k) -> (Batch_size, h, seq_len, seq_len)

        # Apply masking if provided to handle padded tokens or future tokens
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
            # The large negative number becomes close to zero in the softmax function

        # Apply the softmax activation function along the last dimension to get attention weights
        # No change in shape: (Batch_size, h, seq_len, seq_len)
        attention_weights = torch.softmax(attention_scores, dim=-1)

        # Apply dropout to the attention weights if dropout is provided
        if dropout is not None:
            # m = dropout(0.1)
            attention_weights = dropout(attention_weights)

        # Compute the weighted sum of the value tensor using the attention weights
        # (attention_weights @ value) -> (Batch_size, h, seq_len, seq_len) @ (Batch_size, h, seq_len, d_k)
        # attention is for visualization
        return (attention_weights @ value), attention_weights
        

    def forward(self, q, k, v, mask):
        # Apply linear transformations to get the query, key, and value tensors
        # (Batch_size, seq_len, d_model) -> (Batch_size, seq_len, d_model)
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # print(f"query: {query.shape}, key: {key.shape}, value: {value.shape}")
        # Split the query, key, and value into h different parts (h = Number of heads)
        # We split d_model into h parts where each part is d_k in size
        # (Batch_size, seq_len, d_model) -> (Batch_size, seq_len, h, d_k) -> (Batch_size, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # Calculate the attention scores using the static method attention()
        # x is the weighted sum of the value tensor using attention weights
        # (Batch_size, h, seq_len, d_k), self.attention_scores has the attention weights
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Reshape and combine the heads to get the output tensor
        # .transpose() : (Batch_size, h, seq_len, d_k) -> (Batch_size, seq_len, h, d_k)
        # .view() : (Batch_size, seq_len, h, d_k) -> (Batch_size, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Apply the output linear transformation and return the output tensor
        # (Batch_size, seq_len, d_model) -> (Batch_size, seq_len, d_model)
        return self.w_o(x)

class ResidualConnection(nn.Module):

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)    # Dropout layer with the specified dropout rate
        self.norm = LayerNormalization()      # LayerNormalization instance

    def forward(self, x, sublayer):
        # Apply the normalization layer to the input tensor 'x'
        normalized_input = self.norm(x)
        
        # Apply the sublayer (e.g., a feedforward block or a multi-head attention block) to the normalized input
        # The sublayer can be any neural network component that processes the input tensor
        intermediate_output = sublayer(normalized_input)
        
        # In the attention is all you need paper, sublayer is applied before the normalization layer.
        # It doesn't matter, both work fine.
        # In fact normalizing before applying the sublayer helps with numerical stability.

        # Apply dropout to the intermediate output
        dropout_output = self.dropout(intermediate_output)
        
        # Add the dropout output to the original input tensor (residual connection)
        # This is the essence of the residual connection: x + Dropout(Sublayer(LayerNormalization(x)))
        # The addition operation allows the intermediate outputs of the sublayer to flow directly to the output
        # and be added to the original input, bypassing the sublayer in case it does not capture useful information.
        # This helps the model to learn residual mappings (the difference between the input and output)
        # and facilitates training of deeper neural networks.
        output = x + dropout_output

        return output        
        # All of the above operations can be combined into a single line as follows:
        # return x + self.dropout(sublayer(self.norm(x)))

class EncoderBlock(nn.Module):

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()

        # Multi-Head Self-Attention Block
        self.self_attention_block = self_attention_block

        # Feed-Forward Block
        self.feed_forward_block = feed_forward_block

        # Residual Connections
        # Create two ResidualConnection instances with the specified dropout rate
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        # Step 1: Self-Attention Sublayer
        # Apply Multi-Head Self-Attention to the input 'x' with the mask 'src_mask' (encoder mask)
        # and pass the result through a ResidualConnection.
        # The ResidualConnection adds the input tensor 'x' to the output of the self-attention sublayer.
        # This is the first residual connection in the encoder block.
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))

        # Step 2: Feed-Forward Sublayer
        # Apply the Feed-Forward Block to the output of the self-attention sublayer and
        # pass the result through another ResidualConnection.
        # The ResidualConnection adds the input tensor 'x' to the output of the feed-forward sublayer.
        # This is the second residual connection in the encoder block.
        x = self.residual_connections[1](x, self.feed_forward_block)

        # Return the final output after both the self-attention and feed-forward sublayers.
        # The output tensor has shape (batch_size, seq_len, d_model)
        return x

class Encoder(nn.Module):

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()

        # Store the list of Encoder blocks (layers)
        self.layers = layers

        # Layer normalization instance to be applied after all Encoder blocks
        self.norm = LayerNormalization()

    def forward(self, x, src_mask):
        # Iterate through the Encoder blocks (layers) and apply each block's forward pass to the input tensor 'x'
        for layer in self.layers:
            x = layer(x, src_mask)

        # Apply layer normalization to the output tensor 'x'
        # Layer normalization is applied after processing all Encoder blocks.
        # shape of x: (batch_size, seq_len, d_model)
        return self.norm(x)

class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, num_classes: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, num_classes)

    def forward(self, x):
        logits = self.proj(x)  # Get the raw scores from the linear projection
        # print(f"logits shape: {logits.shape}\n logits: {logits[0]}")
        # print(f"LOGITS: {logits[0][-1]}")
        probabilities = torch.softmax(logits[:, -1, :], dim=-1)  # Apply softmax to get probabilities
        # print(f"probabilities shape: {probabilities.shape}\nprobabilities: {probabilities}")
        # Apply average pooling along the second dimension (seq_len)
        # pooled_probs = torch.mean(probabilities, dim=1)  # Shape: (16, 3)
        # return torch.argmax(probabilities, dim=-1)  # Return the class indices with highest probabilities
        return probabilities

class Transformer(nn.Module):
    
    def __init__(self, encoder: Encoder, projection_layer: ProjectionLayer) -> None:
        super().__init__()

        # Store the encoder, input embeddings and projection layer
        self.encoder = encoder
        # self.src_embed = src_embed
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # Step 1: Source Embedding
        # Convert the source input 'src' into embeddings using the src_embed layer.
        # src = self.src_embed(src)
        print(f"src shape: {src.shape}")
        print(f"src_mask shape: {src_mask.shape}")
        # Step 2: Encoding
        # Pass the source embeddings through the encoder to obtain contextual representations.
        encoder_output = self.encoder(src, src_mask)

        # Return the encoder output, which contains the contextual representations of the source sequence.
        return encoder_output
    
    def project(self, x):
        # Step 1: Projection
        # Pass the input tensor 'x' through the projection_layer to perform the final classification.
        # This maps the contextual representations to the number of classes for the classification task.
        return self.projection_layer(x)

def build_transformer(seq_len: int = 4, d_model: int = 1, num_classes: int = 3, batch_size: int = 24, N: int = 4, h: int = 2, dropout: float = 0.1, d_ff: int = 64) -> Transformer:
    # Create input embeddings
    # Input data has shape (batch_size, seq_len, num_features)
    # Batch size is the number of samples in the batch
    # InputEmbeddings class will individually process each sample in the batch
    # src_embed = InputEmbeddings(batch_size, seq_len, d_model)
    # print(f"src_embed shape: {src_embed}")
    encoder_blocks = []
    # Create N Encoder blocks
    for _ in range(N):
        # Create the masked multi-head self-attention block and feed-forward block
        self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)

        # Combine the self-attention and feed-forward blocks to create an Encoder block
        encoder_block = EncoderBlock(self_attention_block, feed_forward_block, dropout)

        # Add the Encoder block to the list
        encoder_blocks.append(encoder_block)
    
    # Create the Encoder
    encoder = Encoder(nn.ModuleList(encoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, num_classes)

    # Create the Transformer
    transformer = Transformer(encoder, projection_layer)

    # Initialize the weights of the transformer using Xavier uniform initialization
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # Return the built transformer
    return transformer

def causal_mask(seq_len):
    # Create a mask that prevents the decoder from attending to future positions.
    # The mask will have 0s above the main diagonal and 1s below it.
    mask = torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1).type(torch.int)
    return mask == 0

def get_model(config):
    model = build_transformer(config['seq_len'], config['d_model'], config['num_classes'], config['batch_size'], config['N'], config['h'], config['dropout'], config['d_ff'])
    return model


def train_model(config):
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using device:', device)

    # Make sure that weights foler exists
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    # print(f"Input df shape: {df2.shape}")
    # Load the datas
    # train_dataloader, val_dataloader, test_dataloader = get_ds(config, df2)
    
    X, y = train_test(df)
    # print(f"X shape: {X.shape}, X: {X}")
    # print(f"y shape: {y.shape}, y: {y}")
    data = OrderbookDataset(X, y)
    train_dataloader = DataLoader(data, batch_size=16, shuffle=False) 

    # Load the model
    model = get_model(config).to(device)

    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

    initial_epoch = 0
    global_step = 0
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f"Preloading model weights from {model_filename}")
        state = torch.load(model_filename)
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer'])
        global_step = state['global_step']
    
    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(initial_epoch, config['num_epochs']):
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch {epoch:02d}')
        for batch in batch_iterator:
            # print(f"batch: {batch[0]}")
            encoder_input = batch[0].to(device) # (batch_size, seq_len)
            label = batch[1].to(device)

            # print(f"encoder_input.shape: {encoder_input.shape}")
            # print(f"label.shape: {label.shape}")
            # print(f"label dtype: {label.dtype}")

            # Generate a causal mask
            seq_len = encoder_input.size(1)
            causal_mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).to(device)
            # print(f"causal_mask.shape: {causal_mask.shape}")

            # Forward pass
            encoder_output = model.encoder(encoder_input, causal_mask) # (batch_size, seq_len, d_model)
            pred_label = model.projection_layer(encoder_output) # (batch_size, seq_len, num_classes)
            # print(f"pred_label.shape: {pred_label.shape}, pred_label: {pred_label}")
            # print(f"label.shape: {label.shape}", f"label: {label}")

            # Squeeze the label tensor to remove the extra dimension
            # Squeeze the extra dimension from the label tensor
            # label = label.squeeze(dim=1)  # (batch_size)
            # log_probs = label.float()

            # Detach the label tensor from the computation graph
            # with torch.inference_mode():
                # label_detached = label.detach()
            # label = label.float()

            def cross_entropy(input, target):
                return torch.mean(-torch.sum(target * torch.log(input), 1))


            # Compute the loss
            # loss = criterion(pred_label.float(), label)
            loss = cross_entropy(pred_label.float(), label.float())
            # print(f"loss: {loss}, loss_grad: {loss.grad_fn}")
            # Compute the loss
            # Reshape log_probs and label to align with the expected shape for NLLLoss
            # loss = criterion(log_probs.view(-1, log_probs.size(-1)), label.view(-1))
            batch_iterator.set_postfix({f'loss': f"{loss.item():6.3f}"})

            # Log the loss to Tensorboard
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagation
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad()

            global_step += 1

        # Save the model after each epoch
        model_filename = get_weights_file_path(config, f'{epoch:02d}')
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename
        )


In [520]:
config = get_config()
train_model(config)

Using device: cpu
X.shape: (1433, 8, 4),
 y.shape: (1433,),
 y: [2 1 0 ... 1 1 2]
y_onehot.shape: (1433, 3),
 y.shape: (1433, 1),
 y_onehot[0]: [0. 0. 1.]


Processing epoch 00: 100%|██████████| 90/90 [00:01<00:00, 87.68it/s, loss=0.884] 
Processing epoch 01: 100%|██████████| 90/90 [00:00<00:00, 94.06it/s, loss=0.959] 
Processing epoch 02: 100%|██████████| 90/90 [00:00<00:00, 124.43it/s, loss=1.030]
Processing epoch 03: 100%|██████████| 90/90 [00:00<00:00, 110.99it/s, loss=0.996]
Processing epoch 04: 100%|██████████| 90/90 [00:00<00:00, 117.20it/s, loss=0.927]
Processing epoch 05: 100%|██████████| 90/90 [00:00<00:00, 113.39it/s, loss=0.941]
Processing epoch 06: 100%|██████████| 90/90 [00:00<00:00, 107.59it/s, loss=0.921]
Processing epoch 07: 100%|██████████| 90/90 [00:00<00:00, 121.28it/s, loss=0.982]
Processing epoch 08: 100%|██████████| 90/90 [00:00<00:00, 112.09it/s, loss=1.053]
Processing epoch 09: 100%|██████████| 90/90 [00:00<00:00, 110.46it/s, loss=0.998]
Processing epoch 10: 100%|██████████| 90/90 [00:00<00:00, 115.63it/s, loss=0.997]
Processing epoch 11: 100%|██████████| 90/90 [00:00<00:00, 109.77it/s, loss=0.988]
Processing epoch

In [488]:
def cross_entropy(input, target):
    return torch.mean(-torch.sum(target * torch.log(input), 1))


y = torch.Tensor([[0, 0, 1]])
yhat = torch.Tensor([[0.1, 0.2, 0.7]])
cross_entropy(yhat, y)

tensor(0.3567)

Training loop works but model does not learn. Need to tune params.

### Everything below this is rough work.

In [225]:
class CustDat()


class OrderbookDataset(Dataset):

    def __init__(self):
        # data loading
        xy = df2
        self.x = torch.from_numpy(df2[['SOBV', 'Stoch', 'ForceIndex', 'MACD']].values) # 1440 x 4
        self.y = torch.from_numpy(df2[['label']].values) # 1440
        self.n_samples = xy.shape[0] # 1440
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

# Create the dataset
dataset = OrderbookDataset()

X, y = dataset[:]

def create_rolling_windows(tensor, window_size):
    num_windows = tensor.shape[0] - window_size + 1
    windows = [tensor[i:i+window_size] for i in range(num_windows)]
    return torch.stack(windows)

# Define the rolling window size
window_size = 4

# Create rolling windows along the first dimension (new dimension as you mentioned)
X_input = create_rolling_windows(X, window_size)
y_input = y[3:]

# Print the resulting tensor
print(X_input.shape) 
print(y_input.shape) 


# Step 1: Manually split the data into training, testing, and validation sets
# You can use slicing to achieve this. For example, splitting 70% for training, 15% for testing, and 15% for validation:
# train_size = int(864) # 36 days
# test_size = int(288) # 12 days
# val_size = int(288) # 12 days

# train_data = dataset[:train_size]
# val_data = dataset[train_size:train_size+val_size]
# test_data = dataset[train_size+val_size:train_size+val_size+test_size]



# X_train, y_train = train_data
# X_val, y_val = val_data
# X_test, y_test = test_data

# print(f"X_train.shape: {X_train}")

# print(f"seq_len: {seq_len}")

# Create an upper triangular mask of shape (seq_len, seq_len)
# upper_triangular_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
# upper_triangular_mask
# Broadcast the mask to the batch size and unsqueeze to match the encoder_mask shape
# encoder_mask = upper_triangular_mask.unsqueeze(0).expand(encoder_mask.shape[0], -1, -1)

# Now, encoder_mask will have the shape (batch_size, seq_len, seq_len)

torch.Size([1437, 4, 4])
torch.Size([1437, 1])


In [105]:
def causal_mask(seq_len):
    mask = torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1).type(torch.int)
    return mask == 0

seq_len = 5
mask = causal_mask(seq_len)

print(mask)

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])


In [94]:
# # Step 2: Convert the data into PyTorch tensors
# X_train = torch.tensor(df2.values, dtype=torch.float32)
# print(f"shape: {X_train.shape}")
# # Reshape X_train to have the desired shape (36 sets of 24 rows, each with 4 elements)
# num_sets = 36
# num_rows_per_set = 24
# num_elements = 4
# X_train = X_train.reshape(num_sets, num_rows_per_set, num_elements)
# X_train.shape

In [95]:
# X = df2[:1440].values  # (1440, 4) - Convert to numpy array
# y = df2[:1440].values   # (1440,) - Convert to numpy array

# # len(X)
# # Reshape X to the desired shape [batch_dimension, seq_len=24, d_model=4]
# num_sets = len(X) // 24
# X = X[:num_sets * 24].reshape(num_sets, 24, 4)

# print(X.shape)
# print(X)

# # Convert X and y to PyTorch tensors
# # X = torch.tensor(X, dtype=torch.float32)
# # y = torch.tensor(y, dtype=torch.float32)

In [None]:
class InputEmbeddings(nn.Module):
    def __init__(self, batch_size: int, seq_len: int, d_model: int) -> None:
        super().__init__()

        self.batch_size = batch_size
        self.seq_len = seq_len
        self.d_model = d_model
        self.embedding = nn.Linear(1, d_model)

    def forward(self, x):
        # x is the input tensor of dimension (d_model, seq_len) -> (4, 24)

        # Transpose the input tensor to dimension (seq_len, d_model) -> (24, 4)
        x = x.transpose(0, 1)

        # Add dimension for batch dimension (batch_size, seq_len, d_model) -> (1, 24, 4)
        x = x.unsqueeze(0)

        # Repeat the tensor for batch dimension
        # x = x.repeat(self.batch_size, 2, 1)

        # Scale the tensor by square root of d_model
        x = x * math.sqrt(self.d_model)

        # (1, 24, 4)
        print(f"InputEmbedding shape: {x.shape}")
        return x 

In [None]:
# df2
# df2.info()




# x[0], y[0]
# x, y

# df2.shape[0]
# # Step 1: Reshape the 2D tensor into a 3D tensor with 60 elements along the 0th dimension
# tensor_3d = X_train.view(60, 24, 4)

# # Step 2: Create a 1D tensor for the labels
# labels = y_train.view(60)

# # Print the resulting shapes
# print(tensor_3d.shape)  # torch.Size([60, 24, 4])
# print(labels.shape)     # torch.Size([60])

1440

In [30]:
import torch
from torch.utils.data import Dataset, DataLoader



# dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=False)

# dataiter = iter(dataloader)
# data = dataiter.__next__()
# features, labels = data
# print(features, labels)
# # Convert each row into a tensor
# row_tensors = [torch.tensor(row, dtype=torch.float32) for _, row in df2.iterrows()]

# # Concatenate all tensors along a new outer dimension (stacking them)
# X = torch.stack(row_tensors)
# # print(X)
# # X[2,1]
# # Convert the 'label' column into a tensor
# labels = torch.tensor(df['label'].values, dtype=torch.int64)

# # Print the resulting shape
# print(X.shape)
# print(labels.shape)

(tensor([ 4.4885e+06,  6.2780e+00, -8.8032e+06,  2.9213e-01],
        dtype=torch.float64),
 tensor([2]))

In [None]:
# Reshape the 2D tensor into a 3D tensor with 60 elements along the 0th dimension
tensor_3d = X.view(60, 24, 4)

# Print the resulting shape
print(tensor_3d.shape)


RuntimeError: shape '[60, 24, 4]' is invalid for input of size 7200

        (5): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (l2): Dropout(p=0.3, inplace=False)
  (l3): Linear(in_features=768, out_features=1, bias=True)


how did they get the out features to be 1 in the end? that is what I am looking for.

what should I do after the below code to get a single output which I will apply softmax on?



In [None]:
# class InputData(nn.Module):

class ProjectionLayer(nn.Module):

    def __init__(self, d_model: int, class_size: int) -> None:
        super().__init__()

        # Create a linear projection layer that maps the input 'd_model' dimensional tensor to 'class_size' dimensional tensor.
        self.proj = nn.Linear(d_model, class_size)

    def forward(self, encoder_output):
        # encoder_output has shape (batch_size, seq_len, d_model)
        # We want to perform classification along the seq_len dimension, so we use mean pooling.
        # We can also use max pooling or other pooling strategies.
        pooled_output = torch.mean(encoder_output, dim=1)  # shape: (batch_size, d_model)
        
        # Apply the linear projection layer to the input tensor 'x', followed by a log-softmax operation along the last dimension (class_size).
        # The log-softmax operation converts the raw scores into log-probabilities, making it more numerically stable during training.
        # The shape changed from (batch_size, seq_len, d_model) to (batch_size, seq_len, class_size).
        return torch.log_softmax(self.proj(encoder_output), dim=-1)

import torch.nn.functional as F
class ClassificationHead(nn.Module):
    def __init__(self, d_model, num_classes):
        super().__init__()
        self.linear = nn.Linear(d_model, num_classes)

    def forward(self, encoder_output):
        # encoder_output has shape (batch_size, seq_len, d_model)
        # We want to perform classification along the seq_len dimension, so we use mean pooling.
        # We can also use max pooling or other pooling strategies.
        pooled_output = torch.mean(encoder_output, dim=1)  # shape: (batch_size, d_model)

        # Apply the linear layer for classification.
        logits = self.linear(pooled_output)  # shape: (batch_size, num_classes)

        # Apply the softmax function to get the probabilities for each class.
        probabilities = F.softmax(logits, dim=1)  # shape: (batch_size, num_classes)

        return logits, probabilities

class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, src_embed: InputData,
                 projection_layer: ProjectionLayer) -> None:
        super().__init__()

        # Store the encoder, decoder, input embeddings, positional encodings, and projection layer
        self.encoder = encoder
        # self.decoder = decoder
        self.src_embed = src_embed # Gets data and just adds batch dimensions. Create class for this
        # self.tgt_embed = tgt_embed
        # self.src_pos = src_pos
        # self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):

        # Step 1: Source Embedding
        # Convert the source input 'src' into embeddings using the src_embed layer.
        src = self.src_embed(src)

        # Step 2: Encoding
        # Pass the source embeddings through the encoder to obtain contextual representations.
        encoder_output = self.encoder(src, src_mask)

        # Return the encoder output, which contains the contextual representations of the source sequence.
        return encoder_output

    # def decode(self, encoder_output, src_mask, tgt, tgt_mask):
    #     # Step 1: Target Embedding
    #     # Convert the target input 'tgt' into embeddings using the tgt_embed layer.
    #     tgt = self.tgt_embed(tgt)

    #     # Step 2: Target Positional Encoding
    #     # Apply the positional encoding to the target embeddings.
    #     tgt = self.tgt_pos(tgt)

    #     # Step 3: Decoding
    #     # Pass the target embeddings through the decoder along with encoder_output,
    #     # source mask 'src_mask', and target mask 'tgt_mask' to obtain decoder outputs.
    #     decoder_output = self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    #     # Return the decoder output, which contains the contextual representations of the target sequence.
    #     return decoder_output

    def project(self, x):
        # Step 1: Projection
        # Pass the input tensor 'x' through the projection_layer to perform the final classification.
        # This maps the contextual representations to the number of classes for the classification task.
        return self.projection_layer(x)

In [None]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int = 512,
                      d_model: int=512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    # Create embedding layers for the source and target vocabularies
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create positional encoding layers for the source and target sequences
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # Create a list to hold encoder blocks
    encoder_blocks = []
    # Create 'N' encoder blocks and append them to the list
    for _ in range(N):
        # Create the self-attention block and feed-forward block for the encoder
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        # Create an encoder block using the self-attention and feed-forward blocks
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        # Add the encoder block to the list
        encoder_blocks.append(encoder_block)

    # Create a list to hold decoder blocks
    decoder_blocks = []
    # Create 'N' decoder blocks and append them to the list
    for _ in range(N):
        # Create the self-attention block, cross-attention block, and feed-forward block for the decoder
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        # Create a decoder block using the self-attention, cross-attention, and feed-forward blocks
        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        # Add the decoder block to the list
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder using the ModuleList of encoder and decoder blocks
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Create the projection layer to map the contextual representations to the target vocabulary size
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # Create the transformer using the encoder, decoder, input embeddings, positional encodings, and projection layer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the weights of the transformer using Xavier uniform initialization
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # Return the built transformer
    return transformer

In [None]:
from torch.utils.data import random_split

# Step 1: Split dataset into train set and temporary set (test + validation)
train_size = int(0.7 * len(X))
temp_size = len(X) - train_size
train_dataset, temp_dataset = random_split(dataset, [train_size, temp_size])

# Step 2: Split the temporary set into test and validation sets (without shuffling)
test_size = int(0.5 * len(temp_dataset))
val_size = len(temp_dataset) - test_size
test_dataset, val_dataset = random_split(temp_dataset, [test_size, val_size])

In [56]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = time_series_split(X, y, test_size=0.2, shuffle=False)

len(X_train), len(X_test), len(y_train), len(y_test)

(4533, 1134, 4533, 1134)

In [1]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

NameError: name 'X_train' is not defined

In [42]:
# Convert the data to PyTorch tensors
X_train = torch.FloatTensor(X_train.values)
X_test = torch.FloatTensor(X_test.values)
y_train = torch.LongTensor(y_train.values)
y_test = torch.LongTensor(y_test.values)

X_train, X_test, y_train, y_test

(tensor([[-7.8589e-01, -7.4098e+05,  4.1842e+01,  ..., -3.7641e-01,
          -4.2820e-01, -3.5978e-01],
         [ 8.0471e-01,  5.2894e+06,  3.4310e+01,  ..., -4.3196e-01,
          -3.7641e-01, -4.2820e-01],
         [ 3.7654e-01,  7.1006e+06,  6.8476e+00,  ..., -4.5715e-01,
          -4.3196e-01, -3.7641e-01],
         ...,
         [-3.1682e-01,  2.0341e+10,  6.0830e+01,  ...,  6.7134e-01,
           2.9493e-01, -1.8934e-02],
         [ 2.1426e-01,  2.0289e+10,  5.6835e+01,  ...,  6.7695e-01,
           6.7134e-01,  2.9493e-01],
         [-9.6058e-01,  2.0273e+10,  6.4006e+01,  ...,  4.8859e-01,
           6.7695e-01,  6.7134e-01]]),
 tensor([[-7.5918e-01,  2.0298e+10,  9.5028e+01,  ...,  1.1911e-01,
           4.8859e-01,  6.7695e-01],
         [ 8.3546e-01,  2.0387e+10,  9.9562e+01,  ..., -3.3670e-02,
           1.1911e-01,  4.8859e-01],
         [-1.7363e-01,  2.0438e+10,  8.2730e+01,  ...,  5.8430e-02,
          -3.3670e-02,  1.1911e-01],
         ...,
         [ 7.3064e-01,  2

In [43]:
y.value_counts()

1    2298
2    1960
0    1409
Name: label, dtype: int64

In [44]:
# Device agnostic
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [45]:
# Calculate accuracy
def accuracy(y_true, y_pred):
    correct = torch.eq(y_true, y_pred.round()).sum().item()
    acc = correct / len(y_true) * 100
    return acc

In [79]:
# Build a model
class MulticlassModel(nn.Module):
    def __init__(self, num_features, output_features, hidden_units=256):
        super().__init__()
        # Define layers
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(in_features=num_features, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=output_features)
            )
    # Define forward method
    def forward(self, X):
        return self.linear_layer_stack(X)

# Instantiate model class & send to target device
model_4 = MulticlassModel(num_features=55,
                          output_features=3
                          ).to(device)
model_4

MulticlassModel(
  (linear_layer_stack): Sequential(
    (0): Linear(in_features=55, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=3, bias=True)
  )
)

In [1]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Training Loop & Testing loop
# Set random seed
torch.manual_seed(42)

# Put data on device
X_train, X_test = X_train.to(device), X_test.to(device)
y_train, y_test = y_train.to(device), y_test.to(device)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model_4.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.1, verbose=True)

epochs = 1000

for epoch in range(epochs):
    model_4.train()
    
    # Forward pass
    y_logits = model_4(X_train)
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # logits -> probs -> labels

    # Calculate loss/accuracy
    loss = loss_fn(y_logits, y_train)
    train_acc = accuracy(y_true=y_train,
                         y_pred=y_pred)
    
    optimizer.zero_grad() # reset gradients to zero
    loss.backward() # backpropagation
    optimizer.step() # gradient descent

    model_4.eval()
    with torch.no_grad():  # Use torch.no_grad() instead of torch.inference_mode()
        
        test_logits = model_4(X_test)
        test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)

        test_loss = loss_fn(test_logits, y_test)
        # Update the learning rate based on the validation loss
        scheduler.step(test_loss)

        test_acc = accuracy(y_true=y_test,
                            y_pred=test_pred)
        
        if (epoch+1) % 100 == 0:
            print(f"Epoch: {epoch+1}, Loss: {loss:.5f}, Train Acc: {train_acc:.2f}%, Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")


NameError: name 'torch' is not defined

In [45]:
import numpy as np
import pandas as pd
import sqlite3
import talipp as tp
from talipp.ohlcv import OHLCV
from talipp.indicators import BOP, SOBV, Stoch, ForceIndex, MACD
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [46]:
connection = sqlite3.connect('etf_data.db')
etf_list = ['spy', 'tlt', 'hyg', 'lqd', 'vnq']
df = pd.read_sql_query(f"SELECT * FROM {etf_list[0]}", connection)
connection.close()

In [47]:
df.rename(columns={df.columns[4]: 'close'}, inplace=True)
df = df.sort_values(by='date', ascending=True)
df = df.reset_index(drop=True)

In [48]:
close = [float(row['close']) for index,row in df.iterrows()]
ohlcv = [OHLCV(float(row['open']), float(row['high']), float(row['low']), float(row['close']), float(row['volume']))
         for index, row in df.iterrows()]

In [49]:
# Define the indicators
bop = BOP(ohlcv)
sobv = SOBV(7, ohlcv)
stoch = Stoch(14, 3, ohlcv)
force_index = ForceIndex(13, ohlcv)
macd = MACD(12, 26, 9, close)

In [50]:
# Create new columns in the DataFrame
for i in range(5685):  # Assuming the length of the indicators matches the DataFrame length (5865 rows)
    df.loc[i, 'BOP'] = bop[i]
    df.loc[i, 'SOBV'] = sobv[i]
    df.loc[i, 'Stoch'] = stoch[i].k
    df.loc[i, 'ForceIndex'] = force_index[i]
    df.loc[i, 'MACD'] = macd[i].histogram

In [51]:
df['percentage_change'] = df['close'].shift(-5) / df['close'] - 1
df['percentage_change'] = df['percentage_change'].shift(5)

conditions = [
    (df['percentage_change'] >= 0.01),
    (df['percentage_change'] <= -0.01)
]
values = [2, 0]
df['label'] = np.select(conditions, values, default=1)

In [52]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'daily_change', 'perct_chg',
       'volume', 'BOP', 'SOBV', 'Stoch', 'ForceIndex', 'MACD',
       'percentage_change', 'label'],
      dtype='object')

In [53]:
# Assuming 'df' is your DataFrame
# pd.set_option('display.max_rows', 20)  # Set to None to display all rows
# pd.set_option('display.max_columns', None)  # Set to None to display all columns

nan_counts = df.isna().sum()
nan_counts

date                   0
open                   0
high                   0
low                    0
close                  0
daily_change           0
perct_chg              0
volume                 0
BOP                  179
SOBV                 179
Stoch                179
ForceIndex           179
MACD                 187
percentage_change      5
label                  0
dtype: int64

In [54]:
# Drop na
df = df.dropna()

In [59]:
# Prepare the data for a attention transformer model
features = ['BOP', 'SOBV', 'Stoch', 'ForceIndex', 'MACD']

# Choose the target column
target = 'label'

# Create X and y
X = df[features]
y = df[target]

In [60]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# len(X_train), len(X_test), len(y_train), len(y_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4541, 5), (1136, 5), (4541,), (1136,))

In [61]:
# Convert the data to PyTorch tensors
# Convert the data to PyTorch tensors
X_train = torch.FloatTensor(X_train.values)
X_test = torch.FloatTensor(X_test.values)
y_train = torch.LongTensor(y_train.values.reshape(-1, 1))  # Reshape to [batch_size, 1]
y_test = torch.LongTensor(y_test.values.reshape(-1, 1))  # Reshape to [batch_size, 1]


X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([4541, 5]),
 torch.Size([1136, 5]),
 torch.Size([4541, 1]),
 torch.Size([1136, 1]))

In [62]:
# set(y_train)

In [18]:
import torch
import torch.nn as nn
import math

inputs - raw data (words)

embedding - vector of floats - (d_model)

positional encoding



In [35]:

seq_len, d_model = 5, 128

# Create a matrix of shape (seq_len, d_model) to hold the positional encodings.
pe = torch.zeros(seq_len, d_model)
print(f"pe shape: {pe.shape}")

# Create a vector of shape (seq_len, 1) containing values from 0 to (seq_len - 1).
position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # numerator
print(f"position shape: {position.shape}")

# Create a vector of shape (d_model, 1) containing values corresponding to the exponent for each dimension.
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # denominator in log space (more numerically stable)
print(f"div_term shape: {div_term.shape}")

# Apply sine to even indices of the matrix to get the positional encoding for the even dimensions.
pe[:, 0::2] = torch.sin(position * div_term) # sin(position * (10000 ** (2i / d_model))
print(f"pe shape: {pe.shape}")

# Apply cosine to odd indices of the matrix to get the positional encoding for the odd dimensions.
pe[:, 1::2] = torch.cos(position * div_term) # cos(position * (10000 ** (2i / d_model))
print(f"pe shape: {pe.shape}")

# Add a batch dimension to the positional encoding to make it compatible with batched input.
pe = pe.unsqueeze(0) # (1, seq_len, d_model)
print(f"pe shape: {pe.shape}")

# pe

pe shape: torch.Size([5, 128])
position shape: torch.Size([5, 1])
div_term shape: torch.Size([64])
pe shape: torch.Size([5, 128])
pe shape: torch.Size([5, 128])
pe shape: torch.Size([1, 5, 128])


In [63]:
# Device agnostic
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [81]:
embeddings
skip postional encodings not reqd
layer vs batch normalization

we need batch normalization (gamma & beta)
- faster convergence
- reduced sensitivity to initialization
- regularization effect
- higher learning rates



Decoder
Masked Multi head attention
- makes the model causal. i.e. it can only attend to the previous tokens in the sequence
- do that by making the dot product matrix negative infinity for the tokens that come after the current token (above the principal diagonal)
- we do this before applying the softmax function as the softmax function will take care of making it 0




ValueError: too many values to unpack (expected 2)

In [82]:
"""
A from scratch implementation of Transformer network,
following the paper Attention is all you need with a
few minor differences. I tried to make it as clear as
possible to understand and also went through the code
on my youtube channel!


"""

import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = self.values(values)  # (N, value_len, embed_size)
        keys = self.keys(keys)  # (N, key_len, embed_size)
        queries = self.queries(query)  # (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out


class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out


class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out


class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out


class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out


class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cpu",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


if __name__ == "__main__":
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    print(device)

    x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(
        device
    )
    trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

    src_pad_idx = 0
    trg_pad_idx = 0
    src_vocab_size = 10
    trg_vocab_size = 10
    model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(
        device
    )
    out = model(x, trg[:, :-1])
    print(out.shape)

cpu
torch.Size([2, 7, 10])


In [65]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Define the Attention Transformer model
class AttentionTransformer(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=16, num_layers=1, num_heads=4, dropout=0.1):
        super(AttentionTransformer, self).__init__()

        self.embedding = nn.Linear(input_size, hidden_size)
        self.positional_encoding = PositionalEncoding(hidden_size)

        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x) * torch.sqrt(torch.tensor(x.size(-1), dtype=torch.float))
        x = self.positional_encoding(x)
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return x

# Define the PositionalEncoding class for adding positional encodings
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0).transpose(0, 1)

    def forward(self, x):
        return x + self.encoding[:x.size(0), :]

# Create TensorDatasets for training and testing data
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Set batch size and create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Instantiate the AttentionTransformer model
input_size = X.shape[1]  # Number of features in X
output_size = 3  # Number of unique classes in y
model = AttentionTransformer(input_size=input_size, output_size=output_size)

# Set loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

In [68]:
model.train()

optimizer.zero_grad()

outputs = model(batch_X)

outputs.shape

# for epoch in range(num_epochs):
#     model.train()
#     for batch_X, batch_y in train_loader:
#         # Zero the gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(batch_X)

#         outputs.shape
#         # Calculate the loss
#         loss = criterion(outputs, batch_y)

#         # Backward pass and optimization
#         loss.backward()
#         optimizer.step()

#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# # Evaluation on the test set
# model.eval()
# with torch.no_grad():
#     total_correct = 0
#     total_samples = 0
#     for batch_X, batch_y in test_loader:
#         outputs = model(batch_X)
#         _, predicted = torch.max(outputs, 1)
#         total_correct += (predicted == batch_y).sum().item()
#         total_samples += batch_y.size(0)

#     accuracy = total_correct / total_samples * 100
#     print(f"Test Accuracy: {accuracy:.2f}%")


torch.Size([32, 32, 3])

In [16]:
# Training Loop & Testing loop
# Set random seed
torch.manual_seed(42)

# Put data on device
X_train, X_test = X_train.to(device), X_test.to(device)
y_train, y_test = y_train.to(device), y_test.to(device)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model_transformer.parameters(), lr=0.01)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.1, verbose=True)

epochs = 1000

for epoch in range(epochs):
    model_transformer.train()
    
    # Forward pass
    y_logits = model_transformer(X_train)
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1) # logits -> probs -> labels

    # Calculate loss/accuracy
    loss = loss_fn(y_logits, y_train)
    train_acc = accuracy(y_true=y_train,
                         y_pred=y_pred)
    
    optimizer.zero_grad() # reset gradients to zero
    loss.backward() # backpropagation
    optimizer.step() # gradient descent

    model_transformer.eval()
    with torch.no_grad():  # Use torch.no_grad() instead of torch.inference_mode()
        
        test_logits = model_transformer(X_test)
        test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)

        test_loss = loss_fn(test_logits, y_test)
        # Update the learning rate based on the validation loss
        scheduler.step(test_loss)

        test_acc = accuracy(y_true=y_test,
                            y_pred=test_pred)
        
        if (epoch+1) % 100 == 0:
            print(f"Epoch: {epoch+1}, Loss: {loss:.5f}, Train Acc: {train_acc:.2f}%, Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%")


: 

: 