In [2]:
## Importing necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from functools import partial
import plotly.graph_objects as go
from datetime import datetime
import plotly.subplots as ms
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset

### Data Pre-processing and Feature Extraction

In [3]:
## Reading data of Apple stock from a comma-separated .txt file into a pandas dataframe and removing the header 
apple_stock = pd.read_csv('APPL.csv')
# apple_stock = apple_stock.drop(columns=[4]) 

In [4]:
## Separating the date and time using str.split() for better interpretation
# apple_stock.columns = ['Open','High','Low','Close','Volume', 'Date']
# apple_stock[['Date','Time']] = apple_stock['Date and Time'].str.split(' ',1,expand = True)
## Dropping the original column and retaining the newly created columns
# apple_stock = apple_stock.drop('Date and Time',axis=1)
## Displaying the modified dataframe
display(apple_stock)

Unnamed: 0,Date,High,Open,Close,Low,Volume,close_ems_0,close_ems_1,close_ems_2,close_ems_3,close_ems_4,close_ems_5,close_ems_6
0,20100129,22.35,21.59,21.70,21.32,5.284370e+08,0.004981,-0.007888,0.029859,0.030317,-0.030210,0.006188,0.021598
1,20100201,21.77,21.71,21.24,21.00,6.026750e+08,-0.029684,-0.029749,0.009720,0.024786,-0.028706,0.004914,0.024498
2,20100202,21.96,21.26,21.37,21.26,6.204750e+08,-0.011880,-0.017990,-0.015981,-0.009157,0.011664,-0.015653,0.035750
3,20100203,22.58,21.53,22.58,21.24,1.214267e+09,0.023553,-0.003843,-0.010330,0.023523,0.029184,-0.018698,0.032221
4,20100204,22.45,22.33,22.20,22.01,6.752830e+08,-0.024795,-0.033569,-0.006393,-0.023653,-0.019699,-0.020172,-0.003472
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2135,20181224,9.45,9.40,9.42,9.31,4.771869e+08,0.002830,-0.032855,-0.004628,0.012152,0.015378,-0.010386,-0.023502
2136,20181225,9.43,9.29,9.34,9.21,5.452356e+08,-0.003459,0.002878,-0.001050,0.021730,-0.008739,0.003577,0.022280
2137,20181226,9.42,9.35,9.30,9.27,3.932151e+08,-0.009513,0.017691,0.035619,-0.011251,-0.002966,-0.022544,0.019264
2138,20181227,9.49,9.45,9.28,9.28,5.863437e+08,-0.031788,0.007745,0.003324,0.014035,0.022011,-0.023579,0.023065


In [5]:
# ## As we are interested in predicting stock prices for the next 5 days we only require day-wise data and not minute wise data
# # Thus, we group the data by 'Date' setting values for all the columns appropriately
# apple_stock = apple_stock.groupby('Date').agg({
#         'Close' : 'last',
#         'Open' : 'first',
#         'High' : 'max',
#         'Low' : 'min',
#         'Volume' : 'sum'
# }).reset_index()

In [6]:
# apple_stock.columns

In [7]:
## Creating features/indicators that are suitable for financial data and aid in stock price predictions as they help in determining the trends
## Creating a separate column for stochastic oscillator given by (C-L)/(H-L)
apple_stock['Stochastic Oscillator'] = (apple_stock['Close'] - apple_stock['Low'])/(apple_stock['High']-apple_stock['Low'])
## Defining absolute returns as c_t - c_t-1
apple_stock['Absolute Returns'] = apple_stock['Close'] - apple_stock['Close'].shift(1)
## Normalizing opening, closing, high and low prices using the previous days' Close
apple_stock['Close Normalized'] = apple_stock['Close']/apple_stock['Close'].shift(1)
apple_stock['Open Normalized'] = apple_stock['Open']/apple_stock['Close'].shift(1)
apple_stock['High Value Normalized'] = apple_stock['High']/apple_stock['Close'].shift(1)
apple_stock['Low Value Normalized'] = apple_stock['Low']/apple_stock['Close'].shift(1)
## Normalizing volume of stocks traded by a 5-day rolling mean of the volume
apple_stock['Volume Normalized'] =  apple_stock['Volume']/apple_stock['Volume'].shift(1).rolling(window=5).mean()
## Defining volatility by the variance of volume of stocks traded over a window of 9 days
apple_stock['Volatility'] = apple_stock['Volume'].rolling(window=9).var()/1e16
## Removing unnecessary rows and resetting the index
apple_stock.dropna(inplace = True)
apple_stock.reset_index(drop = True, inplace = True)

In [8]:
# ## Defining MACD () as  5 day EMA - 9 day EMA
# ## Defining the periods over which EMA is to be calculated
# period1 = 9
# period2 = 5
# # Calculate the smoothing factor (alpha)
# alpha1 = 2 / (period1 + 1)
# alpha2 = 2/(period2 +1 )
# # Calculate 9-day EMA and 5-day EMA using the pandas `ewm` method
# EMA9day = apple_stock['Close'].ewm(span=period1, adjust=False).mean()
# EMA5day = apple_stock['Close'].ewm(span=period2, adjust=False).mean()
# apple_stock['MACD'] = EMA5day - EMA9day

In [9]:
# ## Defining Gains/Losses for a particular day depending on whether O>C or C>O
# apple_stock['Gains'] = (apple_stock['Close'] - apple_stock['Open'])*100/apple_stock['Open']
# apple_stock['Gains'] = apple_stock['Gains'].apply(lambda x: max(0, x))
# apple_stock['Losses'] = (apple_stock['Open'] - apple_stock['Close'])*100/apple_stock['Open']
# apple_stock['Losses'] = apple_stock['Losses'].apply(lambda x: max(0, x))

In [10]:
# ## Computing the Relative Strength Index (RSI) using the gains/losses which is dependent on average gains and average losses
# rsi_data = []
# ## Defining epsilon to avoid nan related issues (small value)
# eps = 1e-8
# for i in range(9,len(apple_stock)):
#     ## Calculating the RSI gains for the last 9 days
#     rsi_gains = np.array(apple_stock.iloc[i-9:i]['Gains'])
#     ## Calculating average positive gains over the last 9 days
#     average_gains = np.mean(rsi_gains[rsi_gains > 0])
#     average_gains = 0 if np.isnan(average_gains) else average_gains
#     ## Calculating RSI losses for the last 9 days
#     rsi_losses = np.array(apple_stock.iloc[i-9:i]['Losses'])
#     average_losses = np.mean(rsi_losses[rsi_losses > 0])
#     average_losses = 0 if np.isnan(average_losses) else average_losses
#     ## Computing the RSI index
#     den = 1+ average_gains/(average_losses + eps)
#     rsi_data.append(100-(100/den))

# #Removing the first 9 rows, due to rolling mean
# apple_stock = apple_stock.iloc[9:]
# apple_stock['RSI'] = rsi_data

In [11]:
# ## Using periodic features that help to discover repetitive patterns or cycles within the data
# ## These cycles aid in predicting future price movements
# apple_stock.reset_index(inplace = True, drop = True)
# apple_stock['Sine'] = np.sin(2*np.pi/20*pd.DatetimeIndex(data = apple_stock['Date'], yearfirst = True).day)
# apple_stock['Cosine'] = np.cos(2*np.pi/20*pd.DatetimeIndex(data = apple_stock['Date'], yearfirst = True).day)

In [12]:
## Displaying the final dataset with all features
display(apple_stock)

Unnamed: 0,Date,High,Open,Close,Low,Volume,close_ems_0,close_ems_1,close_ems_2,close_ems_3,...,close_ems_5,close_ems_6,Stochastic Oscillator,Absolute Returns,Close Normalized,Open Normalized,High Value Normalized,Low Value Normalized,Volume Normalized,Volatility
0,20100210,22.38,22.20,22.28,22.02,564177024.0,-0.011256,-0.005505,0.019490,0.005371,...,0.027786,-0.013485,0.722222,0.28,1.012727,1.009091,1.017273,1.000909,0.810291,4.986343
1,20100211,22.35,22.21,22.11,22.08,308844000.0,-0.009037,0.016737,0.033113,-0.012798,...,-0.019664,0.016362,0.111111,-0.17,0.992370,0.996858,1.003142,0.991023,0.545423,6.157216
2,20100212,22.60,22.17,22.45,22.17,422488000.0,0.025628,-0.002779,-0.013408,0.019327,...,-0.017680,0.029322,0.651163,0.34,1.015378,1.002714,1.022162,1.002714,0.857046,6.594753
3,20100222,22.45,22.31,22.10,22.08,367932992.0,0.022317,-0.017556,0.027884,0.017637,...,0.010337,0.020838,0.054054,-0.35,0.984410,0.993764,1.000000,0.983519,0.804835,7.172232
4,20100223,22.05,22.03,21.55,21.40,513376992.0,-0.006957,-0.016416,-0.019102,-0.010108,...,-0.015142,0.029848,0.230769,-0.55,0.975113,0.996833,0.997738,0.968326,1.163296,1.370266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2085,20181224,9.45,9.40,9.42,9.31,477186912.0,0.002830,-0.032855,-0.004628,0.012152,...,-0.010386,-0.023502,0.785714,-0.03,0.996825,0.994709,1.000000,0.985185,0.655370,4.364597
2086,20181225,9.43,9.29,9.34,9.21,545235584.0,-0.003459,0.002878,-0.001050,0.021730,...,0.003577,0.022280,0.590909,-0.08,0.991507,0.986200,1.001062,0.977707,0.771611,3.518071
2087,20181226,9.42,9.35,9.30,9.27,393215136.0,-0.009513,0.017691,0.035619,-0.011251,...,-0.022544,0.019264,0.200000,-0.04,0.995717,1.001071,1.008565,0.992505,0.556776,3.932607
2088,20181227,9.49,9.45,9.28,9.28,586343744.0,-0.031788,0.007745,0.003324,0.014035,...,-0.023579,0.023065,0.000000,-0.02,0.997849,1.016129,1.020430,0.997849,0.881905,3.860181


### Plots

### Sequencer and Dataloader

In [13]:
## Defining a Sequencer which converts the data into a format suitable for training
class StockDataset(Dataset):
    def __init__(self,data,sequence_length,prediction_length):
        ## Initializing the dataframe and the window length i.e. number of previous days which will be used at a time to predict
        ## today's Close
        self.data = data
        self.sequence_length = sequence_length
        self.prediction_length = prediction_length

    def __len__(self):
        ## As it picks indices randomly from [0,len], we keep len =  len(df) - seq_len which denotes the last index which can be
        ## used to create a batch as we need seq_len rows ahead of it
        return len(self.data) - self.sequence_length - self.prediction_length

    def __getitem__(self,index):
        ## Slicing the dataframe from input index to input index + seq_len to get the input data
        input_data = self.data[index : index + self.sequence_length]
        input_list = input_data.values.tolist()
        input = torch.Tensor(input_list)

        ## Returning the Closes of next day as the output for each day in the input
        ## Converting both the input and output to tensors before returning
        output = self.data.loc[index + self.sequence_length : index + self.sequence_length + self.prediction_length-1, 'Close Normalized'].values.tolist()
        output = torch.Tensor(output)

        return input,output

In [14]:
apple_stock.columns

Index(['Date', 'High', 'Open', 'Close', 'Low', 'Volume', 'close_ems_0',
       'close_ems_1', 'close_ems_2', 'close_ems_3', 'close_ems_4',
       'close_ems_5', 'close_ems_6', 'Stochastic Oscillator',
       'Absolute Returns', 'Close Normalized', 'Open Normalized',
       'High Value Normalized', 'Low Value Normalized', 'Volume Normalized',
       'Volatility'],
      dtype='object')

In [15]:
# input_features = ['Close Normalized', 'Open Normalized','High Value Normalized','Low Value Normalized','Volume Normalized']
# input_features = ['Close Normalized', 'Open Normalized','High Value Normalized','Low Value Normalized','Volume Normalized',
#                  'Stochastic Oscillator', 'Absolute Returns','Volatility','MACD','RSI','Sine','Cosine']
input_features = ['Close Normalized', 'Open Normalized','High Value Normalized','Low Value Normalized','Volume Normalized',
                 'close_ems_0', 'close_ems_1','close_ems_2','close_ems_3','close_ems_4','close_ems_5','close_ems_6']
df_app = apple_stock[input_features].copy()

In [16]:
df_app

Unnamed: 0,Close Normalized,Open Normalized,High Value Normalized,Low Value Normalized,Volume Normalized,close_ems_0,close_ems_1,close_ems_2,close_ems_3,close_ems_4,close_ems_5,close_ems_6
0,1.012727,1.009091,1.017273,1.000909,0.810291,-0.011256,-0.005505,0.019490,0.005371,-0.002313,0.027786,-0.013485
1,0.992370,0.996858,1.003142,0.991023,0.545423,-0.009037,0.016737,0.033113,-0.012798,-0.002228,-0.019664,0.016362
2,1.015378,1.002714,1.022162,1.002714,0.857046,0.025628,-0.002779,-0.013408,0.019327,0.031958,-0.017680,0.029322
3,0.984410,0.993764,1.000000,0.983519,0.804835,0.022317,-0.017556,0.027884,0.017637,-0.021207,0.010337,0.020838
4,0.975113,0.996833,0.997738,0.968326,1.163296,-0.006957,-0.016416,-0.019102,-0.010108,0.013027,-0.015142,0.029848
...,...,...,...,...,...,...,...,...,...,...,...,...
2085,0.996825,0.994709,1.000000,0.985185,0.655370,0.002830,-0.032855,-0.004628,0.012152,0.015378,-0.010386,-0.023502
2086,0.991507,0.986200,1.001062,0.977707,0.771611,-0.003459,0.002878,-0.001050,0.021730,-0.008739,0.003577,0.022280
2087,0.995717,1.001071,1.008565,0.992505,0.556776,-0.009513,0.017691,0.035619,-0.011251,-0.002966,-0.022544,0.019264
2088,0.997849,1.016129,1.020430,0.997849,0.881905,-0.031788,0.007745,0.003324,0.014035,0.022011,-0.023579,0.023065


In [17]:
sequence_length = 12
prediction_length = 5
sequenced_data = StockDataset(df_app,sequence_length,prediction_length)

In [18]:
type(sequenced_data)

__main__.StockDataset

In [19]:
#Splitting the data to 80% Training, 10% Validaiton and 10% Testing
split=0.8
#Splitting the indices of the sequences, so as to maintain order of time series
indices = list(range(len(sequenced_data)))

#splitting the indices according to the decided split
train_indices, test_indices = train_test_split(indices, train_size=split, shuffle=False)
val_indices, test_indices = train_test_split(test_indices, train_size=0.5, shuffle=False)

# Create the training , validation and test datasets
train_dataset = torch.utils.data.Subset(sequenced_data, train_indices)
val_dataset= torch.utils.data.Subset(sequenced_data, val_indices)
test_dataset = torch.utils.data.Subset(sequenced_data, test_indices)
train_size=len(train_dataset)
test_size=len(val_dataset)
val_size=len(test_dataset)
print(f"Train Size: {len(train_dataset)}/{len(sequenced_data)}")
print(f"Validation Size: {len(val_dataset)}/{len(sequenced_data)}")
print(f"Test Size: {len(test_dataset)}/{len(sequenced_data)}")

Train Size: 1658/2073
Validation Size: 207/2073
Test Size: 208/2073


In [20]:
train_dataloader=DataLoader(train_dataset,batch_size=16,shuffle=False)
val_dataloader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_dataloader=DataLoader(test_dataset,batch_size=16,shuffle=False)
entire_dataloader=DataLoader(sequenced_data,batch_size=16,shuffle=False)

### Model:
#### Diffusion Net

In [21]:
## Defining the class for diffusion module which keeps on adding gaussian noise with a fixed variance schedule to both input as wel as output
class DiffusionProcess(nn.Module):
    def __init__(self, num_diff_steps, vae, beta_start, beta_end, scale):
        super().__init__()
        to_torch = partial(torch.tensor, dtype = torch.float32)
        ## Initializing variables like number of time stamps, the Hierarchial VAE to make predictions, start and end values
        ## for beta, which governs the variance schedule
        self.num_diff_steps = num_diff_steps
        self.vae = vae
        self.beta_start = beta_start
        self.beta_end = beta_end
        ## Defining a linearly varying variance schedule for the conditional noise at every timestamp
        betas = np.linspace(beta_start, beta_end,  num_diff_steps, dtype = np.float32)

        ## Performing reparametrization to calculate output at time t directly using x_start
        alphas = 1 - betas
        alphas_target = 1 - betas*scale
        ## Computing the cumulative product for the input as well as output noise schedule
        alphas_cumprod = np.cumprod(alphas, axis = 0)
        alphas_target_cumprod = np.cumprod(alphas_target, axis = 0)

        ## Converting all the computed quantities to tensors and detaching them from the computation graph (setting requires_grad to False)
        betas = torch.tensor(betas, requires_grad = False)
        alphas_cumprod = torch.tensor(alphas_cumprod, requires_grad = False)
        alphas_target_cumprod = torch.tensor(alphas_target_cumprod, requires_grad = False)

        ## Computing scaling factors for mean and variance respectively
        self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod).detach().requires_grad_(False)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1 - alphas_cumprod).detach().requires_grad_(False)
        self.sqrt_alphas_target_cumprod = torch.sqrt(alphas_target_cumprod).detach().requires_grad_(False)
        self.sqrt_one_minus_alphas_target_cumprod = torch.sqrt(1 - alphas_target_cumprod).detach().requires_grad_(False)

    ## Defining the forward pass
    def diffuse(self, x_start, y_target, timestamp):
        ## Generating a random noise vector sampled from a standard normal of the size x_start and y_target respectively
        noise = torch.randn_like(x_start)
        noise_target = torch.randn_like(y_target)

        ## Computing the sampled value using the reparametrization trick and using that to calculate x_noisy and y_noisy
        x_noisy = self.sqrt_alphas_cumprod[timestamp - 1]*x_start + self.sqrt_one_minus_alphas_cumprod[timestamp - 1]*noise
        y_noisy = self.sqrt_alphas_target_cumprod[timestamp - 1]*y_target + self.sqrt_one_minus_alphas_target_cumprod[timestamp - 1]*noise_target

        ## Performing a forward pass through the Hierarchial VAE to generate noisy predictions
        output = self.vae(x_noisy)
        return output, y_noisy

### Blocks required for Encoder and Decoder

In [22]:
import torch.nn.init as init

## Initializing weights using Xavier Initialization
def init_weights(layer):
    init.xavier_uniform_(layer.weight)
    layer_name=layer._class.name_
    if layer.find("Conv")!=-1:
        layer.weight.data.normal_(0.0,0.25)
    elif layer.find("BatchNorm")!=-1:
        layer.weight.data.normal(1.00,0.25)
        layer.bias.data.fill_(0.00)

## Defining a custom Conv2D class with the padding size such that the input size and output size remain the same
class Conv2D(nn.Module):
    def __init__(self,input_dim,output_dim,kernel_size,stride):
        super(Conv2D,self).__init__()
        ## Required padding size = kernel_size - 1/2
        padding=int((kernel_size-1)/2)
        self.layer=nn.Conv2d(input_dim,output_dim,kernel_size,stride=stride,padding=padding,bias=True)
    ## Performing the forward pass
    def forward(self, input):
        # Print the shape of the input tensor
        print(f"Input Tensor Shape conv2d: {input.shape}")
        
        # Perform the convolution
        output = self.layer(input)
        
        # Print the shape of the output tensor
        print(f"Output Tensor Shape: {output.shape}")
        
        return output

## Defining the module for Swish Activation or Sigmoid Linear Unit
class Swish(nn.Module):
    def __init__(self):
        super(Swish,self).__init__()
        self.layer=nn.SiLU()
    def forward(self,input):
        return self.layer(input)

## Performing Batch Normalization by inherting it from torch.nn
class BatchNorm(nn.Module):
    def __init__(self,batch_dim,size):
        super(BatchNorm,self).__init__()
        ## Equivalent to BatchNorm as first dimension is batch_size
        self.layer=nn.LayerNorm([batch_dim,size,size])

    def forward(self,input):
        return self.layer(input)
        
class SE(nn.Module):
    def __init__(self,channels_in,channels_out):
        super(SE,self).__init__()
        ## Defining number of units to be compressed into
        num_hidden=max(channels_out//16,4)
        
        ## Defining the network which compresses and expands to focus on features rather than noise
        ## 2 networks req as 2 different input output dimensions are present in the Hierarchial VAE
        self.se=nn.Sequential(nn.Linear(1,num_hidden),nn.ReLU(inplace=True),
                                nn.Linear(num_hidden, 144), nn.Sigmoid())
        self.se2=nn.Sequential(nn.Linear(1,num_hidden),nn.ReLU(inplace=True),
                                nn.Linear(num_hidden, 36), nn.Sigmoid())

    def forward(self, input):
        # Print the shape of the input tensor
        print(f"Input Tensor Shape se: {input.shape}")
        
        # Getting compressed vector
        se = torch.mean(input, dim=[1, 2])
        # Print the shape after mean operation
        print(f"Shape after mean operation: {se.shape}")
        
        # Flattening out the layer
        se = se.view(se.size(0), -1)
        # Print the shape after flattening
        print(f"Shape after flattening: {se.shape}")
        
        # Apply the appropriate SE network based on the number of input channels
        if input.size(1) == 12:
            se = self.se(se)
            se = se.view(se.size(0), 12, 12)
            # Print the shape after applying self.se and reshaping
            print(f"Shape after self.se and reshaping: {se.shape}")
        else:
            se = self.se2(se)
            se = se.view(se.size(0), 6, 6)
            # Print the shape after applying self.se2 and reshaping
            print(f"Shape after self.se2 and reshaping: {se.shape}")
        
        # Return the scaled input
        output = input * se
        # Print the shape of the output tensor
        print(f"Output Tensor Shape: {output.shape}")
        
        return output

## Performing pooling for downsampling using nn.AvgPool2D and using a kernel of size 2 to ensure that output size is halved
class Pooling(nn.Module):
    def __init__(self):
        super(Pooling,self).__init__()
        ## Using a 2x2 kernel and a stride of 2 in both directions
        self.mean_pool = nn.AvgPool2d(kernel_size=(2, 2),padding=0,stride=(2,2))
    def forward(self,input):
        return self.mean_pool(input)

## Defining a class to compute square of a quantity
class Square(nn.Module):
    def __init__(self):
        super(Square,self).__init__()
        pass
    def forward(self,input):
        return input**2

### Residual Encoder

In [23]:
## Defining the encoder block to be used in Hierarchial VAE to convert to input into its latent space representation
class Encoder_Block(nn.Module):
    def __init__(self,input_dim,size,output_dim):
        super().__init__()
        ## Initializing the in and out dimensions of the conv layers and SE block
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.size = size
        ## Defining the encoder layers i.e 2 Conv2D layers followed by Batch Normalization, a Conv2D layer of kernel size 1 and Squeeze and excitation at the end
        self.seq=nn.Sequential(Conv2D(input_dim,input_dim,kernel_size=5,stride=1),
                               Conv2D(input_dim,input_dim,kernel_size=1,stride=1),
                               BatchNorm(input_dim,size),Swish(),
                               Conv2D(input_dim,input_dim,kernel_size=3,stride=1),
                               SE(input_dim,output_dim))
    def forward(self,input):
        ## Computing the final output as the sum of scaled encoded output and original input (result of skip connection i.e. residual encoder)
        return input +0.1*self.seq(input)

### Residual Decoder

In [24]:
## Defining the decoder to be used in Hierarchial VAE to convert from latent space representations to noisy outputs
class Decoder_Block(nn.Module):
    def __init__(self,dim,size,output_dim):
        super().__init__()
        ## Defining the decoder net which comprises of Conv2D layers, BatchNorm and SE Blocks
        ## We ensure that the dimension of the input and output stays the same at all instants as down/up sampling is done in a separate block
        self.seq = nn.Sequential(
            BatchNorm(dim,size),
            Conv2D(dim,dim,kernel_size=1,stride=1),
            BatchNorm(dim,size), Swish(),
            Conv2D(dim,dim, kernel_size=5, stride=1),
            BatchNorm(dim,size), Swish(),
            Conv2D(dim, dim, kernel_size=1, stride = 1),
            BatchNorm(dim,size),
            ## SE Block just compresses and expands which allows it to ignore noise and focus on actual indicators
            SE(dim,output_dim))
    ## Computing the final output similar to encoder taking into account the skip connection
    def forward(self,input):
        return input+0.1*self.seq(input)

### Hierarchial Variational Autoencoder

In [25]:
## Defining the class for the Hierarchial VAE which takes as input various hyperparameters and the classes for encoder and decoder blocks
class HierarchialVAE(nn.Module):
    def __init__(self, Encoder_Block, Decoder_Block, latent_dim2, latent_dim1, feature_size2, 
                 feature_size1, hidden_size, pred_length, num_features, seq_length, batch_size):
        super().__init__()
        ## Initializing the encoder at the beginning when x_start has 12 features
        self.Encoder1 = Encoder_Block(input_dim = batch_size, output_dim = batch_size, size = 12)
        ## Initializing the encoder reqd after downsampling when input has 6 features 
        self.Encoder2 = Encoder_Block(input_dim = batch_size, output_dim = batch_size, size = 6)
        ## Initializing the decoder reqd after upsampling which gives y_noisy at the output
        self.Decoder1 = Decoder_Block(dim = batch_size,size = 12,output_dim = batch_size)
        ## Initializing the first decoder which obtains an input of size batchx6x6
        self.Decoder2 = Decoder_Block(dim = batch_size,size = 6,output_dim = batch_size)
        
        ## Initializing dimensions of both latent vectors, feature size of both the intermediate feature maps 
        self.latent_dim2 = latent_dim2
        self.latent_dim1 = latent_dim1
        self.feature_size2 = feature_size2
        self.feature_size1 = feature_size1
        ## Initializing the initial hidden state with a tensor of zeros with dimension equal to that of the final latent vector
        self.hidden_size = hidden_size
        self.hidden_state = torch.zeros(self.latent_dim1)
        ## Initializing batch_size
        self.batch_size= batch_size
        
        ## Defining the upsampling blocks required at 2 different stages in the entire network (2 networks reqd as size of input feature map varies throughout the network)
        self.upsample1 = nn.Upsample(size=(6, 6), mode='bilinear', align_corners=False)
        self.upsample2 = nn.Upsample(size=(12, 12), mode='bilinear', align_corners=False)
        ## Defining linear layers that map flattened feature maps to latent space dimensions and vice versa
        self.fc12 = nn.Linear(feature_size2,2*latent_dim2)
        self.fc11 = nn.Linear(feature_size1,2*latent_dim1)
        self.fc22 = nn.Linear(latent_dim2, feature_size2)
        self.fc21 = nn.Linear(latent_dim1, feature_size1)
        ## Defining pooling layer for downsampling
        self.mean_pool = nn.AvgPool2d(kernel_size=(2, 2),padding=0,stride=(2,2))
        ## The final linear layer which maps the VAE output to the output dimension
        self.fc_final = nn.Linear(num_features*seq_length, pred_length)

        
    
    def forward(self,x_start):
        print("Vae x_start")
        print(x_start.shape)
        ## We pass the input through two encoder blocks followed by pooling which reduces the feature map size to 6x6
        out = self.Encoder1(x_start)
        print(out.shape)
        print("end1")
        out = self.Encoder1(out)
        print(out.shape)
        print("end1")
        out = self.mean_pool(out)
        print(out.shape)
        print("end1")
        ## Reshaping the feature map and storing as it is required for sampling 
        feature_map2 = out.view(out.size(0),6,6)
        print(feature_map2.shape)
        print("end1")
        ## Encoding and Pooling the output once again which reduces the feature map size to 3x3 
        out = self.Encoder2(out)
        print(out.shape)
        print("end1")
        out = self.mean_pool(out)
        print(out.shape)
        print("end1")
        ## Flattening the final feature map and passing it through the linear layer which maps it to a latent vector of 
        ## dimension 4 (latent vector is dimension 2, but we predict both the mean and variances)
        feature_map1 = out.view(out.size(0),-1)
        print(feature_map1.shape)
        print("end1")
        z1 = self.fc11(feature_map1)
        print(z1.shape)
        print("end1")
        ## Randomly sampling noise from a standard normal
        noise1 = torch.randn((out.size(0),self.latent_dim1))
        print(noise1.shape)
        ## Applying the reparametrization trick to get the sampled value
        sampled_z1 = self.reparametrize(noise1,z1)
        print(sampled_z1.shape)
        print("end1")
        ## Adding the initial hidden vector to the sampled output and converting it back to 3x3 feature map using a linear layer
        out = sampled_z1 + self.hidden_state
        print(out.shape)
        print("end2")
        out = self.fc21(out)
        print(out.shape)
        print("end2")
        out = out.view(out.size(0),3,3)
        print(out.shape)
        print("end2")
        ## Upsampling to dimension 6x6
        out = self.upsample1(out.unsqueeze(0)).squeeze(0)
        print(out.shape)
        print("end2")
        ## Passing it through the decoder and combining it with feature map 2 to sample from the 2nd latent vector
        out = self.Decoder2(out)
        print(out.shape)
        print("end2")
        ## Maps to a dimension of 10 after flattening the vector which means means and variances of a latent vector of dim = 5
        z_decoder = (feature_map2 + out).view(out.size(0),-1)
        print(z_decoder.shape)
        print("end2")
        z2 = self.fc12(z_decoder)
        print(z2.shape)
        print("end2")
        ## In a similar fashion, we get the sampled value from z2
        noise2 = torch.randn((out.size(0),self.latent_dim2))
        sampled_z2 = self.reparametrize(noise2,z2)
        ## We convert it back to dim = 36 using a linear layer followed by reshaping it to 6x6
        z2_upsampled = self.fc22(sampled_z2).view(out.size(0),6,6)
        ## Upsampling to the original dimension of 12x12
        out = out + z2_upsampled
        out = self.upsample2(out.unsqueeze(0)).squeeze(0)
        out = self.Decoder1(out)
        out = self.Decoder1(out)
        ## Passing it through the final linear layer to map it to the shape of output
        out = self.fc_final(out.view(out.size(0),-1))
        print("end")
        # raise RuntimeError("Stop before return out")
        return out
        
    def reparametrize(self,noise,z):
        ## Getting the batch_size
        zsize=int(z.size(1))
        ## Initializing tensors for mean and variances
        sampled_z = torch.zeros((noise.size(0),zsize//2))
        mu=torch.zeros((noise.size(0),zsize//2))
        sig=torch.zeros((noise.size(0),zsize//2))
        for i in range(0,zsize//2):
            mu[:,i]=z[:,i]
            sig[:,i]=z[:,zsize//2+i]
            ## Computing the sampled value
            sampled_z[:,i]=mu[:,i] + noise[:,i]*sig[:,i]
        return sampled_z
        

In [26]:
## Defining the network for denoising score matching
class Denoise_net(nn.Module):
    def __init__(self,in_channels,dim,size,number=5):
        super().__init__()
        ## 2*number is number of diffusion samples used for denoise calculation
        ## Initializing the input dimension (actually prediction length in this case)
        hw = size
        self.dim=dim
        ## Number of input channels (batched mapping)
        self.channels=in_channels
        ## Defining the network for energy calculation
        self.conv=Conv2D(self.channels,dim,3,1)
        self.conv1=Conv2D(dim,dim,3,1)
        self.relu1=nn.ELU()
        self.pool1=Pooling()
        self.conv2=Conv2D(dim,dim,3,1)
        self.relu2=nn.ELU()
        self.conv3=Conv2D(dim,dim,3,1)
        self.relu3=nn.ELU()
        ## Getting interaction energy and self energy component field terms
        self.f1=nn.Linear((int(hw/2)*number),1)
        self.f2=nn.Linear((int(hw/2)*number),1)
        self.fq=nn.Linear((int(hw/2)*number),1)
        self.square=Square()

    def forward(self,input):
        output=self.conv(input)
        output1=self.conv1(output)
        output2=self.relu1(output1)
        ## Resnet type output computation for stable gradient flow
        output2=output2+output1
        ## Pooling to increase the receptive field
        output3=self.pool1(output2)
        output4=self.conv2(output3)
        output5=self.relu2(output4)
        output5=output5+output4
        output7=self.conv3(output5)
        output8=self.relu3(output7)
        l1=self.f1(output8.view(input.size(0),-1))
        l2=self.f2(output8.view(input.size(0),-1))
        lq=self.fq(self.square(output8.view(input.size(0),-1)))
        ## Getting gradient of energy term per sample (gradient of energy term is what we are concerned with)
        out=l1*l2 +lq
        out=out.view(-1)
        return out

In [27]:
batch_size = 16
## Initializing the VAE and Diffusion block with appropriate hyperparameters
VAE = HierarchialVAE(Encoder_Block = Encoder_Block, Decoder_Block = Decoder_Block , latent_dim2 = 5, latent_dim1 = 2, feature_size2 = 36,
                 feature_size1 = 9, hidden_size = 2, pred_length = 5, num_features = 12, seq_length = 12, batch_size=16)
Diffusion_Process = DiffusionProcess(num_diff_steps = 10, vae = VAE, beta_start = 0.01, beta_end = 0.1, scale = 0.5)


In [28]:
## Initializing the Denoising network with appropriate hyperparameters
Denoise_Net = Denoise_net(in_channels = 16,dim = 16, size = 5)

In [29]:
## Defining the MSE Loss and optimizers for parameters of the VAE and denoising net
from torch.optim.lr_scheduler import StepLR,CosineAnnealingLR
criterion=nn.MSELoss()
optimizer1=optim.Adam(VAE.parameters(),lr=3e-3)
optimizer2=optim.Adam(Denoise_Net.parameters(),lr=3e-3)
## Using Step learning rate scheduler for both the optimizers to ensure stable convergence
scheduler1= StepLR(optimizer1, step_size=2, gamma=0.5)
scheduler2 = StepLR(optimizer2, step_size=2, gamma=0.5)


In [None]:
## Defining the training loop with number of epochs, VAE and Dnet's and dataloaders as the inputs
def train(epochs,train_dataloader,val_dataloader,VAE,dnet,num_diff_steps):

    ## List for accumulating training and validation losses
    train_loss=[]
    val_loss=[]

    ## Iterating over number of epochs
    for epoch in range(0,epochs):

        total_loss=0
        ## Setting the both the models into training mode
        VAE.train()
        dnet.train()
        for i,(x,y) in enumerate(train_dataloader):
            if(x.size(0)!=16):
                break
            ## Initializing the VAE and diffusion outputs
            vae_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))
            diff_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))

            ## For number of diffusion timestamps..
            for time in range(1,num_diff_steps + 1):
                ## We compute the diffused target as well as target predicted by the VAE
                output, y_noisy = Diffusion_Process.diffuse(x,y,time)
                vae_out[:,:,time-1] = output
                diff_out[:,:,time-1] = y_noisy
            ## To get a approximate distribution of the outputs of the VAE and those produced by the diffusion net by adding noise
            ## we use the mean and variances of all outputs of all timestamps (assuming the distribution to be normal)
            mean_vae = torch.mean(vae_out, dim = 2)
            mean_diff = torch.mean(diff_out, dim = 2)
            var_vae = torch.std(vae_out, dim = 2)
            var_diff = torch.std(diff_out, dim = 2)
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            ## Computing the MSE loss between mean values of both the outputs
            mse_loss = criterion(mean_vae, mean_diff)
            ## Computing the KL divergence between both the distributions
            ## We used the standard formula for KL divergence between 2 multivariate gaussians
            term1 = (mean_vae - mean_diff) / var_diff
            term2 = var_vae / var_diff
            kl_loss =  0.5 * ((term1 * term1).sum() + (term2 * term2).sum()) - 40 - torch.log(term2).sum()
            kl_loss = kl_loss.sum()

            ran=torch.randint(low=1,high=num_diff_steps + 1,size=(1,))
            y_nn=vae_out[:,:,:]

            E = Denoise_Net(y_nn).sum()
            grad_x = torch.autograd.grad(E, y_nn, create_graph=True)[0]
            dsm_loss = torch.mean(torch.sum((y.unsqueeze(2)-y_nn+grad_x*1)**2, [0,1,2])).float()
            ## Combining all the 3 losses with appropriate weights which are hyperparameters
            loss = 4*mse_loss+0.01*kl_loss+ 0.1*dsm_loss
            total_loss+=loss
            ## Performing backpropogation and gradient descent
            loss.backward()
            optimizer1.step()
            optimizer2.step()

        ## Updating the learning rate of both the optimizers according to the specified schedule
        scheduler1.step()
        scheduler2.step()

        totalval_loss=0

        ## Setting the model to evaluation mode
        VAE.eval()
        dnet.eval()
        for i,(x,y) in enumerate(val_dataloader):
            if(x.size(0)!=16):
                break
            ## Initializing the VAE and diffusion outputs
            vae_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))
            diff_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))

            ## Performing forward pass and computing the losses in a fashion similar to training
            for time in range(1,num_diff_steps + 1):
                output, y_noisy = Diffusion_Process.diffuse(x,y,time)
                vae_out[:,:,time-1] = output
                diff_out[:,:,time-1] = y_noisy
            mean_vae = torch.mean(vae_out, dim = 2)
            mean_diff = torch.mean(diff_out, dim = 2)
            var_vae = torch.std(vae_out, dim = 2)
            var_diff = torch.std(diff_out, dim = 2)
            mse_loss = criterion(mean_vae, mean_diff)
            term1 = (mean_vae - mean_diff) / var_diff
            term2 = var_vae / var_diff
            kl_loss =  0.5 * ((term1 * term1).sum() + (term2 * term2).sum()) - 40 - torch.log(term2).sum()
            kl_loss = kl_loss.sum()
            ran=torch.randint(low=1,high=num_diff_steps + 1,size=(1,))
            y_nn=vae_out[:,:,:]
            E = Denoise_Net(y_nn).sum()
            grad_x = torch.autograd.grad(E, y_nn, create_graph=True)[0]
            dsm_loss = torch.mean(torch.sum((y.unsqueeze(2)-y_nn+grad_x*1)**2, [0,1,2])).float()
            ## Computing the total validation loss
            valloss = 4*mse_loss+0.01*kl_loss+ 0.1*dsm_loss
            totalval_loss+=valloss

        ## Averaging out the training loss over all batches and printing the losses after every epoch
        train_loss.append(total_loss/(len(train_dataloader)))
        val_loss.append(totalval_loss/(len(val_dataloader)))
        print(f"Epoch: {epoch+1}")
        print(f"Training :: Loss:{train_loss[epoch]}")
        print(f"Validation :: Loss:{val_loss[epoch]}")

    return train_loss,val_loss

In [None]:
train_loss,val_loss = train(epochs =20
                             ,train_dataloader = train_dataloader, val_dataloader = val_dataloader, VAE = VAE,dnet = Denoise_Net, num_diff_steps = 10)

Vae x_start
torch.Size([16, 12, 12])
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape se: torch.Size([16, 12, 12])
Shape after mean operation: torch.Size([16])
Shape after flattening: torch.Size([16, 1])
Shape after self.se and reshaping: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
torch.Size([16, 12, 12])
end1
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape conv2d: torch.Size([16, 12, 12])
Output Tensor Shape: torch.Size([16, 12, 12])
Input Tensor Shape se: torch.Size([16, 12, 12])
Shape after mean operation: torch.Size([16])
Shape aft

RuntimeError: Stop before return out

In [None]:
#Detaching the losses in epochs so as to plot
train_loss = [tensor.detach() for tensor in train_loss]
val_loss = [tensor.detach() for tensor in val_loss]
plt.figure(figsize=(11, 8))
#Plotting the loss vs epochs graph
plt.plot((np.arange(2,21,1)),train_loss[1:],label='Validation Loss')
plt.plot((np.arange(2,21,1)),val_loss[1:],label='Training Loss')
plt.title("Loss vs Epochs",fontsize=16)
plt.xlabel('Epochs')
plt.ylabel("Loss (MSE+KL+DSM)",fontsize=16)
plt.legend()
plt.xticks([2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
plt.grid()
plt.show()
#As can be seen the training loss keeps reducing and after certain epochs reaches a saturation point, so does the validation loss

In [None]:
#We define testing function
def test(test_dataloader,VAE,dnet,num_diff_steps):
    #to calculate kl,mse,dsm loss seperately, and get the predicted and target sequences
    totaltest_loss=0
    totalmse_loss=0
    totalkl_loss=0
    totaldsm_loss=0
    predicted_seq=[]
    inp_seq=[]
    target_seq=[]
    for i,(x,y) in enumerate(test_dataloader):
        if(x.size(0)!=16):
            break
        vae_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))
        diff_out = torch.zeros((y.size(0), y.size(1),num_diff_steps))

        #Similar to the training loop
        for time in range(1,num_diff_steps + 1):
            output, y_noisy = Diffusion_Process.diffuse(x,y,time)
            vae_out[:,:,time-1] = output
            diff_out[:,:,time-1] = y_noisy
        mean_vae = torch.mean(vae_out, dim = 2)
        mean_diff = torch.mean(diff_out, dim = 2)
        var_vae = torch.std(vae_out, dim = 2)
        var_diff = torch.std(diff_out, dim = 2)
        mse_loss = criterion(mean_vae, mean_diff)
        term1 = (mean_vae - mean_diff) / var_diff
        term2 = var_vae / var_diff
        kl_loss =  0.5 * ((term1 * term1).sum() + (term2 * term2).sum()) - 40 - torch.log(term2).sum()
        kl_loss = kl_loss.sum()
        ran=torch.randint(low=1,high=num_diff_steps + 1,size=(1,))
        y_nn=vae_out[:,:,:]
        E = Denoise_Net(y_nn).sum()
        grad_x = torch.autograd.grad(E, y_nn, create_graph=True)[0]
        dsm_loss = torch.mean(torch.sum((y.unsqueeze(2)-y_nn+grad_x*1)**2, [0,1,2])).float()
        testloss = 4*mse_loss+0.01*kl_loss+ 0.1*dsm_loss
        totalmse_loss+=4*mse_loss
        totalkl_loss+=0.01*kl_loss
        totaldsm_loss+=0.1*dsm_loss
        totaltest_loss+=testloss
          #The predicted sequence will be nothing but the mean of vae - of the scaled gradient
        inp_seq.append(x)
        predicted_seq.append(mean_vae - 0.1*torch.mean(grad_x,dim=2))
        target_seq.append(y)
        #Computing and printing all of the losses
    avg_test_loss=totaltest_loss/(len(test_dataloader))
    avg_mse_loss=totalmse_loss/(len(test_dataloader))
    avg_kl_loss=totalkl_loss/(len(test_dataloader))
    avg_dsm_loss=totaldsm_loss/(len(test_dataloader))
    print(f"Test total Loss : {avg_test_loss}")
    print(f"Test MSE Loss : {avg_mse_loss}")
    print(f"Test KL Loss : {avg_kl_loss}")
    print(f"Test DSM Loss : {avg_dsm_loss}")
    return inp_seq,predicted_seq,target_seq,avg_test_loss,avg_mse_loss,avg_kl_loss,avg_dsm_loss


In [None]:
#Testing the model based on losses on the testing dataset
_,_,_,loss,mse,kl,dsm = test(test_dataloader,VAE,Denoise_net,10)

In [None]:
#Predicting sequences for the entire dataset
inp,pred,tar,_,_,_,_ = test(entire_dataloader,VAE,Denoise_net,10)

In [None]:
#Separting all of the sequences so as to remove the batch dimension
target_sequence=[]
for i in range (0,len(tar)):
    for j in range(0,16):
        target_sequence.append(tar[i][j])
pred_sequence=[]
for i in range(0,len(pred)):
    for j in range(0,16):
        pred_sequence.append(pred[i][j])

In [None]:
#Joining all of the sequences so as to create one list of predicted stock prices,
#we got these for all the sequences,
#we took every 5th predicted sequence to construct the plot
tarcont_seq=[]
for i in range(0,len(target_sequence)):
    if(i%5==0):
        tarcont_seq.append(target_sequence[i])
tarcont_seq = [item.item() for sublist in tarcont_seq for item in sublist]
predcont_seq=[]
for i in range(0,len(pred_sequence)):
    if(i%5==0):
        predcont_seq.append(pred_sequence[i])
predcont_seq = [item.item() for sublist in predcont_seq for item in sublist]

In [None]:
print(len(tarcont_seq))
print(len(predcont_seq))


In [None]:
#Denormalizing the sequences by multiplying with shifted Closes
denorm_tar=apple_stock['Close'][6+5:2051+15]*tarcont_seq
denorm_pred=apple_stock['Close'][6+5:2051+15]*predcont_seq


In [None]:
#Plotting Predicted vs actual for the training dataset
plt.figure(figsize=(14,8))
plt.plot(np.arange(0,len(denorm_tar[0:2048]),1),denorm_tar[0:2048],label='Target')
plt.plot(np.arange(0,len(denorm_pred[0:2048]),1),denorm_pred[0:2048],label='Predicted')
plt.grid()
plt.legend()
plt.title('Actual vs Predicted Stock Price (Training Data)',fontsize=16)
plt.xlabel('Days',fontsize=16)
plt.ylabel('Stock Price (Denormalized)',fontsize=16)
plt.plot()

#Plotting Predicted vs actual for the validation dataset
plt.figure(figsize=(14,8))
plt.plot(np.arange(0,len(denorm_tar[2048:2048+256]),1),denorm_tar[2048:2048+256],label='Target')
plt.plot(np.arange(0,len(denorm_pred[2048:2048+256]),1),denorm_pred[2048:2048+256],label='Predicted')
plt.grid()
plt.legend()
plt.title('Actual vs Predicted Stock Price (Validation Data)',fontsize=16)
plt.xlabel('Days',fontsize=16)
plt.ylabel('Stock Price (Denormalized)',fontsize=16)
plt.plot()

#Plotting Predicted vs actual for the testing dataset
plt.figure(figsize=(14,8))
plt.plot(np.arange(0,len(denorm_tar[2048+256:]),1),denorm_tar[2048+256:],label='Target')
plt.plot(np.arange(0,len(denorm_pred[2048+256:]),1),denorm_pred[2048+256:],label='Predicted')
plt.grid()
plt.legend()
plt.title('Actual vs Predicted Stock Price (Test Data)',fontsize=16)
plt.xlabel('Days',fontsize=16)
plt.ylabel('Stock Price (Denormalized)',fontsize=16)
plt.plot()


In [None]:
# Assuming denorm_tar and denorm_pred are NumPy arrays containing target and predicted values

# Calculate MAPE for training data
mape_train = np.mean(np.abs((denorm_tar[0:2048] - denorm_pred[0:2048]) / denorm_tar[0:2048])) * 100
print(f'MAPE for Training Data: {mape_train}%')

# Calculate MAPE for validation data
mape_val = np.mean(np.abs((denorm_tar[2048:2048+256] - denorm_pred[2048:2048+256]) / denorm_tar[2048:2048+256])) * 100
print(f'MAPE for Validation Data: {mape_val}%')

# Calculate MAPE for test data
mape_test = np.mean(np.abs((denorm_tar[2048+256:] - denorm_pred[2048+256:]) / denorm_tar[2048+256:])) * 100
print(f'MAPE for Test Data: {mape_test}%')


In [None]:
#As can be seen our model predicts the stock prices for the next 5 days quiet appreciably!
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming denorm_tar and denorm_pred are NumPy arrays containing target and predicted values

# Calculate RMSE for training data
rmse_train = np.sqrt(mean_squared_error(denorm_tar[0:2048], denorm_pred[0:2048]))
print(f'RMSE for Training Data: {rmse_train}')

# Calculate RMSE for validation data
rmse_val = np.sqrt(mean_squared_error(denorm_tar[2048:2048+256], denorm_pred[2048:2048+256]))
print(f'RMSE for Validation Data: {rmse_val}')

# Calculate RMSE for test data
rmse_test = np.sqrt(mean_squared_error(denorm_tar[2048+256:], denorm_pred[2048+256:]))
print(f'RMSE for Test Data: {rmse_test}')
