In [104]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import time
import logging
from torch.cuda.amp import GradScaler, autocast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_float32_matmul_precision('high')

import torch._dynamo
torch._dynamo.config.verbose = True
torch._dynamo.config.suppress_errors = True

In [105]:
input_file = r'D:\School\ADMU\4Y\SEM 1\MATH 199.11\Final\input_gwap_luz.csv'
data = pd.read_csv(input_file)

data = data[['GWAP', 'FLOW_LUZ','GWAP_DR','GWAP_FR','GWAP_RD','GWAP_RU']]  # Select the relevant time series column



scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data.values)

columns = ['GWAP', 'FLOW_LUZ','GWAP_DR','GWAP_FR','GWAP_RD','GWAP_RU']
        
X  = data[columns].values
y = data['GWAP'].values.reshape(-1,1)


In [106]:
class TimeSeriesDataset(Dataset):
    
    def __init__(self, X, y, seq_len):
        self.X = torch.tensor(X).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len
    
    def __len__(self):
        return len(self.X) - self.seq_len + 1
        
    def __getitem__(self, idx):
        return (self.X[idx:idx+self.seq_len], self.y[idx+self.seq_len-1])

In [107]:
train_size = int(0.7 * len(data))  # 70% for training
val_size = int(0.15 * len(data))   # 15% for validation
test_size = len(data) - train_size - val_size  # Remaining 15% for testing

train_data = X[:train_size]
train_labels = y[:train_size]

val_data = X[train_size:train_size + val_size]
val_labels = y[train_size:train_size + val_size]

test_data = X[train_size + val_size:]
test_labels = y[train_size + val_size:]
seq_len=1440
batch_size=64

train_dataset = TimeSeriesDataset(train_data, train_labels, seq_len)
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=False)

val_dataset = TimeSeriesDataset(val_data, val_labels, seq_len)    
val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False) 

test_dataset = TimeSeriesDataset(test_data, test_labels, seq_len)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)


In [108]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):

    
        # Forward propagate LSTM
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Take the output from the last time step
        return out

In [109]:
class CustomLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.0):
        super(CustomLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        
        # Combined weights for all gates
        self.weight_ih = nn.Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        self.bias = nn.Parameter(torch.Tensor(4 * hidden_size))
        
        self.reset_parameters()
        self.dropout_layer = nn.Dropout(dropout)

    def reset_parameters(self):
        stdv = 1.0 / torch.sqrt(torch.tensor(self.hidden_size, dtype=torch.float))
        for weight in self.parameters():
            nn.init.uniform_(weight, -stdv, stdv)

    def forward(self, input, state):
        hx, cx = state
        hx = self.dropout_layer(hx)  # Apply dropout to the hidden state
        gates = F.linear(input, self.weight_ih, self.bias) + F.linear(hx, self.weight_hh)

        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = F.relu(cellgate)  # Use ReLU here
        outgate = torch.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * F.relu(cy)  # Use ReLU here as well

        return hy, cy

In [110]:
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CustomLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.cell_list = nn.ModuleList([CustomLSTMCell(input_size, hidden_size)])
        self.cell_list.extend([CustomLSTMCell(hidden_size, hidden_size) for _ in range(1, num_layers)])
        
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        hidden = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]
        cell = [torch.zeros(batch_size, self.hidden_size).to(x.device) for _ in range(self.num_layers)]

        for t in range(seq_len):
            for l in range(self.num_layers):
                if l == 0:
                    hidden[l], cell[l] = self.cell_list[l](x[:, t, :], (hidden[l], cell[l]))
                else:
                    hidden[l], cell[l] = self.cell_list[l](hidden[l-1], (hidden[l], cell[l]))

        out = self.fc(hidden[-1])
        return out

In [111]:
class LSTMCustomCell(nn.Module):
    def __init__(self, input_size, hidden_size, activation_fn):
        super(LSTMCustomCell, self).__init__()
        self.hidden_size = hidden_size
        self.activation_fn = activation_fn
        
        # Combine all gate matrices into one large matrix for efficiency
        self.W_ih = nn.Linear(input_size, 4 * hidden_size, bias=False)
        self.W_hh = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(4 * hidden_size))
        
    def forward(self, x, hidden):
        h, c = hidden
        
        # Optimized matrix multiplication and bias addition
        gates = self.W_ih(x) + self.W_hh(h) + self.bias
        
        # Split into 4 gate vectors
        i_gate, f_gate, o_gate, g_gate = torch.chunk(gates, 4, dim=1)
        
        # Sigmoid activations for gates
        i_gate = torch.sigmoid(i_gate)
        f_gate = torch.sigmoid(f_gate)
        o_gate = torch.sigmoid(o_gate)
        
        # Apply the custom activation function for the cell gate
        g_gate = self.activation_fn(g_gate)
        
        # Compute the new cell state
        c_next = f_gate * c + i_gate * g_gate
        
        # Compute the new hidden state using the custom activation function
        h_next = o_gate * self.activation_fn(c_next)
        
        return h_next, c_next

In [112]:
class LSTMCustom(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, activation_fn=torch.tanh, batch_first=False):
        super(LSTMCustom, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.activation_fn = activation_fn
        self.batch_first = batch_first
        
        # Create a list of LSTM cells
        self.cells = nn.ModuleList([LSTMCustomCell(input_size if i == 0 else hidden_size, hidden_size, activation_fn) for i in range(num_layers)])
        
        # Add a fully connected layer to map the hidden size to the output size (1)
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x, hidden=None):
        if self.batch_first:
            batch_size, seq_len, _ = x.size()
            x = x.transpose(0, 1)  # Convert to (seq_len, batch_size, input_size)
        else:
            seq_len, batch_size, _ = x.size()
        
        if hidden is None:
            h = [torch.zeros(batch_size, self.hidden_size, device=x.device) for _ in range(self.num_layers)]
            c = [torch.zeros(batch_size, self.hidden_size, device=x.device) for _ in range(self.num_layers)]
        else:
            h, c = hidden
        
        outputs = []
        
        for t in range(seq_len):
            x_t = x[t, :, :]  # Input at time step t
            for i, cell in enumerate(self.cells):
                h[i], c[i] = cell(x_t, (h[i], c[i]))
                x_t = h[i]  # Pass hidden state to the next layer
            outputs.append(h[-1].unsqueeze(0))  # Collect output from the last layer
        
        # Stack the outputs across time steps
        outputs = torch.cat(outputs, dim=0)  # Shape will be (seq_len, batch_size, hidden_size)
        
        # Apply the fully connected layer to the output of the last time step
        outputs = self.fc(outputs[-1])  # Get the last output
        outputs = outputs.view(-1, 1)  # Pass it through the fully connected layer

        

        # Return outputs and the last hidden and cell states
        return outputs, (torch.stack(h), torch.stack(c))

In [113]:
# Define model parameters
input_size = train_data.shape[1]  # Number of features
hidden_size = 64
output_size = train_labels.shape[1]  # Number of output features
num_layers = 2
#model = LSTMModel(input_size, hidden_size,output_size, num_layers).to(device)
criterion=nn.MSELoss()
activation_fn = torch.relu
model = LSTMCustom(input_size, hidden_size, num_layers, activation_fn).to(device)

In [114]:
#model = torch.compile(model, backend="inductor")

In [115]:
# Initialize the GradScaler
scaler = torch.amp.GradScaler('cuda')
torch.cuda.synchronize()
def train(model, train_dataloader, device, optimizer, criterion):
    model.train()
    total_loss = 0.0  # Initialize total loss to 0

    start_data_time = time.time()
    for i, (inputs, target) in enumerate(train_dataloader):
        print(f"Data loading time: {time.time() - start_data_time:.4f} seconds")
        inputs, target = inputs.to(device), target.to(device)
        print(inputs.shape)
        print(target.shape)
        optimizer.zero_grad()
        
        # Forward pass with mixed precision
        with torch.amp.autocast('cuda'):
            outputs, _ = model(inputs)  # Unpack the output tuple
            print(f"Outputs shape after model: {outputs.shape}")  # Debugging line
            
            # At this point, outputs should be of shape (batch_size, 1)
            print(f"Adjusted outputs shape for loss: {outputs.shape}")  # Should be (128, 1)
            
            # Calculate loss
            loss = criterion(outputs, target)

        # Backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate the loss
        total_loss += loss.item() * inputs.size(0)

    # Return the average loss over all batches
    return total_loss / len(train_dataloader.dataset)

In [116]:
@torch.no_grad()
def evaluate(model, test_dataloader, device, criterion):
    model.eval()
    total_loss = 0.0  # Initialize total loss


    for i, (inputs, target) in enumerate(test_dataloader):  # Use `test_dataloader`
        inputs, target = inputs.to(device), target.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, target)

        # Accumulate the loss
        total_loss += loss.item() * inputs.size(0)

    # Return the average loss over all batches
    
    return total_loss/len(test_dataloader.dataset)


In [117]:
def run(model, train_dataloader, test_dataloader, device, epoch):
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2,weight_decay=1e-3)
    
    for epoch in range(epoch):
         # Profiling the training phase
        with torch.profiler.profile() as prof:
            train_loss = train(model, train_dataloader, device, optimizer, criterion)
        
        # Print the profiling results
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss}")
        loss = train(model, train_dataloader, device, optimizer,criterion)
        
        test_loss = evaluate(model, test_dataloader, device,criterion)
        print(epoch, loss, test_loss)
        if (epoch + 1) == 100 or (epoch + 1) % 20 == 0:
            print(f'Epoch {epoch+1:04d} | loss: {loss:.4f} '
                f'test_loss: {test_loss:.4f} ')
    
        if loss < 1e-3 and test_loss < 1e-3:
            break

In [118]:
epoch=100
run(model, train_dataloader, test_dataloader, device, epoch)

Data loading time: 0.0030 seconds
torch.Size([64, 1440, 6])
torch.Size([64, 1])
Outputs shape after model: torch.Size([1440, 1])
Adjusted outputs shape for loss: torch.Size([1440, 1])


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (1440) must match the size of tensor b (64) at non-singleton dimension 0

In [14]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name()}")

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 3060


In [40]:
import torch
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    def __init__(self, size):
        self.data = torch.randn(size, 10)
        self.labels = torch.randn(size, 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

simple_dataset = SimpleDataset(1000)
simple_dataloader = DataLoader(simple_dataset, batch_size=64, shuffle=True, num_workers=0)

for inputs, targets in simple_dataloader:
    print(inputs.shape, targets.shape)


torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([64, 10]) torch.Size([64, 1])
torch.Size([40, 10]) torch.Size([40, 1])


In [10]:
def generate_sine_wave(seq_length, num_samples):
    x = np.linspace(0, 100, num_samples)
    y = np.sin(x)
    
    sequences = []
    targets = []
    for i in range(len(y) - seq_length):
        sequences.append(y[i:i + seq_length])
        targets.append(y[i + seq_length])
    
    sequences = np.array(sequences)
    targets = np.array(targets)
    
    return torch.tensor(sequences, dtype=torch.float32), torch.tensor(targets, dtype=torch.float32)

seq_length = 10
num_samples = 100
X, y = generate_sine_wave(seq_length, num_samples)

# Reshape to fit LSTM input (seq_len, batch_size, input_size)
X = X.unsqueeze(-1)  # Add an extra dimension for input size (which is 1 for univariate time series)
y = y.unsqueeze(-1)  # Add an extra dimension for target size

# Split into train and test
train_size = int(0.8 * len(X))
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]


In [16]:
# Define the custom model
input_size = 1
hidden_size = 20
num_layers = 2

model = LSTMCustom(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.AdamW(model.parameters(), lr=0.001,weight_decay=1e-4)
model = torch.compile(model, backend="inductor")

# Training loop
num_epochs = 100
batch_size = 16

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    
    # Train in batches
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs, _ = model(X_batch)
        loss = criterion(outputs[:, -1, :], y_batch)  # Compare the last output with the target
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(X_train)}")

# Test the model on the test set
model.eval()
with torch.no_grad():
    test_outputs, _ = model(X_test)
    test_loss = criterion(test_outputs[:, -1, :], y_test)
    print(f"Test Loss: {test_loss.item()}")

Epoch 1/100, Loss: 0.03363672147194544
Epoch 2/100, Loss: 0.03290561379657851
Epoch 3/100, Loss: 0.032128037263949714
Epoch 4/100, Loss: 0.031258573134740196
Epoch 5/100, Loss: 0.030249755829572678
Epoch 6/100, Loss: 0.0290426653292444
Epoch 7/100, Loss: 0.02755696243709988
Epoch 8/100, Loss: 0.025670189410448074
Epoch 9/100, Loss: 0.023176424619224336
Epoch 10/100, Loss: 0.0197226500345601
Epoch 11/100, Loss: 0.014812210988667276
Epoch 12/100, Loss: 0.008450245174268881
Epoch 13/100, Loss: 0.0036447161498169103
Epoch 14/100, Loss: 0.004164617094728682
Epoch 15/100, Loss: 0.0038826227084630066
Epoch 16/100, Loss: 0.002749183422161473
Epoch 17/100, Loss: 0.0026872331897417703
Epoch 18/100, Loss: 0.0025876302065120805
Epoch 19/100, Loss: 0.0022334373659557765
Epoch 20/100, Loss: 0.002003047020278043
Epoch 21/100, Loss: 0.0018587648972041076
Epoch 22/100, Loss: 0.0016525645429889362
Epoch 23/100, Loss: 0.00146198986719052
Epoch 24/100, Loss: 0.0012976009295218522
Epoch 25/100, Loss: 0.001