# Long Short-Term Memory with PyTorch + Lightning

In [30]:
import torch 
import torch.nn as nn
import torch.nn.functional as F 
from torch.optim import Adam 

import lightning as L 
from torch.utils.data import TensorDataset, DataLoader
from lightning.pytorch.loggers import TensorBoardLogger


```python
class LSTMbyHand(L.LightningModule):
    def __init__(self):
        # Create and initialize weight and bias tensors
    def lstm_unit(self, input_value, long_memory, short_memory):
        # Do the LSTM math
    def forward(self, input):
        # Make a forward pass through the unrolled LSTM
    def configure_optimizers(self):
        # Configure the Adam optimizer
    def training_step(self, batch, batch_idx):
        # Calculate the loss and log training progress
```

In [31]:
class LSTMbyHand(L.LightningModule):
    """A single LSTM unit written out by hand with scalar weights and biases.

    Each of the four gate computations owns two weights (one applied to the
    short-term memory, one applied to the current input) and one bias, giving
    12 trainable scalars in total. The unit is unrolled over the input
    sequence in ``forward``.
    """

    def __init__(self):
        super().__init__()
        # Every weight starts as a single draw from a normal distribution with
        # mean 0 and standard deviation 1; every bias starts at 0.
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        # Stage 1: percentage of the long-term memory to remember.
        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        # Stage 2a: percentage of the potential long-term memory to remember.
        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        # Stage 2b: the potential long-term memory itself.
        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

        # Stage 3: percentage of the new short-term memory to output.
        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.), requires_grad=True)

    def lstm_unit(self, input_value, long_memory, short_memory):
        """Run one LSTM step.

        Args:
            input_value: the value observed at the current time step.
            long_memory: long-term memory carried in from the previous step.
            short_memory: short-term memory carried in from the previous step.

        Returns:
            A two-element list ``[updated_long_memory, updated_short_memory]``.
        """
        # First stage: calculate the percentage of the long-term memory to remember.
        long_remember_percent = torch.sigmoid(
            (short_memory * self.wlr1) + (input_value * self.wlr2) + self.blr1
        )
        # Second stage: create a new potential long-term memory and determine
        # what percentage of it to remember.
        potential_remember_percent = torch.sigmoid(
            (short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1
        )
        potential_memory = torch.tanh(
            (short_memory * self.wp1) + (input_value * self.wp2) + self.bp1
        )
        # Then update the long-term memory.
        updated_long_memory = (
            (long_memory * long_remember_percent)
            + (potential_memory * potential_remember_percent)
        )
        # Third stage: create the new short-term memory and determine what
        # percentage of it to output.
        output_percent = torch.sigmoid(
            (short_memory * self.wo1) + (input_value * self.wo2) + self.bo1
        )
        updated_short_memory = torch.tanh(updated_long_memory) * output_percent
        # Last stage: return the updated long- and short-term memories.
        return [updated_long_memory, updated_short_memory]

    def forward(self, input):
        """Unroll the LSTM unit over every value in ``input``.

        Args:
            input: an indexable sequence of per-step values (the notebook
                passes a 1-D tensor with four entries).

        Returns:
            The short-term memory after the final step.
        """
        # Both memories start at zero before the first time step.
        long_memory = 0
        short_memory = 0

        # Iterate over the whole sequence instead of hard-coding four steps, so
        # sequences of any length are processed in full.
        for step_value in input:
            long_memory, short_memory = self.lstm_unit(step_value, long_memory, short_memory)

        return short_memory

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with its default settings."""
        return Adam(self.parameters())

    def training_step(self, batch, batch_idx):
        """Compute the squared error for one batch and log it.

        Only the first sequence of the batch (``input_i[0]``) is used, so this
        expects the DataLoader to yield one example per batch.
        """
        input_i, label_i = batch
        output_i = self.forward(input_i[0])
        loss = (output_i - label_i)**2

        self.log("train_loss", loss)

        # Log the prediction under a per-label key so the two examples can be
        # tracked separately.
        if(label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1",output_i)

        return loss

In [32]:
# Seed the random number generators before the weights are drawn so the
# initial weights are the same on every fresh run of the notebook.
L.seed_everything(42)

model = LSTMbyHand()

# Predictions from the untrained model, for comparison with later cells.
print("\nNow lets compare the observed and predicted values...")
print("Company A: Observed=0, Predicted=",model(torch.tensor([0.,0.5,0.25,1.])).detach())
print("Company B: Observed=1, Predicted=",model(torch.tensor([1.,0.5,0.25,1.])).detach())


Now lets compare the observed and predicted values...
Company A: Observed=0, Predicted= tensor(-0.3369)
Company B: Observed=1, Predicted= tensor(-0.4096)


In [33]:
# Two training sequences of four values each, with targets 0 (Company A)
# and 1 (Company B).
inputs = torch.tensor([[0., 0.5, 0.25, 1.],[1., 0.5, 0.25, 1.]])
labels = torch.tensor([0., 1.])

dataset = TensorDataset(inputs, labels)
# No batch_size is given, so the DataLoader yields one example per batch;
# training_step relies on that by reading input_i[0].
dataLoader = DataLoader(dataset)

# There are only 2 batches per epoch, so log every 2 steps, matching the
# Trainer used for the nn.LSTM model later in the notebook.
trainer = L.Trainer(max_epochs=2000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataLoader)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)
0         Modules in train mode
0         Modules in eval mode
/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Us

Epoch 1999: 100%|██████████| 2/2 [00:00<00:00, 80.22it/s, v_num=4] 

`Trainer.fit` stopped: `max_epochs=2000` reached.


Epoch 1999: 100%|██████████| 2/2 [00:00<00:00, 54.42it/s, v_num=4]


In [34]:
# Predictions for both companies after the training run above.
company_a_days = torch.tensor([0., 0.5, 0.25, 1.])
company_b_days = torch.tensor([1., 0.5, 0.25, 1.])
print("Company A: Observed=0, Predicted=", model(company_a_days).detach())
print("Company B: Observed=1, Predicted=", model(company_b_days).detach())

Company A: Observed=0, Predicted= tensor(0.4856)
Company B: Observed=1, Predicted= tensor(0.5491)


In [35]:
# Resume from the checkpoint written by the previous trainer and continue
# until epoch 3000.
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
# log_every_n_steps=2 matches the 2 batches per epoch and the Trainer used
# for the nn.LSTM model later in the notebook.
trainer = L.Trainer(max_epochs=3000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataLoader, ckpt_path=path_to_best_checkpoint)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_4/checkpoints/epoch=1999-step=4000.ckpt
/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/.venv/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from '/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_4/checkpoints' to '/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_5/checkpoints', therefore `best_model_score`, `kth_best_model_path`,

Epoch 2999: 100%|██████████| 2/2 [00:00<00:00, 76.26it/s, v_num=5]

`Trainer.fit` stopped: `max_epochs=3000` reached.


Epoch 2999: 100%|██████████| 2/2 [00:00<00:00, 54.22it/s, v_num=5]


In [36]:
# Predictions for both companies after continuing training to 3000 epochs.
for caption, day_values in (
    ("Company A: Observed=0, Predicted=", [0., 0.5, 0.25, 1.]),
    ("Company B: Observed=1, Predicted=", [1., 0.5, 0.25, 1.]),
):
    print(caption, model(torch.tensor(day_values)).detach())

Company A: Observed=0, Predicted= tensor(0.3006)
Company B: Observed=1, Predicted= tensor(0.6620)


In [37]:
# Resume from the checkpoint written by the previous trainer and continue
# until epoch 5000.
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path
# log_every_n_steps=2 matches the 2 batches per epoch and the Trainer used
# for the nn.LSTM model later in the notebook.
trainer = L.Trainer(max_epochs=5000, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataLoader, ckpt_path=path_to_best_checkpoint)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_5/checkpoints/epoch=2999-step=6000.ckpt
/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/.venv/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from '/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_5/checkpoints' to '/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/Neural-Network/lightning_logs/version_6/checkpoints', therefore `best_model_score`, `kth_best_model_path`,

Epoch 4999: 100%|██████████| 2/2 [00:00<00:00, 87.74it/s, v_num=6] 

`Trainer.fit` stopped: `max_epochs=5000` reached.


Epoch 4999: 100%|██████████| 2/2 [00:00<00:00, 57.55it/s, v_num=6]


In [38]:
# Predictions for both companies after continuing training to 5000 epochs.
prediction_a = model(torch.tensor([0., 0.5, 0.25, 1.])).detach()
prediction_b = model(torch.tensor([1., 0.5, 0.25, 1.])).detach()
print("Company A: Observed=0, Predicted=", prediction_a)
print("Company B: Observed=1, Predicted=", prediction_b)

Company A: Observed=0, Predicted= tensor(0.0008)
Company B: Observed=1, Predicted= tensor(0.9622)


# Using and Training PyTorch's nn.LSTM()

In [39]:
class LightningLSTM(L.LightningModule):
    """Lightning module wrapping ``nn.LSTM`` with one input feature and one hidden unit."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(hidden_size=1, input_size=1)

    def forward(self, input):
        """Return the LSTM output at the last time step of ``input``.

        ``input`` is viewed as ``(len(input), 1)`` before being passed to the
        LSTM, i.e. one row per time step with a single feature.
        """
        sequence = input.view(len(input), 1)
        # The second return value (final hidden and cell state) is not needed.
        outputs, _ = self.lstm(sequence)
        return outputs[-1]

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 0.1."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batch_idx):
        """Compute the squared error for the first sequence of the batch and log it."""
        input_i, label_i = batch
        output_i = self.forward(input_i[0])
        loss = (output_i - label_i).pow(2)

        self.log("train_loss", loss)

        # Log the prediction under a key chosen by the example's label.
        metric_name = "out_0" if label_i == 0 else "out_1"
        self.log(metric_name, output_i)
        return loss

In [40]:
# Replace the hand-written model with the nn.LSTM-based one and show its
# predictions before training.
model = LightningLSTM()

for caption, day_values in (
    ("Company A: Observed=0, Predicted=", [0., 0.5, 0.25, 1.]),
    ("Company B: Observed=1, Predicted=", [1., 0.5, 0.25, 1.]),
):
    print(caption, model(torch.tensor(day_values)).detach())

Company A: Observed=0, Predicted= tensor([-0.3953])
Company B: Observed=1, Predicted= tensor([-0.3879])


In [41]:
# Train the nn.LSTM model for 300 epochs on the same two-example DataLoader,
# logging every 2 steps.
trainer = L.Trainer(
    log_every_n_steps=2,
    max_epochs=300,
)
trainer.fit(model, train_dataloaders=dataLoader)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type | Params | Mode 
--------------------------------------
0 | lstm | LSTM | 16     | train
--------------------------------------
16        Trainable params
0         Non-trainable params
16        Total params
0.000     Total estimated model params size (MB)
1         Modules in train mode
0         Modules in eval mode
/Users/sanjeeb/Desktop/Harbin Institute of Technology/Artificial Intelligence/AI_Codes/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 299: 100%|██████████| 2/2 [00:00<00:00, 137.19it/s, v_num=7]

`Trainer.fit` stopped: `max_epochs=300` reached.


Epoch 299: 100%|██████████| 2/2 [00:00<00:00, 102.59it/s, v_num=7]


In [42]:
# Predictions from the trained nn.LSTM model for both companies.
company_a_days = torch.tensor([0., 0.5, 0.25, 1.])
company_b_days = torch.tensor([1., 0.5, 0.25, 1.])
print("Company A: Observed=0, Predicted=", model(company_a_days).detach())
print("Company B: Observed=1, Predicted=", model(company_b_days).detach())

Company A: Observed=0, Predicted= tensor([0.0001])
Company B: Observed=1, Predicted= tensor([0.9882])
