In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchvision import transforms
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader, random_split
from torch import optim

# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Show which device the rest of the notebook will use.
print(device)

In [45]:
BATCH_SIZE = 64
SPLIT_SEED = 0  # fixes the train/validation partition across kernel restarts

# Bound to a new name on purpose: assigning the pipeline back to `transforms`
# would replace the imported torchvision module and make this cell fail with
# an AttributeError the second time it is run.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.2860,), std=(0.3530,)),
])
dataset = FashionMNIST(root='data/', download=True, transform=transform)

# Keep the Subset objects under their own names so `train` / `val` only ever
# refer to DataLoaders and the cell stays idempotent.
train_set, val_set = random_split(
    dataset,
    [55000, 5000],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# drop_last=True keeps every batch at exactly BATCH_SIZE samples.
train = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [90]:
class SimpleLSTM(nn.Module):
    """Sequence classifier: an ``nn.LSTM`` followed by a single linear layer.

    The LSTM output at the last time step is projected to ``output_size``
    class scores.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of classes (width of the returned scores).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,input_size,hidden_size,output_size,num_layers = 1) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # batch_first=False: inputs are laid out as (seq_len, batch, input_size).
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return unnormalised class scores (logits).

        Args:
            x: Tensor of shape (seq_len, batch, input_size).

        Returns:
            Tensor of shape (batch, output_size). No softmax is applied:
            ``nn.CrossEntropyLoss`` performs log-softmax internally, so
            feeding it probabilities would normalise twice. Call
            ``.softmax(dim=1)`` on the result when probabilities are needed.
        """
        out, _ = self.rnn(x)   # out: (seq_len, batch, hidden_size)
        last_step = out[-1]    # keep only the final time step
        return self.fc1(last_step)


This [paper](https://arxiv.org/pdf/1508.02774.pdf) was used as a reference for choosing the hidden size of the LSTM and the learning rate. In short, it reports that LSTM training is very sensitive to the learning rate and the hidden size, whereas other hyperparameters, such as the batch size and the momentum, do not significantly affect training.

In [None]:
EPOCHS = 1_000
model = SimpleLSTM(1,512,10)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=1e-3)


model.train()
for epoch in range(EPOCHS):
    batch_losses = []  # plain list: appending is O(1), unlike np.append
    for x, y in train:
        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device)
        # (batch, 1, 28, 28) -> (784, batch, 1): each image becomes a sequence
        # of single-pixel steps, in the (seq_len, batch, input_size) layout the
        # LSTM is built for. No dependence on BATCH_SIZE.
        x = x.flatten(start_dim=1).t().unsqueeze(-1)
        pred = model(x)  # (batch, 10)
        # CrossEntropyLoss takes integer class indices directly. The previous
        # one_hot(...).view(10, BATCH_SIZE).permute(1, 0) reinterpreted memory
        # instead of transposing, which scrambled every sample's target row.
        loss = criterion(pred, y)
        batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    loss_list = np.asarray(batch_losses)
    print(f"Epoch {epoch} de {EPOCHS}")
    print(f"Loss: {loss_list.mean()}")

In [None]:
class LitSimpleLTSM(pl.LightningModule):
    """Lightning module that trains a ``SimpleLSTM`` on batches of images.

    NOTE(review): ``pl`` is not imported in the visible cells — presumably
    ``import pytorch_lightning as pl``; confirm it is in scope before this
    cell runs.

    Args:
        input_size: Features per time step, forwarded to ``SimpleLSTM``.
        hidden_size: LSTM hidden state size, forwarded to ``SimpleLSTM``.
        output_size: Number of classes, forwarded to ``SimpleLSTM``.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,input_size,hidden_size,output_size,num_layers = 1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.RNN = SimpleLSTM(input_size=input_size,hidden_size=hidden_size,output_size=output_size,num_layers=num_layers)

    def hot_encode(self,y, num_classes=10):
        """One-hot encode integer class labels.

        Args:
            y: Integer tensor of class indices, shape (batch,).
            num_classes: Width of the encoding.

        Returns:
            Float tensor of shape (batch, num_classes); row ``i`` is the
            encoding of ``y[i]``. The result is not reshaped: a ``view`` to
            (num_classes, batch) reinterprets memory rather than transposing
            and would mix labels across samples.
        """
        return F.one_hot(y, num_classes=num_classes).float()

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch.

        Args:
            batch: ``(x, y)`` pair; assumes ``x`` is (batch, 1, H, W) images
                and ``y`` integer class indices — TODO confirm against the
                DataLoader in use.
            batch_idx: Index of the batch (unused).

        Returns:
            Scalar loss tensor.
        """
        x, y = batch
        # (batch, 1, H, W) -> (H*W, batch, 1): one pixel per time step, in the
        # (seq_len, batch, input_size) layout SimpleLSTM expects. Works for
        # any batch size, including a short final batch.
        x = x.flatten(start_dim=1).t().unsqueeze(-1)
        pred = self.RNN(x)
        # cross_entropy accepts integer class indices directly, so no one-hot
        # encoding (and no reshaping of the targets) is required.
        loss = F.cross_entropy(pred, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        # Intentionally a no-op for now: validation is not implemented yet.
        pass

    def configure_optimizers(self):
        """Return the Adam optimizer (lr=1e-3) over all module parameters."""
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer