## Load Data

In [119]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.datasets import FashionMNIST
import torchvision.transforms as transforms
import numpy as np
import random
import torch.nn.functional as F

np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [120]:
batch_size = 512
num_epochs = 15

train_dataset = FashionMNIST('./data', train=True, download=False, transform=transforms.ToTensor())
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)

In [121]:
class SwigGLU(nn.Module):
    def __init__(self):
        super(SwigGLU, self).__init__()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        a, b = x.chunk(2, dim=1)
        return a * self.sigmoid(b)

In [122]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims*2)
        self.layer2 = nn.Linear(hidden_dims*2, hidden_dims*2)
        self.layer3 = nn.Linear(hidden_dims*2, hidden_dims*2)
        self.output = nn.Linear(hidden_dims, output_dims)
        self.bn1 = nn.BatchNorm1d(hidden_dims)
        self.bn_out = nn.BatchNorm1d(output_dims)
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # nn.init.normal_(m.weight, mean=0.0, std=0.05)
                nn.init.kaiming_normal(m.weight)
                nn.init.constant_(m.bias, 0.0)


    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = self.bn1(x)
        # x = F.relu(x)
        x = SwigGLU()(x)
        identity = x
        
        x = self.layer2(x)
        x = self.bn1(x)
        # x = F.relu(x)
        x = SwigGLU()(x)
        x = self.layer3(x)
        x = self.bn1(x)
        # x = F.relu(x)
        x = SwigGLU()(x)
        
        x = x + identity
        out = self.output(x)
        out = self.bn_out(out)
        
        return out

In [123]:
model = MLP(input_dims=784, hidden_dims=128, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

  nn.init.kaiming_normal(m.weight)


In [124]:
torch.backends.cudnn.enabled = False
for epoch in range(num_epochs):    
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)

    t_loss /= len(train_loader)
    t_acc /= cnt
    print(f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

RuntimeError: running_mean should contain 256 elements not 128