In [18]:
import torch
import torch.nn as nn

In [19]:
class Layer:
    """One fully connected layer: ``a = activation(X @ W + b)``.

    Args:
        D_in: Number of input features (rows of ``W``).
        D_out: Number of output features (columns of ``W``, length of ``b``).
        activation_type: ``"relu"``, ``"tanh"`` or ``"sigmoid"``. Any other
            value (for example ``"none"``) leaves the layer without an
            activation, so ``forward`` returns the pre-activation unchanged.

    Attributes:
        W: ``(D_in, D_out)`` weight tensor, drawn from a standard normal and
            scaled by ``sqrt(2 / D_in)``; tracked by autograd.
        b: ``(D_out,)`` bias tensor initialised to zeros; tracked by autograd.
        activation: The ``torch.nn`` activation module, or ``None``.
    """

    def __init__(self, D_in, D_out, activation_type="relu"):
        self.D_in, self.D_out = D_in, D_out
        self.activation_type = activation_type

        # Scale factor sqrt(2 / fan_in) applied to the normal draw.
        init_scale = torch.sqrt(torch.tensor(2.0 / D_in))
        self.W = torch.randn((D_in, D_out)) * init_scale
        self.b = torch.zeros(D_out)
        # Both tensors are leaves, so autograd can be switched on in place.
        for tensor in (self.W, self.b):
            tensor.requires_grad = True

        # Name -> module class; names not listed here mean "no activation".
        activation_classes = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "sigmoid": nn.Sigmoid,
        }
        activation_cls = activation_classes.get(activation_type)
        self.activation = activation_cls() if activation_cls is not None else None

    def forward(self, X):
        """Apply the layer to ``X``.

        Returns:
            A tuple ``(a, z)`` where ``z = X @ W + b`` and ``a`` is the
            activation applied to ``z`` (or ``z`` itself when the layer has
            no activation).
        """
        z = X @ self.W + self.b
        a = z if self.activation is None else self.activation(z)
        return a, z

    def parameters(self):
        """Return the trainable tensors as the list ``[W, b]``."""
        return [self.W, self.b]

In [20]:
class Network:
    """Feed-forward stack of ``Layer`` objects with a hand-written optimizer step.

    Args:
        layer_dims: Sequence of layer widths; layer ``i`` maps
            ``layer_dims[i]`` features to ``layer_dims[i + 1]`` features.
        activation_types: One activation name per layer. When ``None``, every
            layer uses ``"relu"`` except the last, which uses ``"none"``.
        optimizer_type: ``None`` for plain gradient descent, ``"SGD"`` for
            gradient descent with momentum, or ``"Adam"``.

    Raises:
        ValueError: If ``optimizer_type`` is not one of the supported values.
            Without this check an unrecognised name (e.g. ``"adam"``) would
            run forward/backward on every step but never update a weight.
    """

    # Optimizer names understood by train_epoch; None means plain gradient descent.
    _SUPPORTED_OPTIMIZERS = (None, "SGD", "Adam")

    def __init__(self, layer_dims, activation_types=None, optimizer_type=None):
        if optimizer_type not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer_type {optimizer_type!r}; "
                f"expected one of {self._SUPPORTED_OPTIMIZERS}"
            )

        self.layers = []
        self.layer_dims = layer_dims
        self.optimizer_type = optimizer_type

        if activation_types is None:
            activation_types = ["relu"] * (len(layer_dims) - 2) + ["none"]

        for i in range(len(layer_dims) - 1):
            layer = Layer(layer_dims[i], layer_dims[i + 1], activation_types[i])
            self.layers.append(layer)

        # Per-parameter optimizer state, index-aligned with self.parameters().
        if optimizer_type == "Adam":
            self.mt = [torch.zeros_like(p, requires_grad=False) for p in self.parameters()]
            self.vt = [torch.zeros_like(p, requires_grad=False) for p in self.parameters()]
            self.t = 0  # number of Adam steps taken so far (used for bias correction)
        elif optimizer_type == "SGD":
            self.velocity = [torch.zeros_like(p, requires_grad=False) for p in self.parameters()]

    def forward(self, X):
        """Run ``X`` through every layer in order and return the final activation."""
        a = X
        for layer in self.layers:
            a, _ = layer.forward(a)
        return a

    def parameters(self):
        """Return a flat list of every layer's trainable tensors, in layer order."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def train_epoch(self, X, y, learning_rate, momentum=0.9,
                    beta1=0.9, beta2=0.999, epsilon=1e-8):
        """Perform one optimisation step on the batch ``(X, y)``.

        Despite the name, each call does a single forward pass, backward pass
        and parameter update on the tensors it is given; the caller is
        responsible for looping over batches.

        Args:
            X: Input batch fed to ``forward``.
            y: Targets, broadcast-compatible with the network output.
            learning_rate: Step size.
            momentum: Velocity decay; only used when ``optimizer_type == "SGD"``.
            beta1: Decay rate of the first-moment estimate (Adam only).
            beta2: Decay rate of the second-moment estimate (Adam only).
            epsilon: Term added to the denominator for numerical stability
                (Adam only).

        Returns:
            The mean-squared-error loss of the batch, as a Python float,
            evaluated before the parameters were updated.

        Raises:
            ValueError: If ``self.optimizer_type`` is not supported.
        """
        # Fail before touching autograd so no stale gradients are left behind.
        if self.optimizer_type not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError(
                f"Unsupported optimizer_type {self.optimizer_type!r}; "
                f"expected one of {self._SUPPORTED_OPTIMIZERS}"
            )

        predictions = self.forward(X)
        loss = torch.mean((predictions - y) ** 2)
        loss.backward()

        params = self.parameters()

        # Updates are in-place on leaf tensors, so they must not be recorded.
        with torch.no_grad():
            if self.optimizer_type is None:
                for param in params:
                    param -= learning_rate * param.grad

            elif self.optimizer_type == "SGD":
                for i, param in enumerate(params):
                    self.velocity[i] = momentum * self.velocity[i] - learning_rate * param.grad
                    param += self.velocity[i]

            else:  # "Adam"
                self.t += 1
                for i, param in enumerate(params):
                    grad = param.grad
                    self.mt[i] = beta1 * self.mt[i] + (1 - beta1) * grad
                    self.vt[i] = beta2 * self.vt[i] + (1 - beta2) * (grad ** 2)
                    # Bias correction for the zero-initialised moment estimates.
                    mt_hat = self.mt[i] / (1 - beta1 ** self.t)
                    vt_hat = self.vt[i] / (1 - beta2 ** self.t)
                    # epsilon is added AFTER the square root. Putting it inside
                    # the root turns 1e-8 into an effective 1e-4 floor and
                    # noticeably shrinks updates for small gradients.
                    param -= learning_rate * mt_hat / (torch.sqrt(vt_hat) + epsilon)

        # Gradients accumulate across backward() calls, so reset them for the next step.
        for param in params:
            if param.grad is not None:
                param.grad.zero_()

        return loss.item()

In [21]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


# Preprocessing applied to every image: convert to a tensor, then normalise
# with the constants below (presumably the MNIST pixel mean/std - confirm).
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Arguments shared by the train and test splits; the data is downloaded into
# ./data when it is not already present.
_mnist_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = torchvision.datasets.MNIST(train=True, **_mnist_kwargs)
test_dataset = torchvision.datasets.MNIST(train=False, **_mnist_kwargs)

# Only the training loader is shuffled.
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Pull a single training batch for ad-hoc inspection.
images, labels = next(iter(train_loader))


In [22]:
# Network shape: 784 inputs (flattened 28 * 28 images), three hidden layers,
# and 10 outputs.
layer_dims = [784, 128, 64, 32, 10]
# ReLU on every hidden layer; "none" leaves the output layer without an activation.
activation_types = ["relu"] * 3 + ["none"]
# "SGD" selects the momentum update in Network.train_epoch.
optimizer_type = "SGD"
model = Network(
    layer_dims,
    activation_types=activation_types,
    optimizer_type=optimizer_type,
)

def train_mnist_epoch(model, train_loader, learning_rate, num_classes=10):
    """Train ``model`` for one pass over ``train_loader`` and return the mean batch loss.

    Each batch is flattened to ``(batch, features)``, its integer labels are
    one-hot encoded, and both are handed to ``model.train_epoch``, which
    performs the actual optimisation step and returns that batch's loss.

    Args:
        model: Object exposing ``train_epoch(X, y, learning_rate)`` that
            returns the batch loss as a number.
        train_loader: Iterable yielding ``(images, labels)`` pairs. ``labels``
            must be an integer (int64) tensor of class indices in
            ``[0, num_classes)``.
        learning_rate: Step size forwarded to ``model.train_epoch``.
        num_classes: Width of the one-hot target vectors. Defaults to 10.

    Returns:
        The average of the per-batch losses.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    total_loss = 0.0
    num_batches = 0

    for images, labels in train_loader:
        # Flatten everything except the batch dimension, so the image size
        # is not hard-coded.
        flat_images = images.reshape(images.shape[0], -1)

        # one_hot returns int64; cast so the targets match the inputs' dtype.
        targets = torch.nn.functional.one_hot(labels, num_classes).to(flat_images.dtype)

        total_loss += model.train_epoch(flat_images, targets, learning_rate)
        num_batches += 1

    # Count batches ourselves rather than calling len(), so any iterable works
    # and an empty loader produces a clear error instead of a division by zero.
    if num_batches == 0:
        raise ValueError("train_loader yielded no batches")

    return total_loss / num_batches

# Hyperparameters for this run.
learning_rate, epochs = 0.01, 3
losses = []  # mean training loss of each epoch, in order

# One full pass over train_loader per epoch; report the mean loss as we go.
for epoch in range(epochs):
    loss = train_mnist_epoch(model, train_loader, learning_rate)
    losses += [loss]
    print(f"Epoch {epoch+1}/{epochs}: Loss = {loss:.6f}")

Epoch 1/3: Loss = 0.063990
Epoch 2/3: Loss = 0.031553
Epoch 3/3: Loss = 0.022946


In [23]:
# Measure classification accuracy on the test set: the predicted class is the
# index of the largest network output.
total_test = 0
correct_prediction = 0

# Inference only. The layer weights require grad, so without no_grad every
# forward pass would build an autograd graph that is never used.
with torch.no_grad():
    for images, labels in test_loader:
        images = images.reshape(-1, 28 * 28)
        prediction = torch.argmax(model.forward(images), dim=1)
        # .item() keeps the running count a Python int, so the final ratio is
        # computed in double precision rather than float32 tensor arithmetic.
        correct_prediction += (prediction == labels).sum().item()
        total_test += labels.shape[0]

# Multiply before dividing: the numerator is then an exact integer and the
# percentage is a single correctly rounded division.
accuracy = 100.0 * correct_prediction / total_test
print(f"Accuracy: {accuracy}")

Accuracy: 88.2300033569336
