In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


In [26]:
# Load data
transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [27]:
train_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=transforms
)

test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=transforms
)


In [28]:
train_data, validation_data = torch.utils.data.random_split(train_data, [0.8, 0.2], generator=torch.Generator().manual_seed(55))

In [29]:
batch_size = 128
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

In [30]:
class NonlinearClassifier(nn.Module):

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers_stack = nn.Sequential(
            nn.Linear(28*28, 50),
            nn.ReLU(),
            #nn.Dropout(0.2),
            nn.Linear(50, 50),
            nn.ReLU(),
            #nn.Dropout(0.2),
            nn.Linear(50, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.layers_stack(x)

        return x

In [31]:
def train_one_epoch(dataloader, model, loss_fn, optimizer):
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)

        loss = loss_fn(pred, y)

        loss.backward()

        optimizer.step()

        optimizer.zero_grad()

In [32]:
def evaluate(dataloader, model, loss_fn):
    model.eval()
    num_batches = len(dataloader)
    correct, loss = 0.0, 0.0
    
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    loss /= num_batches
    correct /= num_batches
    accuracy = 100.0 * correct 
    
    return accuracy, loss
            

In this problem, using the **Adam optimizer** may be superior to SGD.
Adam can adaptively adjust the learning rate for each parameter, which means optimizing the update speed for different parameters. 

For complex deep learning models, different parameters may exhibit significant differences in features, so adaptive learning rate adjustment can more effectively converge the model to the optimal solution.

Meanwhile, we use NonlinearClassifier instead of linear.

In [33]:
nonlinear_model = NonlinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(nonlinear_model.parameters(), lr=0.0001)

In [34]:
epochs = 10
for j in range(epochs) :
    train_one_epoch(train_loader, nonlinear_model, loss_fn, optimizer)

    acc, loss = evaluate(validation_loader, nonlinear_model, loss_fn)
    print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")

Epoch 0: training loss: 0.8515109914414426, accuracy: 10255.31914893617
Epoch 1: training loss: 0.49639235373507157, accuracy: 11104.255319148937
Epoch 2: training loss: 0.3944193318169168, accuracy: 11376.595744680852
Epoch 3: training loss: 0.3474557436210044, accuracy: 11493.617021276596
Epoch 4: training loss: 0.3170074398847336, accuracy: 11620.212765957447
Epoch 5: training loss: 0.2969891703192224, accuracy: 11667.021276595744
Epoch 6: training loss: 0.2802999181316254, accuracy: 11711.702127659575
Epoch 7: training loss: 0.26866967516376616, accuracy: 11754.255319148937
Epoch 8: training loss: 0.2573510560583561, accuracy: 11788.297872340425
Epoch 9: training loss: 0.24700508155721299, accuracy: 11838.297872340425
