In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

# Select the compute device once; the rest of the notebook moves the model and
# every batch onto `device`.
# `device` must be bound on both branches: if only the CUDA branch assigns it,
# a machine without CUDA fails with a NameError at the first `.to(device)`.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Cuda is avilable. Using device: {torch.cuda.get_device_name(0)}")
else:
    # Fall back to the CPU so the notebook still runs end to end.
    device = torch.device("cpu")
    print("cuda is not available")

class SimpleNN(nn.Module):
    """Three-layer fully connected classifier (784 -> 128 -> 64 -> 10).

    ReLU is applied after the first two layers; the last layer's output is
    returned as-is (raw class scores).
    """

    def __init__(self):
        super().__init__()
        # The attribute names are also the state_dict key prefixes.
        self.fc1 = nn.Linear(in_features=784, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Map a batch to class scores of shape (batch, 10).

        Every dimension after the first is flattened, so each sample must
        contain exactly 784 values.
        """
        flat = x.flatten(start_dim=1)
        hidden = self.fc1(flat).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

class EarlyStopping:
    """Track validation loss, remember the best weights, and flag when to stop.

    Call the instance once per epoch with the validation loss and the model.
    A loss counts as an improvement when ``-val_loss >= best_score + delta``;
    otherwise an internal counter is incremented, and once it reaches
    ``patience`` the ``early_stop`` flag is set.

    Args:
        patience: Number of non-improving calls after which ``early_stop``
            becomes True.
        delta: Minimum margin by which the score must exceed the best score
            to count as an improvement.

    Attributes:
        best_score: Negated validation loss of the best call so far, or None
            before the first call.
        early_stop: True once ``counter`` has reached ``patience``.
        counter: Non-improving calls since the last improvement.
        best_model_state: Independent copy of the model's state_dict taken at
            the best call, or None before the first call.
    """

    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0
        self.best_model_state = None

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state."""
        # Work with a score where larger is better.
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we see is the best so far.
            self._record_best(score, model)
        elif score < self.best_score + self.delta:
            # Not enough improvement: one step closer to stopping.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self._record_best(score, model)
            self.counter = 0

    def _record_best(self, score, model):
        """Store ``score`` and a snapshot of ``model``'s weights as the new best."""
        import copy

        self.best_score = score
        # state_dict() hands back tensors that share storage with the live
        # parameters, so keeping that dict directly would let later optimizer
        # steps overwrite the "best" weights. Deep-copy to freeze the snapshot.
        self.best_model_state = copy.deepcopy(model.state_dict())

    def load_best_model(self, model):
        """Load the best recorded weights into ``model``.

        ``best_model_state`` is None until the instance has been called at
        least once, in which case ``load_state_dict`` is handed None.
        """
        model.load_state_dict(self.best_model_state)


# Reproducibility: seed PyTorch's random number generators before anything
# draws from them. Weight initialisation in SimpleNN() and the shuffled
# training DataLoader both consume random numbers, so without a seed every
# run starts from different weights and sees batches in a different order.
SEED = 42
torch.manual_seed(SEED)

# Data loading
# ToTensor converts each image to a tensor; Normalize((0.5,), (0.5,)) then
# applies (x - 0.5) / 0.5 to the single channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])
# MNIST is downloaded into ./data on first use.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Model, loss function, and optimizer
model = SimpleNN()
model.to(device)  # move the parameters onto the selected device
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early stopping
early_stopping = EarlyStopping(patience=5, delta=0.01)

# Training loop
# Each epoch runs one optimisation pass over train_loader, then measures the
# loss over test_loader without gradients, prints both averages and lets
# early_stopping decide whether to stop.
# NOTE(review): the "validation" loss is computed on test_loader, the same
# data used for the final accuracy figure further down; consider a held-out
# validation split so the test set stays untouched.
# NOTE(review): with num_epochs = 5 and the patience of 5 passed to
# EarlyStopping above, the counter can reach at most 4 in five calls, so the
# `break` below cannot fire; raise num_epochs or lower patience if early
# stopping is meant to be active.
num_epochs = 5
for epoch in range(num_epochs):
    # ---- optimisation pass ----
    model.train()
    train_loss = 0
    for data, target in tqdm(train_loader):
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # The criterion yields a per-batch mean; weight it by the batch size
        # so the epoch figure is a per-sample average.
        train_loss = train_loss + loss.item() * data.size(0)

    train_loss = train_loss / len(train_loader.dataset)

    # ---- loss measurement, no gradient tracking ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for data, target in tqdm(test_loader):
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            loss = criterion(output, target)
            val_loss = val_loss + loss.item() * data.size(0)

    val_loss = val_loss / len(test_loader.dataset)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Update the stopping state and leave the loop once it is flagged.
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model
early_stopping.load_best_model(model)

# Final evaluation: count correct predictions over test_loader with gradient
# tracking disabled and the model in eval mode.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        outputs = model(data)
        # The predicted class is the index of the largest score in each row.
        # argmax replaces `_, predicted = torch.max(outputs.data, 1)`: the
        # `.data` accessor is unnecessary under no_grad, and assigning to `_`
        # in a notebook shadows IPython's last-output variable.
        predicted = outputs.argmax(dim=1)
        total += target.size(0)
        correct += (predicted == target).sum().item()

print(f'Accuracy of the model on the test images: {100 * correct / total:.2f}%')

Cuda is avilable. Using device: NVIDIA GeForce RTX 3050 Laptop GPU


100%|██████████| 938/938 [00:12<00:00, 75.72it/s]
100%|██████████| 157/157 [00:02<00:00, 68.69it/s]


Epoch 1, Train Loss: 0.3891, Val Loss: 0.2360


100%|██████████| 938/938 [00:17<00:00, 54.48it/s]
100%|██████████| 157/157 [00:02<00:00, 76.15it/s]


Epoch 2, Train Loss: 0.1821, Val Loss: 0.1481


100%|██████████| 938/938 [00:16<00:00, 58.43it/s]
100%|██████████| 157/157 [00:01<00:00, 79.23it/s]


Epoch 3, Train Loss: 0.1350, Val Loss: 0.1420


100%|██████████| 938/938 [00:15<00:00, 61.27it/s]
100%|██████████| 157/157 [00:02<00:00, 77.05it/s]


Epoch 4, Train Loss: 0.1105, Val Loss: 0.1257


100%|██████████| 938/938 [00:15<00:00, 60.60it/s]
100%|██████████| 157/157 [00:01<00:00, 85.20it/s]


Epoch 5, Train Loss: 0.0939, Val Loss: 0.0896
Accuracy of the model on the test images: 97.16%


In [2]:
# Measure peak GPU memory for one forward/backward sweep over train_loader.
if torch.cuda.is_available():
    # The peak counters otherwise cover everything since the process started
    # (including the training cell), so reset them to isolate this sweep.
    torch.cuda.reset_peak_memory_stats()

    with torch.cuda.profiler.profile():
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            # Clear gradients each step, as the training loop does; without
            # this, backward() keeps accumulating into the same .grad tensors.
            model.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()

    # No optimizer step is taken here; drop the leftover gradients so this
    # profiling cell does not leave stale state on the model.
    model.zero_grad()

    print(torch.cuda.max_memory_allocated())
    print(torch.cuda.max_memory_reserved())
else:
    # torch.cuda.profiler needs a CUDA runtime; skip instead of crashing.
    print("CUDA is not available; skipping GPU memory profiling")

19435008
25165824
