In [1]:
from sklearn.datasets import fetch_openml
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
mnist = fetch_openml('mnist_784', as_frame=False)
X, y = torch.from_numpy(mnist.data.astype('float32')).to(device) / 255, torch.from_numpy(mnist.target.astype('int')).to(device)

In [2]:
from torch.utils.data import TensorDataset, DataLoader

split = int(len(X) * 6 / 7)

test_X = X[split:]
test_y = y[split:]
train_X = X[:split]
train_y = y[:split]

train_X.shape, train_y.shape

(torch.Size([60000, 784]), torch.Size([60000]))

In [3]:
import torch.nn as nn, torch.nn.functional as F

class MulticlassMLP(nn.Module):
    def __init__(self, in_dim=784, hidden_dim=128, out_dim=10):
        super().__init__()
        self.W1 = nn.Parameter(torch.randn(in_dim, hidden_dim, device=device))
        self.B1 = nn.Parameter(torch.randn(hidden_dim, device=device))
        self.W2 = nn.Parameter(torch.randn(hidden_dim, out_dim, device=device))
        self.B2 = nn.Parameter(torch.randn(out_dim, device=device))
    
    def forward(self, X):
        X2 = X @ self.W1 + self.B1
        X3 = F.relu(X2)
        X4 = X3 @ self.W2 + self.B2
        return X4

model = MulticlassMLP().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [4]:
model.train()
epoch = 10000
train_error, test_error = [], []

for i in range(epoch):
    train_loss = criterion(model(train_X), train_y)
    test_loss = criterion(model(test_X), test_y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    train_error.append(train_loss)
    test_error.append(test_loss)
    if i % 1000 == 0:
        print(f'Train error: {train_loss}. Test error: {test_loss}.')

Train error: 114.4581527709961. Test error: 114.16349029541016.


OutOfMemoryError: CUDA out of memory. Tried to allocate 30.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 16.31 MiB is free. Including non-PyTorch memory, this process has 3.84 GiB memory in use. Of the allocated memory 3.70 GiB is allocated by PyTorch, and 70.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
correct = 0

for i in range(len(test_X)):
    correct += torch.argmax(model(test_X[i])) == test_y[i].item()
    print(torch.argmax(model(test_X[i])), test_y[i].item())

tensor(5, device='cuda:0') 5
tensor(0, device='cuda:0') 0
tensor(2, device='cuda:0') 4
tensor(1, device='cuda:0') 1
tensor(9, device='cuda:0') 9
tensor(2, device='cuda:0') 2
tensor(1, device='cuda:0') 1
tensor(3, device='cuda:0') 3
tensor(1, device='cuda:0') 1
tensor(4, device='cuda:0') 4
tensor(3, device='cuda:0') 3
tensor(5, device='cuda:0') 5
tensor(3, device='cuda:0') 3
tensor(6, device='cuda:0') 6
tensor(1, device='cuda:0') 1
tensor(7, device='cuda:0') 7
tensor(2, device='cuda:0') 2
tensor(8, device='cuda:0') 8
tensor(6, device='cuda:0') 6
tensor(9, device='cuda:0') 9
tensor(4, device='cuda:0') 4
tensor(0, device='cuda:0') 0
tensor(9, device='cuda:0') 9
tensor(1, device='cuda:0') 1
tensor(7, device='cuda:0') 1
tensor(2, device='cuda:0') 2
tensor(4, device='cuda:0') 4
tensor(3, device='cuda:0') 3
tensor(7, device='cuda:0') 2
tensor(7, device='cuda:0') 7
tensor(3, device='cuda:0') 3
tensor(8, device='cuda:0') 8
tensor(6, device='cuda:0') 6
tensor(9, device='cuda:0') 9
tensor(0, devi

In [None]:
correct

tensor(53272, device='cuda:0')

In [7]:
import torch
del model
del train_X
del train_y
del test_X
del test_y

In [8]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [9]:
!nvidia-smi

Thu Sep 25 22:18:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro P2000                   On  |   00000000:01:00.0  On |                  N/A |
| N/A   52C    P8            N/A  / 5001W |    4016MiB /   4096MiB |     26%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [12]:
import gc, torch
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) and obj.is_cuda:
            print(type(obj), obj.size(), obj.device)
    except:
        pass


<class 'torch.Tensor'> torch.Size([70000, 784]) cuda:0
<class 'torch.Tensor'> torch.Size([70000]) cuda:0
<class 'torch.Tensor'> torch.Size([10000, 784]) cuda:0
<class 'torch.Tensor'> torch.Size([10000]) cuda:0
<class 'torch.Tensor'> torch.Size([60000, 784]) cuda:0
<class 'torch.nn.parameter.Parameter'> torch.Size([784, 128]) cuda:0
<class 'torch.Tensor'> torch.Size([]) cuda:0
<class 'torch.Tensor'> torch.Size([10000, 128]) cuda:0
<class 'torch.Tensor'> torch.Size([]) cuda:0
<class 'torch.Tensor'> torch.Size([784, 128]) cuda:0
<class 'torch.Tensor'> torch.Size([784, 128]) cuda:0
<class 'torch.Tensor'> torch.Size([128]) cuda:0
<class 'torch.Tensor'> torch.Size([128]) cuda:0
<class 'torch.Tensor'> torch.Size([128, 10]) cuda:0
<class 'torch.Tensor'> torch.Size([128, 10]) cuda:0
<class 'torch.Tensor'> torch.Size([10]) cuda:0
<class 'torch.Tensor'> torch.Size([10]) cuda:0
<class 'torch.Tensor'> torch.Size([]) cuda:0
<class 'torch.Tensor'> torch.Size([10000, 128]) cuda:0
<class 'torch.Tensor'