In [3]:
import torch
assert torch.cuda.is_available(), 'Switch to GPU Runtime: Runtime > Change runtime type > GPU'
!nvidia-smi

Thu Mar  6 19:14:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import torchvision
import torchvision.transforms as transforms
import time


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device:{device}")

Using Device:cuda


In [7]:
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

In [8]:
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 16.1MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 479kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 3.84MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 5.83MB/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [10]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle= True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size=128, shuffle=False, num_workers=2)


In [11]:
print("Training Data Size:", len(train_dataset))
print("Test Data Size:", len(test_dataset))

Training Data Size: 60000
Test Data Size: 10000


In [15]:
#MLP
class MLP(torch.nn.Module):
  def __init__(self):
    super(MLP, self).__init__()
    self.flatten = torch.nn.Flatten()
    self.fc1 = torch.nn.Linear(28*28, 512)
    self.fc2 = torch.nn.Linear(512, 256)
    self.fc3 = torch.nn.Linear(256, 10)
    self.relu = torch.nn.ReLU()

  def forward(self,x):
    x= self.flatten(x)
    x= self.relu(self.fc1(x))
    x= self.relu(self.fc2(x))
    x=self.fc3(x)
    return x

In [16]:
model = MLP().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()


In [17]:
#TRAIN
def train_epoch(loader, model, optimizer, criterion, device):
  model.train()
  for batch_idx, (data,target) in enumerate(loader):
    data,target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output= model(data)
    loss= criterion(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx%100 ==0:
      print(f"batch {batch_idx}, loss: {loss.item():.4f}")

In [18]:
for epoch in range(3):
  print(f"Epoch{epoch+1}")
  train_epoch(train_loader, model, optimizer, criterion, device)

Epoch1
batch 0, loss: 2.3042
batch 100, loss: 0.2279
batch 200, loss: 0.1798
batch 300, loss: 0.1803
batch 400, loss: 0.1344
Epoch2
batch 0, loss: 0.0255
batch 100, loss: 0.1235
batch 200, loss: 0.1149
batch 300, loss: 0.0868
batch 400, loss: 0.0436
Epoch3
batch 0, loss: 0.0540
batch 100, loss: 0.0560
batch 200, loss: 0.0170
batch 300, loss: 0.0727
batch 400, loss: 0.0887


In [19]:
#Evaluate
model.eval()
correct=0
total=0
with torch.no_grad():
  for data, target in test_loader:
    data, target= data.to(device), target.to(device)
    outputs = model(data)
    _,predicted = torch.max(outputs.data,1)
    total+=target.size(0)
    correct+=(predicted==target).sum().item()
accuracy = 100*correct/total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 97.41%


In [31]:
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000, shuffle=False, num_workers=2)

# GPU inference with multiple runs for stability
model.eval()
gpu_times = []
for _ in range(5):  # Average over 5 runs
    with torch.no_grad():
        total_time = 0
        for data, target in test_loader:
            data = data.to(device)
            torch.cuda.synchronize()  # Sync before timing
            start_time = time.time()
            outputs = model(data)
            torch.cuda.synchronize()  # Sync after
            total_time += time.time() - start_time
    gpu_times.append(total_time)

gpu_time = sum(gpu_times) / len(gpu_times)  # Average time
gpu_throughput = len(test_dataset) / gpu_time
print(f"GPU inference time (avg): {gpu_time:.4f} seconds")
print(f"GPU throughput: {gpu_throughput:.2f} images/second")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [26]:
#cpu inference
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000, shuffle=False, num_workers=2)
cpu_device = torch.device('cpu')
model_cpu=model.to(cpu_device)
cpu_time=0
with torch.no_grad():
  for data, target in test_loader:
    data = data.to(cpu_device)
    start_time=time.time()
    outputs = model_cpu(data)
    cpu_time+= time.time() - start_time

cpu_throughput = len(test_dataset)/cpu_time
print(f"CPU Inference Time:{cpu_time:.4f} secs")
print(f"CPU Throughput:{cpu_throughput:.2f} images/sec")

CPU Inference Time:0.2933 secs
CPU Throughput:34099.70 images/sec


In [28]:
speedup = (gpu_throughput - cpu_throughput) / cpu_throughput * 100
print(f"GPU speedup over CPU: {speedup:.2f}%")

GPU speedup over CPU: 2492.66%


In [30]:
from google.colab import drive
drive.mount('/content/drive')

# Save model
torch.save(model.state_dict(), 'mnist_model_gpu.pth')
!cp mnist_model_gpu.pth /content/drive/MyDrive/mnist_model_gpu.pth
print("Model saved to Drive")

Mounted at /content/drive
Model saved to Drive


In [33]:
# Prepare test batch
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000, shuffle=False, num_workers=2)

# GPU inference with multiple runs for stability
model = model.to(device)
model.eval()
gpu_times = []
for _ in range(5):  # Average over 5 runs
    with torch.no_grad():
        total_time = 0
        for data, target in test_loader:
            data = data.to(device)
            torch.cuda.synchronize()  # Sync before timing
            start_time = time.time()
            outputs = model(data)
            torch.cuda.synchronize()  # Sync after
            total_time += time.time() - start_time
    gpu_times.append(total_time)

gpu_time = sum(gpu_times) / len(gpu_times)  # Average time
gpu_throughput = len(test_dataset) / gpu_time
print(f"GPU inference time (avg): {gpu_time:.4f} seconds")
print(f"GPU throughput: {gpu_throughput:.2f} images/second")

# CPU inference (unchanged)
cpu_device = torch.device('cpu')
model_cpu = model.to(cpu_device)
cpu_time = 0
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(cpu_device)
        start_time = time.time()
        outputs = model_cpu(data)
        cpu_time += time.time() - start_time

cpu_throughput = len(test_dataset) / cpu_time
print(f"CPU inference time: {cpu_time:.4f} seconds")
print(f"CPU throughput: {cpu_throughput:.2f} images/second")

# Speedup
speedup = (gpu_throughput - cpu_throughput) / cpu_throughput * 100
print(f"GPU speedup over CPU: {speedup:.2f}%")

GPU inference time (avg): 0.0101 seconds
GPU throughput: 993844.96 images/second
CPU inference time: 0.2825 seconds
CPU throughput: 35399.42 images/second
GPU speedup over CPU: 2707.52%


In [36]:
# Single-run sanity check with full test set in one batch
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=10000, shuffle=False, num_workers=2)
model= model.to(device)
model.eval()

with torch.no_grad():
    data, target = next(iter(test_loader))  # One batch of 10,000
    data = data.to(device)
    torch.cuda.synchronize()
    start_time = time.time()
    outputs = model(data)
    torch.cuda.synchronize()
    gpu_time = time.time() - start_time

gpu_throughput = len(test_dataset) / gpu_time
print(f"GPU inference time (single run, 10k batch): {gpu_time:.4f} seconds")
print(f"GPU throughput: {gpu_throughput:.2f} images/second")

GPU inference time (single run, 10k batch): 0.0046 seconds
GPU throughput: 2159118.71 images/second


In [37]:
# Multi-run sanity check with 10k batch
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=10000, shuffle=False, num_workers=2)
model= model.to(device)
model.eval()

gpu_times = []
for _ in range(5):  # 5 runs
    with torch.no_grad():
        data, target = next(iter(test_loader))
        data = data.to(device)
        torch.cuda.synchronize()
        start_time = time.time()
        outputs = model(data)
        torch.cuda.synchronize()
        gpu_times.append(time.time() - start_time)

gpu_time = sum(gpu_times) / 5
gpu_throughput = len(test_dataset) / gpu_time
print(f"GPU inference time (avg, 10k batch): {gpu_time:.4f} seconds")
print(f"GPU throughput: {gpu_throughput:.2f} images/second")

GPU inference time (avg, 10k batch): 0.0046 seconds
GPU throughput: 2156875.89 images/second
