# Profiling ResNets on GPU - Training and Data Loading

Next, we compare the training time and test accuracy of a ResNet (ResNet-101) with those of VGG-16 on CIFAR-100.

In [86]:
import numpy as np
import torch
from torch import nn
from torch.profiler import profile, record_function, ProfilerActivity
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import time

In [87]:
# Directory the CIFAR-100 files are read from (and downloaded into).
path = "./data"

# DataLoader settings shared by the train and test loaders.
batch_size = 64
workers = 2

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [88]:
# Convert images to tensors and normalize every RGB channel with
# mean 0.5 / std 0.5. The same transform is applied to both splits.
transform_train = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Training split: reshuffled every epoch.
trainset = torchvision.datasets.CIFAR100(
    root=path, train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=workers)

# Test split: accuracy does not depend on sample order, so shuffling is
# disabled to keep evaluation deterministic and avoid the extra overhead.
testset = torchvision.datasets.CIFAR100(
    root=path, train=False, download=True, transform=transform_train)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=workers)

Files already downloaded and verified
Files already downloaded and verified


In [90]:
def train(model, criterion, trainloader, optimizer, scheduler, epochs):
    """Train ``model`` for ``epochs`` full passes over ``trainloader``.

    Args:
        model: Network to optimize; it is switched to training mode.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        trainloader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        epochs: Number of passes over ``trainloader``.

    Prints the epoch index at the start of every epoch and the mean loss
    of each completed window of 100 batches.
    """
    # Send batches to the device the model lives on instead of relying on a
    # notebook-global `device` defined in another cell. A model without
    # parameters falls back to "cuda if available, else cpu".
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    for epoch in range(0, epochs):
        print('\nEpoch: %d' % epoch)
        running_loss = 0.0

        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs = inputs.to(target_device)
            labels = labels.to(target_device)

            # Gradients accumulate by default, so clear them before each step.
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Report and reset the running mean every 100 batches.
            if i % 100 == 99:
                print(f'Average loss: {running_loss / 100}')
                running_loss = 0.0

        # The learning-rate schedule advances once per epoch.
        scheduler.step()

In [96]:
def evaluate(model, testloader):
    """Measure the top-1 accuracy of ``model`` on ``testloader``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        testloader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage (float), or ``None`` when ``testloader``
        yields no samples. The floored integer percentage is also printed.
    """
    # Send batches to the device the model lives on instead of relying on a
    # notebook-global `device` defined in another cell. A model without
    # parameters falls back to "cuda if available, else cpu".
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    total = 0
    correct = 0

    # No gradients are needed for inference.
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images = images.to(target_device)
            labels = labels.to(target_device)
            # calculate outputs by running images through the network
            outputs = model(images)
            # the class with the highest score is what we choose as prediction
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader, which would otherwise divide by zero.
    if total == 0:
        print('No test samples were provided; accuracy is undefined.')
        return None

    print(f'Accuracy of the network on the test images: {100 * correct // total} %')
    return 100.0 * correct / total

In [97]:
lr = 0.0001
epochs = 30

# Size the classifier head for the dataset (CIFAR-100) rather than
# torchvision's default of 1000 output classes.
net = torchvision.models.resnet101(num_classes=len(trainset.classes))
net = net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=lr)
# Drop the learning rate by 10x after epoch 20.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20], gamma=0.1)

# Train
print("Training Large ResNet-101 with bottleneck layers")
# CUDA kernels run asynchronously; synchronize so the timer brackets only
# the training work and includes every kernel it launched.
if device.type == "cuda":
    torch.cuda.synchronize()
start = time.perf_counter()
train(net, criterion, trainloader, optimizer, scheduler, epochs)
if device.type == "cuda":
    torch.cuda.synchronize()
stop = time.perf_counter()

# Evaluate
evaluate(net, testloader)

print(f'Total training time: {(stop-start)/60} minutes')

Training Large ResNet-50 with bottleneck layers

Epoch: 0
Average loss: 5.0566258096694945
Average loss: 4.737003808021545
Average loss: 4.681090292930603
Average loss: 4.638065795898438
Average loss: 4.579130535125732
Average loss: 4.526783618927002
Average loss: 4.451810903549195
Accuracy of the network on the test images: 4 %
Total training time: 74.4431343289998s


In [None]:
lr = 0.0001
epochs = 30

# Size the classifier head for the dataset (CIFAR-100) rather than
# torchvision's default of 1000 output classes.
net = torchvision.models.vgg16(num_classes=len(trainset.classes))
net = net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)
# Drop the learning rate by 10x after epoch 20.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20], gamma=0.1)

# Train
print("Training VGG-16")
# CUDA kernels run asynchronously; synchronize so the timer brackets only
# the training work and includes every kernel it launched.
if device.type == "cuda":
    torch.cuda.synchronize()
start = time.perf_counter()
train(net, criterion, trainloader, optimizer, scheduler, epochs)
if device.type == "cuda":
    torch.cuda.synchronize()
stop = time.perf_counter()

# Evaluate
evaluate(net, testloader)

print(f'Total training time: {(stop-start)/60} minutes')