In [1]:
 !pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:02<00:00, 70.8MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        #x = self.layer1(x)
        x = checkpoint.checkpoint(self.layer1, x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
model = ResNet(resnet18_config, 10)

In [19]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [22]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [23]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

In [24]:
import torch.utils.checkpoint as checkpoint

def gradient_checkpoint(model, *inputs):

  return model(*inputs)


In [25]:
def train(model, train_loader, criterion, optimizer, device, micro_batch_size = 8):
    start_time = time.monotonic()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,  # 메모리 사용량 추적
        record_shapes=True  # 텐서 크기 기록
    ) as prof:
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # 기울기 초기화
            memory_usage = torch.cuda.memory_allocated(device) / (1024 ** 2)

            for i in range(0, len(inputs), micro_batch_size):
                if i != 0:
                    del micro_inputs, micro_labels
                    torch.cuda.empty_cache()
                micro_inputs = inputs[i:i+micro_batch_size]
                micro_labels = labels[i:i+micro_batch_size]

                with record_function("forward_pass"):  # Forward pass 프로파일링
                    outputs = model(micro_inputs)  # 모델 연산

                with record_function("loss_computation"):  # 손실 계산 프로파일링
                    loss = criterion(outputs[0], micro_labels)  # 손실 계산

                with record_function("backward_pass"):  # Backward pass 프로파일링
                    loss.backward()  # 역전파

                with record_function("optimizer_step"):
                    optimizer.step()

                running_loss += loss.item()
                _, predicted = torch.max(outputs[0], 1)
                total += micro_labels.size(0)
                correct += (predicted == micro_labels).sum().item()
    end_time = time.monotonic()
    selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

    # key_averages()로부터 얻은 평균값을 필터링
    filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
    extracted_data = []

    for avg in filtered_averages:
      avg_str = str(avg)
      match = pattern.search(avg_str)
      if match:
        extracted_data.append(match.groupdict())
    df = pd.DataFrame(extracted_data)
    print(df)
    free_memory, total_memory = torch.cuda.mem_get_info()
    print(f"Free memory: {free_memory / 1024**2:.2f} MB")
    print(f"Total memory: {total_memory / 1024**2:.2f} MB")
    # 훈련 후 평균 손실과 정확도 계산
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total

    return epoch_loss, accuracy, start_time, end_time

In [26]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [27]:
torch.cuda.empty_cache()

In [28]:
memory_usage = torch.cuda.memory_allocated(device) / (1024 ** 2)
print(memory_usage)

42.70654296875


In [29]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [30]:
EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0
for epoch in range(EPOCHS):

    train_loss, train_acc, start_time, end_time = train(model, train_iterator, criterion, optimizer, device)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    #if valid_loss < best_valid_loss:
        #best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'vgg19-model.pt')

    # end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

  return fn(*args, **kwargs)
Training: 100%|██████████| 313/313 [00:57<00:00,  5.48it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.353s   10.038ms        0.000us    1.834ms   
1      forward_pass       0.000us    0.000us        11.509s    9.199ms   
2  loss_computation     102.869ms  255.123us        0.000us    5.562us   
3  loss_computation       0.000us    0.000us      100.985ms   80.788us   
4     backward_pass       18.109s   14.547ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us        2.185ms    1.748us   
6    optimizer_step      58.581ms    2.670ms        0.000us  686.032us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164948419072>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150266270208>  
5                             0                0>  
6                           248         941

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:04<00:00, 18.87it/s]


Epoch: 01 | Epoch Train Time: 2m 14s
	Train Loss: 8.658 | Train Acc: 20.56%
	 Val. Loss: 1.939 |  Val. Acc: 27.70%


Training: 100%|██████████| 313/313 [00:55<00:00,  5.61it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.452s    9.621ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.486s    9.189ms   
2  loss_computation     109.839ms  197.347us        0.000us    5.597us   
3  loss_computation       0.000us    0.000us       71.862ms   57.490us   
4     backward_pass       17.575s   14.110ms        0.000us    1.746us   
5     backward_pass       0.000us    0.000us        2.183ms    1.746us   
6    optimizer_step      63.987ms    2.649ms        0.000us  680.875us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164953383424>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150279704576>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 15.15it/s]


Epoch: 02 | Epoch Train Time: 2m 11s
	Train Loss: 7.656 | Train Acc: 29.08%
	 Val. Loss: 1.737 |  Val. Acc: 34.67%


Training: 100%|██████████| 313/313 [00:54<00:00,  5.77it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.315s    9.204ms        0.000us    1.831ms   
1      forward_pass       0.000us    0.000us        11.018s    8.815ms   
2  loss_computation      95.462ms  174.486us        0.000us    5.535us   
3  loss_computation       0.000us    0.000us       64.778ms   51.823us   
4     backward_pass       16.547s   13.287ms        0.000us    1.747us   
5     backward_pass       0.000us    0.000us        2.184ms    1.747us   
6    optimizer_step      56.170ms    2.668ms        0.000us  674.860us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164971782656>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0         489

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 15.65it/s]


Epoch: 03 | Epoch Train Time: 2m 8s
	Train Loss: 7.221 | Train Acc: 33.89%
	 Val. Loss: 1.571 |  Val. Acc: 43.20%


Training: 100%|██████████| 313/313 [00:54<00:00,  5.73it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.347s    9.299ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.129s    8.903ms   
2  loss_computation      96.366ms  177.876us        0.000us    5.578us   
3  loss_computation       0.000us    0.000us       66.935ms   53.548us   
4     backward_pass       16.600s   13.329ms        0.000us    1.745us   
5     backward_pass       0.000us    0.000us        2.182ms    1.745us   
6    optimizer_step      60.521ms    2.521ms        0.000us  675.341us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164944667136>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0         979

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 13.49it/s]


Epoch: 04 | Epoch Train Time: 2m 9s
	Train Loss: 6.758 | Train Acc: 38.75%
	 Val. Loss: 1.523 |  Val. Acc: 44.37%


Training: 100%|██████████| 313/313 [01:00<00:00,  5.14it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.410s    9.585ms        0.000us    1.831ms   
1      forward_pass       0.000us    0.000us        11.469s    9.175ms   
2  loss_computation      99.085ms  183.918us        0.000us    5.595us   
3  loss_computation       0.000us    0.000us       69.409ms   55.527us   
4     backward_pass       16.802s   13.492ms        0.000us    1.745us   
5     backward_pass       0.000us    0.000us        2.182ms    1.745us   
6    optimizer_step      57.852ms    2.882ms        0.000us  675.608us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164949795328>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280130560>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 15.05it/s]


Epoch: 05 | Epoch Train Time: 2m 18s
	Train Loss: 6.398 | Train Acc: 42.10%
	 Val. Loss: 1.485 |  Val. Acc: 46.23%


Training: 100%|██████████| 313/313 [00:56<00:00,  5.59it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.338s    9.353ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.191s    8.953ms   
2  loss_computation      97.036ms  180.279us        0.000us    5.545us   
3  loss_computation       0.000us    0.000us       67.964ms   54.371us   
4     backward_pass       16.714s   13.419ms        0.000us    1.746us   
5     backward_pass       0.000us    0.000us        2.183ms    1.746us   
6    optimizer_step      62.202ms    2.763ms        0.000us  675.685us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164953809408>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0         975

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:07<00:00, 12.20it/s]


Epoch: 06 | Epoch Train Time: 2m 10s
	Train Loss: 6.021 | Train Acc: 46.09%
	 Val. Loss: 1.428 |  Val. Acc: 48.87%


Training: 100%|██████████| 313/313 [01:03<00:00,  4.91it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.410s    9.596ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.479s    9.183ms   
2  loss_computation      98.807ms  183.953us        0.000us    5.596us   
3  loss_computation       0.000us    0.000us       68.950ms   55.160us   
4     backward_pass       16.922s   13.587ms        0.000us    1.747us   
5     backward_pass       0.000us    0.000us        2.183ms    1.747us   
6    optimizer_step      58.987ms    2.870ms        0.000us  675.673us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164968571392>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0         485

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:07<00:00, 13.28it/s]


Epoch: 07 | Epoch Train Time: 2m 21s
	Train Loss: 5.701 | Train Acc: 48.44%
	 Val. Loss: 1.283 |  Val. Acc: 53.03%


Training: 100%|██████████| 313/313 [01:02<00:00,  5.01it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.400s    9.548ms        0.000us    1.831ms   
1      forward_pass       0.000us    0.000us        11.419s    9.135ms   
2  loss_computation      99.430ms  184.921us        0.000us    5.598us   
3  loss_computation       0.000us    0.000us       69.791ms   55.833us   
4     backward_pass       16.951s   13.610ms        0.000us    1.747us   
5     backward_pass       0.000us    0.000us        2.184ms    1.747us   
6    optimizer_step      59.607ms    2.875ms        0.000us  675.806us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164970979840>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:08<00:00, 10.81it/s]


Epoch: 08 | Epoch Train Time: 2m 23s
	Train Loss: 5.362 | Train Acc: 52.13%
	 Val. Loss: 1.245 |  Val. Acc: 53.93%


Training: 100%|██████████| 313/313 [01:06<00:00,  4.73it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.407s    9.546ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.420s    9.136ms   
2  loss_computation      98.942ms  182.631us        0.000us    5.572us   
3  loss_computation       0.000us    0.000us       68.220ms   54.576us   
4     backward_pass       16.770s   13.465ms        0.000us    1.747us   
5     backward_pass       0.000us    0.000us        2.184ms    1.747us   
6    optimizer_step      58.370ms    2.853ms        0.000us  675.601us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164953809408>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280130048>  
5                             0                0>  
6                             0         485

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:07<00:00, 12.85it/s]


Epoch: 09 | Epoch Train Time: 2m 22s
	Train Loss: 5.060 | Train Acc: 54.87%
	 Val. Loss: 1.092 |  Val. Acc: 61.67%


Training: 100%|██████████| 313/313 [01:03<00:00,  4.95it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass        2.428s    9.682ms        0.000us    1.832ms   
1      forward_pass       0.000us    0.000us        11.580s    9.264ms   
2  loss_computation     100.255ms  188.272us        0.000us    5.557us   
3  loss_computation       0.000us    0.000us       71.378ms   57.102us   
4     backward_pass       16.904s   13.572ms        0.000us    1.746us   
5     backward_pass       0.000us    0.000us        2.182ms    1.746us   
6    optimizer_step      58.511ms    3.108ms        0.000us  675.392us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       6340000     164947386880>  
1                             0                0>  
2                             0          1280512>  
3                             0                0>  
4                             0    -150280131072>  
5                             0                0>  
6                             0         485

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:09<00:00,  9.94it/s]

Epoch: 10 | Epoch Train Time: 2m 24s
	Train Loss: 4.804 | Train Acc: 56.68%
	 Val. Loss: 1.022 |  Val. Acc: 62.47%
Train finished





In [32]:
memory_usage = torch.cuda.memory_allocated(device) / (1024 ** 2)
print(memory_usage)

211.7109375


In [31]:
from torch import profiler

dummy_input = torch.randn(32, 3, 224, 224).cuda()

# Profiling inference
with profiler.profile(
    activities=[
       profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,  # Include if using GPU
    ],
    on_trace_ready=profiler.tensorboard_trace_handler("./logs"),  # Optional logging
    record_shapes=True,
    with_stack=True
) as prof:
    with torch.no_grad():
        model(dummy_input)


# Print results
print(prof.key_averages().table(sort_by="cuda_time_total" if torch.cuda.is_available() else "cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         1.52%     154.564us        48.25%       4.902ms     245.084us       0.000us         0.00%       2.695ms     134.742us            20  
                                      aten::convolution         2.07%     210.376us        46.73%       4.747ms     237.356us       0.000us         0.00%       2.695ms     134.742us            20  
         