In [1]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:13<00:00, 12.9MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
pretrained_model = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 224MB/s]


In [19]:
print(pretrained_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [20]:
model = ResNet(resnet18_config, 10)

In [21]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [24]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [25]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

# Introducing the Melon

In [26]:
class LifetimeAwareMemoryPool:
    def __init__(self, memory_budget):
        self.memory_budget = memory_budget
        self.allocated_memory = 0
        self.memory_blocks = []  # (start_addr, size, tensor_id, lifetime)
        self.tensor_map = {}     # tensor_id -> memory_block_index

    def allocate(self, tensor_id, size, lifetime):
        """텐서의 수명을 고려한 메모리 할당"""
        # 이미 할당된 텐서인 경우
        if tensor_id in self.tensor_map:
            return self.tensor_map[tensor_id]

        # 가용 메모리 공간 찾기
        best_addr = self._find_best_fit(size, lifetime)

        # 메모리 부족한 경우
        if best_addr is None:
            self._compact()  # 메모리 조각 모음
            best_addr = self._find_best_fit(size, lifetime)
            if best_addr is None:
                raise MemoryError("Not enough memory")

        # 새로운 메모리 블록 할당
        block_index = len(self.memory_blocks)
        self.memory_blocks.append((best_addr, size, tensor_id, lifetime))
        self.tensor_map[tensor_id] = block_index
        self.allocated_memory += size

        return best_addr

    def free(self, tensor_id):
        """텐서 메모리 해제"""
        if tensor_id in self.tensor_map:
            block_index = self.tensor_map[tensor_id]
            _, size, _, _ = self.memory_blocks[block_index]
            self.allocated_memory -= size
            del self.tensor_map[tensor_id]
            self.memory_blocks[block_index] = None

    def _find_best_fit(self, size, lifetime):
        """최적의 메모리 위치 찾기"""
        if self.allocated_memory + size > self.memory_budget:
            return None

        # 긴 수명의 텐서는 낮은 주소에 할당
        available_addr = 0
        for block in self.memory_blocks:
            if block is None:
                continue
            block_addr, block_size, _, block_lifetime = block

            # 수명이 겹치지 않는 경우 해당 공간 재사용 가능
            if not self._lifetimes_overlap(lifetime, block_lifetime):
                if block_addr - available_addr >= size:
                    return available_addr
            available_addr = max(available_addr, block_addr + block_size)

        # 새로운 메모리 영역 할당
        if self.allocated_memory + size <= self.memory_budget:
            return available_addr

        return None

    def _compact(self):
        """메모리 조각 모음"""
        # 유효한 블록만 필터링
        valid_blocks = [b for b in self.memory_blocks if b is not None]

        # 수명 기준으로 정렬
        valid_blocks.sort(key=lambda x: x[3])

        # 메모리 재할당
        self.memory_blocks = []
        self.tensor_map.clear()
        self.allocated_memory = 0

        current_addr = 0
        for _, size, tensor_id, lifetime in valid_blocks:
            self.memory_blocks.append((current_addr, size, tensor_id, lifetime))
            self.tensor_map[tensor_id] = len(self.memory_blocks) - 1
            self.allocated_memory += size
            current_addr += size

    def _lifetimes_overlap(self, lifetime1, lifetime2):
        """두 텐서의 수명이 겹치는지 확인"""
        start1, end1 = lifetime1
        start2, end2 = lifetime2
        return not (end1 <= start2 or end2 <= start1)

In [27]:
selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

In [28]:
class MelonTrainer:
    def __init__(self, model, criterion, optimizer, device, memory_budget):
        self.model = model.to(device)
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.memory_budget = memory_budget
        self.has_bn = self._check_has_bn()
        self.memory_pool = self._initialize_memory_pool()

    def _check_has_bn(self):
        """BatchNorm 레이어 존재 여부 확인"""
        for module in self.model.modules():
            if isinstance(module, nn.BatchNorm2d):
                return True
        return False

    def _initialize_memory_pool(self):
        """Lifetime-aware memory pool 초기화"""
        return LifetimeAwareMemoryPool(self.memory_budget)

    def train(self, train_loader):
        start_time = time.monotonic()
        self.model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            profile_memory=True,  # 메모리 사용량 추적
            record_shapes=True  # 텐서 크기 기록
        ) as prof:

            for inputs, labels in tqdm(train_loader, desc="Training"):
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                if self.has_bn:
                  # BatchNorm이 있는 경우 recomputation 사용
                  loss, acc = self._train_step_with_recomputation(inputs, labels)
                else:
                  # BatchNorm이 없는 경우 micro-batch 사용
                  loss, acc = self._train_step_with_microbatch(inputs, labels)

                running_loss += loss
                correct += acc[0]
                total += acc[1]

        end_time = time.monotonic()

        filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
        extracted_data = []

        for avg in filtered_averages:
          avg_str = str(avg)
          match = pattern.search(avg_str)
          if match:
            extracted_data.append(match.groupdict())
        df = pd.DataFrame(extracted_data)
        print(df)
        free_memory, total_memory = torch.cuda.mem_get_info()
        print(f"Free memory: {free_memory / 1024**2:.2f} MB")
        print(f"Total memory: {total_memory / 1024**2:.2f} MB")
        epoch_loss = running_loss / len(train_loader)
        accuracy = 100 * correct / total

        return epoch_loss, accuracy, start_time, end_time

    def _train_step_with_recomputation(self, inputs, labels):
        """Recomputation을 사용한 학습 스텝"""
        self.optimizer.zero_grad()

        # Forward pass with checkpoints
        with torch.no_grad():
            intermediate_outputs = []
            x = inputs

            # ResNet의 각 레이어별로 순차적으로 처리
            x = self.model.conv1(x)
            x = self.model.bn1(x)
            x = self.model.relu(x)
            x = self.model.maxpool(x)

            # layer1-4 처리
            for layer_name in ['layer1', 'layer2', 'layer3', 'layer4']:
                layer = getattr(self.model, layer_name)
                x = layer(x)
                if self.has_bn:
                    intermediate_outputs.append(x.detach())

        # Recomputation and backward
        with record_function("forward_pass"):
            outputs = self.model(inputs)
            if isinstance(outputs, tuple):
                outputs = outputs[0]
        with record_function("loss_computation"):
            loss = self.criterion(outputs, labels)

        with record_function("backward_pass"):
            loss.backward()
        with record_function("optimizer_step"):
            self.optimizer.step()

        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()
        total = labels.size(0)

        return loss.item(), (correct, total)

    def _train_step_with_microbatch(self, inputs, labels):
        """Micro-batch를 사용한 학습 스텝"""
        batch_size = inputs.size(0)
        micro_batch_size = self._calculate_micro_batch_size(inputs.size())

        self.optimizer.zero_grad()
        total_loss = 0
        correct = 0

        for i in range(0, batch_size, micro_batch_size):
            micro_inputs = inputs[i:i+micro_batch_size]
            micro_labels = labels[i:i+micro_batch_size]

            with record_function("forward_pass"):
                outputs = self.model(micro_inputs)

            with record_function("loss_computation"):
                loss = self.criterion(outputs[0], micro_labels)
                scaled_loss = loss * (micro_batch_size / batch_size)

            with record_function("backward_pass"):
                scaled_loss.backward()

            total_loss += loss.item()
            _, predicted = torch.max(outputs[0].data, 1)
            correct += (predicted == micro_labels).sum().item()

        with record_function("optimizer_step"):
            self.optimizer.step()

        return total_loss, (correct, batch_size)

    def _calculate_micro_batch_size(self, input_size):
        """메모리 예산에 따른 micro-batch 크기 계산"""
        tensor_size = input_size[1] * input_size[2] * input_size[3] * 4
        return max(1, min(input_size[0], self.memory_budget // tensor_size))

In [29]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [30]:
torch.cuda.empty_cache()

In [31]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [32]:
trainer = MelonTrainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    memory_budget=4096
)

EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0

for epoch in range(EPOCHS):
    train_loss, train_acc, start_time, end_time = trainer.train(train_iterator)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

Training: 100%|██████████| 313/313 [00:31<00:00,  9.99it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     657.853ms    8.380ms        0.000us    4.914ms   
1      forward_pass       0.000us    0.000us         2.559s    8.177ms   
2  loss_computation      25.877ms  477.618us        0.000us    5.570us   
3  loss_computation       0.000us    0.000us       47.677ms  152.322us   
4     backward_pass        3.581s   11.541ms        0.000us    1.766us   
5     backward_pass       0.000us    0.000us      552.896us    1.766us   
6    optimizer_step      15.736ms    3.143ms        0.000us  678.317us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216566887424>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201847934976>  
5                             0                0>  
6                           248         918

Validation: 100%|██████████| 94/94 [00:04<00:00, 20.35it/s]


Epoch: 01 | Epoch Train Time: 1m 7s
	Train Loss: 1.982 | Train Acc: 27.14%
	 Val. Loss: 1.894 |  Val. Acc: 28.87%


Training: 100%|██████████| 313/313 [00:29<00:00, 10.73it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     647.334ms    7.876ms        0.000us    4.912ms   
1      forward_pass       0.000us    0.000us         2.406s    7.688ms   
2  loss_computation      24.663ms  189.568us        0.000us    5.570us   
3  loss_computation       0.000us    0.000us       18.384ms   58.734us   
4     backward_pass        3.051s    9.798ms        0.000us    1.768us   
5     backward_pass       0.000us    0.000us      553.381us    1.768us   
6    optimizer_step      15.407ms    2.597ms        0.000us  677.482us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 17.07it/s]


Epoch: 02 | Epoch Train Time: 1m 4s
	Train Loss: 1.782 | Train Acc: 34.05%
	 Val. Loss: 1.629 |  Val. Acc: 39.63%


Training: 100%|██████████| 313/313 [00:27<00:00, 11.22it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     633.446ms    7.757ms        0.000us    4.913ms   
1      forward_pass       0.000us    0.000us         2.374s    7.584ms   
2  loss_computation      22.319ms  177.616us        0.000us    5.566us   
3  loss_computation       0.000us    0.000us       17.771ms   56.777us   
4     backward_pass        2.922s    9.382ms        0.000us    1.768us   
5     backward_pass       0.000us    0.000us      553.474us    1.768us   
6    optimizer_step      14.736ms    2.536ms        0.000us  675.993us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 17.61it/s]


Epoch: 03 | Epoch Train Time: 1m 2s
	Train Loss: 1.692 | Train Acc: 37.90%
	 Val. Loss: 1.557 |  Val. Acc: 42.00%


Training: 100%|██████████| 313/313 [00:28<00:00, 11.09it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     639.099ms    7.806ms        0.000us    4.912ms   
1      forward_pass       0.000us    0.000us         2.389s    7.632ms   
2  loss_computation      22.935ms  180.699us        0.000us    5.566us   
3  loss_computation       0.000us    0.000us       17.929ms   57.280us   
4     backward_pass        2.926s    9.394ms        0.000us    1.767us   
5     backward_pass       0.000us    0.000us      553.147us    1.767us   
6    optimizer_step      14.794ms    2.563ms        0.000us  675.671us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 16.36it/s]


Epoch: 04 | Epoch Train Time: 1m 2s
	Train Loss: 1.601 | Train Acc: 41.67%
	 Val. Loss: 1.441 |  Val. Acc: 47.83%


Training: 100%|██████████| 313/313 [00:28<00:00, 10.89it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     648.517ms    7.894ms        0.000us    4.912ms   
1      forward_pass       0.000us    0.000us         2.415s    7.717ms   
2  loss_computation      22.827ms  181.822us        0.000us    5.573us   
3  loss_computation       0.000us    0.000us       18.108ms   57.854us   
4     backward_pass        2.998s    9.627ms        0.000us    1.767us   
5     backward_pass       0.000us    0.000us      552.986us    1.767us   
6    optimizer_step      15.061ms    2.595ms        0.000us  675.518us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.66it/s]


Epoch: 05 | Epoch Train Time: 1m 3s
	Train Loss: 1.496 | Train Acc: 46.12%
	 Val. Loss: 1.342 |  Val. Acc: 51.03%


Training: 100%|██████████| 313/313 [00:29<00:00, 10.53it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     649.083ms    7.920ms        0.000us    4.913ms   
1      forward_pass       0.000us    0.000us         2.424s    7.746ms   
2  loss_computation      22.575ms  179.835us        0.000us    5.564us   
3  loss_computation       0.000us    0.000us       17.952ms   57.354us   
4     backward_pass        2.968s    9.531ms        0.000us    1.766us   
5     backward_pass       0.000us    0.000us      552.870us    1.766us   
6    optimizer_step      15.674ms    2.631ms        0.000us  675.981us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.25it/s]


Epoch: 06 | Epoch Train Time: 1m 4s
	Train Loss: 1.370 | Train Acc: 51.24%
	 Val. Loss: 1.222 |  Val. Acc: 54.43%


Training: 100%|██████████| 313/313 [00:31<00:00, 10.00it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     654.866ms    7.968ms        0.000us    4.913ms   
1      forward_pass       0.000us    0.000us         2.437s    7.785ms   
2  loss_computation      23.650ms  185.685us        0.000us    5.572us   
3  loss_computation       0.000us    0.000us       18.227ms   58.233us   
4     backward_pass        3.080s    9.891ms        0.000us    1.768us   
5     backward_pass       0.000us    0.000us      553.474us    1.768us   
6    optimizer_step      15.530ms    2.683ms        0.000us  675.784us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.75it/s]


Epoch: 07 | Epoch Train Time: 1m 5s
	Train Loss: 1.251 | Train Acc: 55.55%
	 Val. Loss: 1.058 |  Val. Acc: 62.17%


Training: 100%|██████████| 313/313 [00:31<00:00,  9.83it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     656.281ms    8.017ms        0.000us    4.913ms   
1      forward_pass       0.000us    0.000us         2.454s    7.840ms   
2  loss_computation      23.496ms  185.212us        0.000us    5.568us   
3  loss_computation       0.000us    0.000us       18.375ms   58.708us   
4     backward_pass        2.983s    9.579ms        0.000us    1.768us   
5     backward_pass       0.000us    0.000us      553.496us    1.768us   
6    optimizer_step      15.698ms    2.700ms        0.000us  676.098us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.17it/s]


Epoch: 08 | Epoch Train Time: 1m 6s
	Train Loss: 1.166 | Train Acc: 58.03%
	 Val. Loss: 0.994 |  Val. Acc: 64.80%


Training: 100%|██████████| 313/313 [00:31<00:00,  9.95it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     667.191ms    8.067ms        0.000us    4.913ms   
1      forward_pass       0.000us    0.000us         2.466s    7.877ms   
2  loss_computation      24.641ms  190.941us        0.000us    5.570us   
3  loss_computation       0.000us    0.000us       18.700ms   59.744us   
4     backward_pass        3.061s    9.831ms        0.000us    1.769us   
5     backward_pass       0.000us    0.000us      553.595us    1.769us   
6    optimizer_step      16.230ms    2.744ms        0.000us  675.789us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.70it/s]


Epoch: 09 | Epoch Train Time: 1m 6s
	Train Loss: 1.079 | Train Acc: 61.09%
	 Val. Loss: 1.034 |  Val. Acc: 63.37%


Training: 100%|██████████| 313/313 [00:32<00:00,  9.58it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     676.010ms    8.272ms        0.000us    4.912ms   
1      forward_pass       0.000us    0.000us         2.529s    8.079ms   
2  loss_computation      24.981ms  193.219us        0.000us    5.563us   
3  loss_computation       0.000us    0.000us       18.738ms   59.864us   
4     backward_pass        3.113s    9.998ms        0.000us    1.767us   
5     backward_pass       0.000us    0.000us      553.092us    1.767us   
6    optimizer_step      16.603ms    2.823ms        0.000us  675.856us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216552928256>  
1                             0                0>  
2                             0           800768>  
3                             0                0>  
4                             0    -201856978944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 15.82it/s]

Epoch: 10 | Epoch Train Time: 1m 8s
	Train Loss: 1.000 | Train Acc: 64.56%
	 Val. Loss: 0.875 |  Val. Acc: 68.27%
Train finished





In [33]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 38510.81 MB
Total memory: 40513.81 MB


In [34]:
print("ResNet18")
print(f'Total Training Time: {int(total_time/60)}m {int(total_time%60)}s')

ResNet18
Total Training Time: 10m 52s


In [35]:
torch.save(model.state_dict(), 'trained_model.pth')

In [36]:
import os
model_file_size = os.path.getsize('trained_model.pth')  # 바이트 단위
model_file_size_MB = model_file_size / (1024 ** 2)  # MB로 변환
print(f"Saved model file size: {model_file_size_MB:.2f} MB")

Saved model file size: 42.73 MB


In [37]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} - Size: {param.size()} - Number of elements: {param.numel()}")

conv1.weight - Size: torch.Size([64, 3, 7, 7]) - Number of elements: 9408
bn1.weight - Size: torch.Size([64]) - Number of elements: 64
bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn2.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn2.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.1.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn2.weight - Size: torch.Size([64]) - Numbe

In [38]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 11181642


In [39]:
from torchsummary import summary

summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

In [41]:
from torch import profiler
dummy_input = torch.randn(32, 3, 224, 224).cuda()

# Profiling inference
with profiler.profile(
    activities=[
       profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,  # Include if using GPU
    ],
    on_trace_ready=profiler.tensorboard_trace_handler("./logs"),  # Optional logging
    record_shapes=True,
    with_stack=True
) as prof:
    with torch.no_grad():
        model(dummy_input)


# Print results
print(prof.key_averages().table(sort_by="cuda_time_total" if torch.cuda.is_available() else "cpu_time_total", row_limit=50))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         2.04%     145.925us        58.13%       4.152ms     207.589us       0.000us         0.00%       2.719ms     135.926us            20  
                                      aten::convolution         3.19%     228.119us        56.09%       4.006ms     200.293us       0.000us         0.00%       2.719ms     135.926us            20  
         