In [1]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:02<00:00, 69.9MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
pretrained_model = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 161MB/s]


In [19]:
print(pretrained_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [20]:
model = ResNet(resnet18_config, 10)

In [21]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [25]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [26]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

In [27]:
def train(model, train_loader, criterion, optimizer, device, k):
    start_time = time.monotonic()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,  # 메모리 사용량 추적
        record_shapes=True  # 텐서 크기 기록
    ) as prof:
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # 기울기 초기화

            with record_function("forward_pass"):  # Forward pass 프로파일링
                outputs = model(inputs)  # 모델 연산

            with record_function("loss_computation"):  # 손실 계산 프로파일링
                loss = criterion(outputs[0], labels)  # 손실 계산

            with record_function("backward_pass"):  # Backward pass 프로파일링
                loss.backward()  # 역전파

            for i, param in enumerate(model.parameters()):
                if i == len(list(model.parameters())) - 1:
                    break
                if param.grad is not None:
                # 기울기의 절댓값을 기준으로 상위 k개 기울기 추적
                    grad_values = param.grad.abs().view(-1)
                    #print(len(grad_values))
                    topk_values, _ = grad_values.topk(k, largest=True)

                    threshold = topk_values[-1]

                    # 임계값 이상이면 해당 기울기로 업데이트, 아니면 랜덤 값으로 대체
                    mask = param.grad.abs() >= threshold

                    random_grad = torch.randn_like(param.grad)
                    updated_grad = torch.where(mask, param.grad, random_grad)

                    param.grad = updated_grad.clone().detach()

            with record_function("optimizer_step"):
                optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    end_time = time.monotonic()
    selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

    # key_averages()로부터 얻은 평균값을 필터링
    filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
    extracted_data = []

    for avg in filtered_averages:
      avg_str = str(avg)
      match = pattern.search(avg_str)
      if match:
        extracted_data.append(match.groupdict())
    df = pd.DataFrame(extracted_data)
    print(df)
    # 훈련 후 평균 손실과 정확도 계산
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total

    return epoch_loss, accuracy, start_time, end_time

In [28]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [29]:
torch.cuda.empty_cache()

In [30]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [31]:
EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0
for epoch in range(EPOCHS + 1):

    train_loss, train_acc, start_time, end_time = train(model, train_iterator, criterion, optimizer, device, 10)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    #if valid_loss < best_valid_loss:
        #best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'vgg19-model.pt')

    # end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

Training: 100%|██████████| 313/313 [00:39<00:00,  7.87it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     684.961ms   11.614ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.993s    9.531ms   
2  loss_computation      31.058ms  475.110us        0.000us    5.407us   
3  loss_computation       0.000us    0.000us       46.356ms  148.103us   
4     backward_pass        3.588s   11.565ms        0.000us    1.726us   
5     backward_pass       0.000us    0.000us      540.257us    1.726us   
6    optimizer_step      20.011ms    3.184ms        0.000us  683.682us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217042920448>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202807742464>  
5                             0                0>  
6                           248         920

Validation: 100%|██████████| 94/94 [00:04<00:00, 19.72it/s]


Epoch: 01 | Epoch Train Time: 1m 29s
	Train Loss: 2.362 | Train Acc: 10.57%
	 Val. Loss: 2.441 |  Val. Acc: 10.17%


Training: 100%|██████████| 313/313 [00:37<00:00,  8.29it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     688.847ms    8.393ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.535s    8.098ms   
2  loss_computation      28.829ms  203.754us        0.000us    5.429us   
3  loss_computation       0.000us    0.000us       18.590ms   59.393us   
4     backward_pass        3.100s    9.955ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.537us    1.727us   
6    optimizer_step      19.693ms    2.712ms        0.000us  682.141us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217037349888>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 16.19it/s]


Epoch: 02 | Epoch Train Time: 1m 26s
	Train Loss: 2.385 | Train Acc: 8.74%
	 Val. Loss: 2.472 |  Val. Acc: 7.87%


Training: 100%|██████████| 313/313 [00:37<00:00,  8.34it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     688.693ms    8.382ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.532s    8.089ms   
2  loss_computation      29.274ms  205.427us        0.000us    5.426us   
3  loss_computation       0.000us    0.000us       18.574ms   59.340us   
4     backward_pass        3.009s    9.664ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.607us    1.727us   
6    optimizer_step      19.947ms    2.710ms        0.000us  682.346us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217037349888>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:05<00:00, 17.58it/s]


Epoch: 03 | Epoch Train Time: 1m 26s
	Train Loss: 2.423 | Train Acc: 9.29%
	 Val. Loss: 2.451 |  Val. Acc: 9.27%


Training: 100%|██████████| 313/313 [00:37<00:00,  8.31it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     680.624ms    8.282ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.504s    8.000ms   
2  loss_computation      27.712ms  197.524us        0.000us    5.432us   
3  loss_computation       0.000us    0.000us       18.114ms   57.871us   
4     backward_pass        2.992s    9.607ms        0.000us    1.729us   
5     backward_pass       0.000us    0.000us      541.240us    1.729us   
6    optimizer_step      19.029ms    2.687ms        0.000us  682.348us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217030927360>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.65it/s]


Epoch: 04 | Epoch Train Time: 1m 25s
	Train Loss: 2.447 | Train Acc: 9.17%
	 Val. Loss: 2.481 |  Val. Acc: 8.43%


Training: 100%|██████████| 313/313 [00:38<00:00,  8.08it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     697.304ms    8.406ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.539s    8.112ms   
2  loss_computation      28.535ms  202.665us        0.000us    5.426us   
3  loss_computation       0.000us    0.000us       18.562ms   59.304us   
4     backward_pass        3.012s    9.674ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.480us    1.727us   
6    optimizer_step      20.139ms    2.723ms        0.000us  682.170us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217024504832>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0         447

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.92it/s]


Epoch: 05 | Epoch Train Time: 1m 26s
	Train Loss: 2.421 | Train Acc: 10.35%
	 Val. Loss: 2.411 |  Val. Acc: 10.43%


Training: 100%|██████████| 313/313 [00:40<00:00,  7.80it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     701.081ms    8.405ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.542s    8.123ms   
2  loss_computation      28.177ms  199.598us        0.000us    5.426us   
3  loss_computation       0.000us    0.000us       18.299ms   58.464us   
4     backward_pass        3.017s    9.686ms        0.000us    1.729us   
5     backward_pass       0.000us    0.000us      541.044us    1.729us   
6    optimizer_step      18.907ms    2.674ms        0.000us  682.023us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217034138624>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.74it/s]


Epoch: 06 | Epoch Train Time: 1m 27s
	Train Loss: 2.410 | Train Acc: 11.03%
	 Val. Loss: 2.404 |  Val. Acc: 11.43%


Training: 100%|██████████| 313/313 [00:40<00:00,  7.70it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     696.246ms    8.464ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.559s    8.176ms   
2  loss_computation      28.368ms  200.757us        0.000us    5.429us   
3  loss_computation       0.000us    0.000us       18.316ms   58.517us   
4     backward_pass        3.016s    9.685ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.538us    1.727us   
6    optimizer_step      19.204ms    2.667ms        0.000us  682.277us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217024504832>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.93it/s]


Epoch: 07 | Epoch Train Time: 1m 27s
	Train Loss: 2.420 | Train Acc: 10.55%
	 Val. Loss: 2.454 |  Val. Acc: 8.80%


Training: 100%|██████████| 313/313 [00:40<00:00,  7.74it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     707.301ms    8.528ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.577s    8.233ms   
2  loss_computation      29.311ms  207.166us        0.000us    5.435us   
3  loss_computation       0.000us    0.000us       18.838ms   60.184us   
4     backward_pass        3.052s    9.801ms        0.000us    1.728us   
5     backward_pass       0.000us    0.000us      540.845us    1.728us   
6    optimizer_step      19.156ms    2.759ms        0.000us  682.234us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217037349888>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 13.99it/s]


Epoch: 08 | Epoch Train Time: 1m 27s
	Train Loss: 2.410 | Train Acc: 9.60%
	 Val. Loss: 2.436 |  Val. Acc: 9.73%


Training: 100%|██████████| 313/313 [00:41<00:00,  7.55it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     703.618ms    8.495ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.566s    8.199ms   
2  loss_computation      28.512ms  204.560us        0.000us    5.429us   
3  loss_computation       0.000us    0.000us       18.847ms   60.214us   
4     backward_pass        3.029s    9.728ms        0.000us    1.729us   
5     backward_pass       0.000us    0.000us      541.178us    1.729us   
6    optimizer_step      20.026ms    2.796ms        0.000us  682.077us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217024504832>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 13.63it/s]


Epoch: 09 | Epoch Train Time: 1m 28s
	Train Loss: 2.393 | Train Acc: 10.29%
	 Val. Loss: 2.370 |  Val. Acc: 9.40%


Training: 100%|██████████| 313/313 [00:40<00:00,  7.69it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     708.027ms    8.509ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.572s    8.217ms   
2  loss_computation      28.227ms  201.292us        0.000us    5.426us   
3  loss_computation       0.000us    0.000us       18.566ms   59.318us   
4     backward_pass        3.076s    9.879ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.519us    1.727us   
6    optimizer_step      19.758ms    2.802ms        0.000us  682.208us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217024504832>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:07<00:00, 12.81it/s]


Epoch: 10 | Epoch Train Time: 1m 28s
	Train Loss: 2.381 | Train Acc: 10.88%
	 Val. Loss: 2.406 |  Val. Acc: 9.87%


Training: 100%|██████████| 313/313 [00:46<00:00,  6.79it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     705.309ms    8.625ms        0.000us    4.887ms   
1      forward_pass       0.000us    0.000us         2.607s    8.329ms   
2  loss_computation      28.660ms  204.231us        0.000us    5.427us   
3  loss_computation       0.000us    0.000us       18.527ms   59.193us   
4     backward_pass        3.041s    9.765ms        0.000us    1.727us   
5     backward_pass       0.000us    0.000us      540.695us    1.727us   
6    optimizer_step      19.255ms    2.854ms        0.000us  682.127us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     217037349888>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -202819538944>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.11it/s]

Epoch: 11 | Epoch Train Time: 1m 37s
	Train Loss: 2.418 | Train Acc: 10.40%
	 Val. Loss: 2.521 |  Val. Acc: 10.07%
Train finished





In [32]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 38532.81 MB
Total memory: 40513.81 MB


In [33]:
print("ResNet18")
print(f'Total Training Time: {int(total_time/60)}m {int(total_time%60)}s')

ResNet18
Total Training Time: 16m 11s


In [34]:
torch.save(model.state_dict(), 'trained_model.pth')

In [35]:
import os
model_file_size = os.path.getsize('trained_model.pth')  # 바이트 단위
model_file_size_MB = model_file_size / (1024 ** 2)  # MB로 변환
print(f"Saved model file size: {model_file_size_MB:.2f} MB")

Saved model file size: 42.73 MB


In [36]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} - Size: {param.size()} - Number of elements: {param.numel()}")

conv1.weight - Size: torch.Size([64, 3, 7, 7]) - Number of elements: 9408
bn1.weight - Size: torch.Size([64]) - Number of elements: 64
bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn2.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn2.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.1.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn2.weight - Size: torch.Size([64]) - Numbe

In [37]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 11181642


In [38]:
from torchsummary import summary

summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,