In [1]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:01<00:00, 98.8MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
pretrained_model = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 151MB/s]


In [19]:
print(pretrained_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [20]:
model = ResNet(resnet18_config, 10)

In [21]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
optimizer = optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [24]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [25]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

In [26]:
def train(model, train_loader, criterion, optimizer, device):
    start_time = time.monotonic()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    alpha = optimizer.param_groups[0]['lr']
    # PyTorch Profiler 시작
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,  # 메모리 사용량 추적
        record_shapes=True  # 텐서 크기 기록
    ) as prof:
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)

            for param in model.parameters():
              if param.grad is not None:
                  param.grad.zero_()
            #optimizer.zero_grad()  # 기울기 초기화

            with record_function("forward_pass"):  # Forward pass 프로파일링
                outputs = model(inputs)  # 모델 연산

            with record_function("loss_computation"):  # 손실 계산 프로파일링
                loss = criterion(outputs[0], labels)  # 손실 계산

            with record_function("backward_pass"):  # Backward pass 프로파일링
                loss.backward()  # 역전파

            with record_function("optimizer_step"):  # 파라미터 업데이트 프로파일링
                #optimizer.step()  # 파라미터 업데이트
                with torch.no_grad():
                  for param in model.parameters():
                    if param.grad is not None:
                      param -= alpha * param.grad

            # 손실 및 정확도 누적
            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    end_time = time.monotonic()

    # 프로파일링 결과 요약 출력
    selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

    # key_averages()로부터 얻은 평균값을 필터링
    filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
    extracted_data = []

    for avg in filtered_averages:
      avg_str = str(avg)
      match = pattern.search(avg_str)
      if match:
        extracted_data.append(match.groupdict())
    df = pd.DataFrame(extracted_data)
    print(df)
    #print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
    # 훈련 후 평균 손실과 정확도 계산
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total

    return epoch_loss, accuracy, start_time, end_time

In [27]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [28]:
torch.cuda.empty_cache()

In [29]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [30]:
EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0
for epoch in range(EPOCHS + 1):

    train_loss, train_acc, start_time, end_time = train(model, train_iterator, criterion, optimizer, device)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    #if valid_loss < best_valid_loss:
        #best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'vgg19-model.pt')

    # end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

Training: 100%|██████████| 313/313 [00:31<00:00,  9.92it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     750.928ms   12.275ms        0.000us    4.904ms   
1      forward_pass       0.000us    0.000us         3.151s   10.035ms   
2  loss_computation      33.948ms  510.254us        0.000us    6.037us   
3  loss_computation       0.000us    0.000us       49.623ms  158.540us   
4     backward_pass        4.141s   13.330ms        0.000us    1.746us   
5     backward_pass       0.000us    0.000us      546.624us    1.746us   
6    optimizer_step     404.464ms    3.497ms        0.000us  410.664us   
7    optimizer_step       0.000us    0.000us      235.144ms  751.258us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216906802176>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216844656640>  
5                    

Validation: 100%|██████████| 94/94 [00:04<00:00, 20.12it/s]


Epoch: 01 | Epoch Train Time: 1m 3s
	Train Loss: 2.034 | Train Acc: 25.75%
	 Val. Loss: 1.821 |  Val. Acc: 34.43%


Training: 100%|██████████| 313/313 [00:27<00:00, 11.32it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     709.980ms    8.446ms        0.000us    4.903ms   
1      forward_pass       0.000us    0.000us         2.553s    8.156ms   
2  loss_computation      30.110ms  208.811us        0.000us    6.033us   
3  loss_computation       0.000us    0.000us       18.584ms   59.374us   
4     backward_pass        3.434s   11.021ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.196us    1.748us   
6    optimizer_step     383.679ms    3.218ms        0.000us  410.392us   
7    optimizer_step       0.000us    0.000us      188.392ms  601.892us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216907260928>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901444608>  
5                    

Validation: 100%|██████████| 94/94 [00:05<00:00, 17.71it/s]


Epoch: 02 | Epoch Train Time: 0m 58s
	Train Loss: 1.792 | Train Acc: 34.69%
	 Val. Loss: 1.669 |  Val. Acc: 37.83%


Training: 100%|██████████| 313/313 [00:26<00:00, 11.71it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     699.834ms    8.333ms        0.000us    4.905ms   
1      forward_pass       0.000us    0.000us         2.524s    8.065ms   
2  loss_computation      28.353ms  202.612us        0.000us    6.030us   
3  loss_computation       0.000us    0.000us       18.505ms   59.120us   
4     backward_pass        3.386s   10.866ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      546.973us    1.748us   
6    optimizer_step     380.321ms    3.202ms        0.000us  410.512us   
7    optimizer_step       0.000us    0.000us      188.115ms  601.008us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216913683456>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901444608>  
5                    

Validation: 100%|██████████| 94/94 [00:05<00:00, 15.76it/s]


Epoch: 03 | Epoch Train Time: 0m 57s
	Train Loss: 1.688 | Train Acc: 38.33%
	 Val. Loss: 1.576 |  Val. Acc: 42.43%


Training: 100%|██████████| 313/313 [00:30<00:00, 10.41it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     724.603ms    8.661ms        0.000us    4.905ms   
1      forward_pass       0.000us    0.000us         2.625s    8.387ms   
2  loss_computation      28.203ms  204.088us        0.000us    6.036us   
3  loss_computation       0.000us    0.000us       18.926ms   60.466us   
4     backward_pass        3.378s   10.841ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.082us    1.748us   
6    optimizer_step     406.157ms    3.341ms        0.000us  410.527us   
7    optimizer_step       0.000us    0.000us      191.153ms  610.712us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216913683456>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:05<00:00, 15.90it/s]


Epoch: 04 | Epoch Train Time: 1m 1s
	Train Loss: 1.626 | Train Acc: 40.77%
	 Val. Loss: 1.550 |  Val. Acc: 43.03%


Training: 100%|██████████| 313/313 [00:28<00:00, 11.00it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     710.250ms    8.440ms        0.000us    4.906ms   
1      forward_pass       0.000us    0.000us         2.558s    8.174ms   
2  loss_computation      28.711ms  204.478us        0.000us    6.035us   
3  loss_computation       0.000us    0.000us       18.645ms   59.568us   
4     backward_pass        3.397s   10.900ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.105us    1.748us   
6    optimizer_step     388.009ms    3.254ms        0.000us  410.496us   
7    optimizer_step       0.000us    0.000us      188.667ms  602.770us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.43it/s]


Epoch: 05 | Epoch Train Time: 0m 59s
	Train Loss: 1.559 | Train Acc: 43.25%
	 Val. Loss: 1.485 |  Val. Acc: 46.77%


Training: 100%|██████████| 313/313 [00:31<00:00,  9.86it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     718.199ms    8.623ms        0.000us    4.906ms   
1      forward_pass       0.000us    0.000us         2.614s    8.352ms   
2  loss_computation      29.401ms  208.578us        0.000us    6.032us   
3  loss_computation       0.000us    0.000us       18.996ms   60.689us   
4     backward_pass        3.417s   10.964ms        0.000us    1.749us   
5     backward_pass       0.000us    0.000us      547.550us    1.749us   
6    optimizer_step     402.628ms    3.312ms        0.000us  410.461us   
7    optimizer_step       0.000us    0.000us      192.451ms  614.859us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.73it/s]


Epoch: 06 | Epoch Train Time: 1m 3s
	Train Loss: 1.512 | Train Acc: 45.33%
	 Val. Loss: 1.436 |  Val. Acc: 47.63%


Training: 100%|██████████| 313/313 [00:30<00:00, 10.18it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     716.480ms    8.546ms        0.000us    4.905ms   
1      forward_pass       0.000us    0.000us         2.590s    8.274ms   
2  loss_computation      28.388ms  203.161us        0.000us    6.035us   
3  loss_computation       0.000us    0.000us       18.540ms   59.234us   
4     backward_pass        3.437s   11.029ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.233us    1.748us   
6    optimizer_step     393.096ms    3.259ms        0.000us  410.468us   
7    optimizer_step       0.000us    0.000us      192.916ms  616.347us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.29it/s]


Epoch: 07 | Epoch Train Time: 1m 2s
	Train Loss: 1.474 | Train Acc: 46.80%
	 Val. Loss: 1.439 |  Val. Acc: 47.63%


Training: 100%|██████████| 313/313 [00:31<00:00,  9.86it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     716.130ms    8.600ms        0.000us    4.905ms   
1      forward_pass       0.000us    0.000us         2.605s    8.323ms   
2  loss_computation      28.928ms  206.737us        0.000us    6.030us   
3  loss_computation       0.000us    0.000us       18.788ms   60.026us   
4     backward_pass        3.411s   10.947ms        0.000us    1.750us   
5     backward_pass       0.000us    0.000us      547.603us    1.750us   
6    optimizer_step     399.751ms    3.278ms        0.000us  410.483us   
7    optimizer_step       0.000us    0.000us      191.283ms  611.127us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 14.69it/s]


Epoch: 08 | Epoch Train Time: 1m 3s
	Train Loss: 1.430 | Train Acc: 48.23%
	 Val. Loss: 1.331 |  Val. Acc: 52.13%


Training: 100%|██████████| 313/313 [00:30<00:00, 10.28it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     712.177ms    8.533ms        0.000us    4.904ms   
1      forward_pass       0.000us    0.000us         2.578s    8.235ms   
2  loss_computation      29.565ms  209.331us        0.000us    6.036us   
3  loss_computation       0.000us    0.000us       18.937ms   60.502us   
4     backward_pass        3.445s   11.056ms        0.000us    1.746us   
5     backward_pass       0.000us    0.000us      546.495us    1.746us   
6    optimizer_step     398.206ms    3.274ms        0.000us  410.443us   
7    optimizer_step       0.000us    0.000us      190.823ms  609.657us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 13.94it/s]


Epoch: 09 | Epoch Train Time: 1m 1s
	Train Loss: 1.397 | Train Acc: 49.60%
	 Val. Loss: 1.387 |  Val. Acc: 49.33%


Training: 100%|██████████| 313/313 [00:32<00:00,  9.53it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     724.639ms    8.737ms        0.000us    4.906ms   
1      forward_pass       0.000us    0.000us         2.645s    8.452ms   
2  loss_computation      29.882ms  211.188us        0.000us    6.033us   
3  loss_computation       0.000us    0.000us       19.047ms   60.854us   
4     backward_pass        3.428s   10.999ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.174us    1.748us   
6    optimizer_step     413.056ms    3.354ms        0.000us  410.452us   
7    optimizer_step       0.000us    0.000us      194.777ms  622.291us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.29it/s]


Epoch: 10 | Epoch Train Time: 1m 6s
	Train Loss: 1.365 | Train Acc: 50.21%
	 Val. Loss: 1.332 |  Val. Acc: 51.33%


Training: 100%|██████████| 313/313 [00:30<00:00, 10.13it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     705.170ms    8.522ms        0.000us    4.906ms   
1      forward_pass       0.000us    0.000us         2.581s    8.245ms   
2  loss_computation      28.918ms  204.796us        0.000us    6.036us   
3  loss_computation       0.000us    0.000us       18.529ms   59.197us   
4     backward_pass        3.413s   10.953ms        0.000us    1.748us   
5     backward_pass       0.000us    0.000us      547.100us    1.748us   
6    optimizer_step     397.199ms    3.274ms        0.000us  410.490us   
7    optimizer_step       0.000us    0.000us      193.002ms  616.620us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216900838400>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -216901445120>  
5                    

Validation: 100%|██████████| 94/94 [00:07<00:00, 13.22it/s]

Epoch: 11 | Epoch Train Time: 1m 2s
	Train Loss: 1.336 | Train Acc: 52.30%
	 Val. Loss: 1.272 |  Val. Acc: 53.30%
Train finished





In [31]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 38544.81 MB
Total memory: 40513.81 MB


In [32]:
print("ResNet18")
print(f'Total Training Time: {int(total_time/60)}m {int(total_time%60)}s')

ResNet18
Total Training Time: 11m 20s


In [33]:
torch.save(model.state_dict(), 'trained_model.pth')

In [34]:
import os
model_file_size = os.path.getsize('trained_model.pth')  # 바이트 단위
model_file_size_MB = model_file_size / (1024 ** 2)  # MB로 변환
print(f"Saved model file size: {model_file_size_MB:.2f} MB")

Saved model file size: 42.73 MB


In [35]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} - Size: {param.size()} - Number of elements: {param.numel()}")

conv1.weight - Size: torch.Size([64, 3, 7, 7]) - Number of elements: 9408
bn1.weight - Size: torch.Size([64]) - Number of elements: 64
bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn2.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn2.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.1.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn2.weight - Size: torch.Size([64]) - Numbe

In [36]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 11181642


In [37]:
from torchsummary import summary

summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,