In [1]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:13<00:00, 12.3MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
model = ResNet(resnet18_config, 10)

In [19]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [22]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [23]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

In [24]:
def train(model, train_loader, criterion, optimizer, device, k):
    start_time = time.monotonic()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,  # 메모리 사용량 추적
        record_shapes=True  # 텐서 크기 기록
    ) as prof:
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # 기울기 초기화

            with record_function("forward_pass"):  # Forward pass 프로파일링
                outputs = model(inputs)  # 모델 연산

            with record_function("loss_computation"):  # 손실 계산 프로파일링
                loss = criterion(outputs[0], labels)  # 손실 계산

            with record_function("backward_pass"):  # Backward pass 프로파일링
                loss.backward()  # 역전파

            for i, param in enumerate(model.parameters()):
                if i == len(list(model.parameters())) - 1:
                    break
                if param.grad is not None:
                # 기울기의 절댓값을 기준으로 상위 k개 기울기 추적
                    grad_values = param.grad.abs().view(-1)
                    #print(len(grad_values))
                    topk_values, _ = grad_values.topk(k, largest=True)

                    threshold = topk_values[-1]

                    # 임계값 이상이면 해당 기울기로 업데이트, 아니면 랜덤 값으로 대체
                    mask = param.grad.abs() >= threshold

                    updated_grad = torch.zeros_like(param.grad)
                    updated_grad[mask] = param.grad[mask]

                    del param.grad
                    torch.cuda.empty_cache()
                    param.grad = updated_grad.clone().detach()

            with record_function("optimizer_step"):
                optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    end_time = time.monotonic()
    selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

    # key_averages()로부터 얻은 평균값을 필터링
    filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
    extracted_data = []

    for avg in filtered_averages:
      avg_str = str(avg)
      match = pattern.search(avg_str)
      if match:
        extracted_data.append(match.groupdict())
    df = pd.DataFrame(extracted_data)
    print(df)
    free_memory, total_memory = torch.cuda.mem_get_info()
    print(f"Free memory: {free_memory / 1024**2:.2f} MB")
    print(f"Total memory: {total_memory / 1024**2:.2f} MB")
    # 훈련 후 평균 손실과 정확도 계산
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total

    return epoch_loss, accuracy, start_time, end_time

In [25]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [26]:
torch.cuda.empty_cache()

In [27]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [28]:
EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0
for epoch in range(EPOCHS):

    train_loss, train_acc, start_time, end_time = train(model, train_iterator, criterion, optimizer, device, 7)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    #if valid_loss < best_valid_loss:
        #best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'vgg19-model.pt')

    # end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

Training: 100%|██████████| 313/313 [00:50<00:00,  6.18it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     784.014ms   15.026ms        0.000us    4.085ms   
1      forward_pass       0.000us    0.000us         3.952s   12.587ms   
2  loss_computation      30.234ms  465.949us        0.000us    4.710us   
3  loss_computation       0.000us    0.000us       43.413ms  138.700us   
4     backward_pass        4.541s   14.602ms        0.000us    1.364us   
5     backward_pass       0.000us    0.000us      426.903us    1.364us   
6    optimizer_step      17.969ms    3.328ms        0.000us  696.919us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216404665344>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201743044096>  
5                             0                0>  
6                           248         919

Validation: 100%|██████████| 94/94 [00:04<00:00, 19.83it/s]


Epoch: 01 | Epoch Train Time: 1m 56s
	Train Loss: 2.153 | Train Acc: 19.15%
	 Val. Loss: 1.960 |  Val. Acc: 25.03%


Training: 100%|██████████| 313/313 [00:48<00:00,  6.46it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     784.386ms   12.190ms        0.000us    4.079ms   
1      forward_pass       0.000us    0.000us         3.553s   11.352ms   
2  loss_computation      27.962ms  201.950us        0.000us    4.941us   
3  loss_computation       0.000us    0.000us       18.492ms   59.079us   
4     backward_pass        4.039s   12.955ms        0.000us    1.379us   
5     backward_pass       0.000us    0.000us      431.517us    1.379us   
6    optimizer_step      18.231ms    2.982ms        0.000us  694.314us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396866560>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201752809472>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.13it/s]


Epoch: 02 | Epoch Train Time: 1m 53s
	Train Loss: 1.907 | Train Acc: 27.38%
	 Val. Loss: 1.871 |  Val. Acc: 29.03%


Training: 100%|██████████| 313/313 [00:48<00:00,  6.44it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     766.241ms   11.802ms        0.000us    4.080ms   
1      forward_pass       0.000us    0.000us         3.451s   11.025ms   
2  loss_computation      26.295ms  193.684us        0.000us    4.736us   
3  loss_computation       0.000us    0.000us       18.185ms   58.099us   
4     backward_pass        3.917s   12.566ms        0.000us    1.363us   
5     backward_pass       0.000us    0.000us      426.684us    1.363us   
6    optimizer_step      17.964ms    2.974ms        0.000us  701.461us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396735488>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201751629824>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 15.18it/s]


Epoch: 03 | Epoch Train Time: 1m 52s
	Train Loss: 1.849 | Train Acc: 29.01%
	 Val. Loss: 1.806 |  Val. Acc: 30.63%


Training: 100%|██████████| 313/313 [00:49<00:00,  6.34it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     766.155ms   11.803ms        0.000us    4.089ms   
1      forward_pass       0.000us    0.000us         3.451s   11.027ms   
2  loss_computation      26.343ms  192.750us        0.000us    4.902us   
3  loss_computation       0.000us    0.000us       18.029ms   57.602us   
4     backward_pass        3.964s   12.712ms        0.000us    1.363us   
5     backward_pass       0.000us    0.000us      426.631us    1.363us   
6    optimizer_step      18.164ms    2.969ms        0.000us  700.295us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396866560>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201752809472>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:07<00:00, 12.91it/s]


Epoch: 04 | Epoch Train Time: 1m 53s
	Train Loss: 1.824 | Train Acc: 29.94%
	 Val. Loss: 1.760 |  Val. Acc: 32.80%


Training: 100%|██████████| 313/313 [00:56<00:00,  5.52it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     785.300ms   12.118ms        0.000us    4.531ms   
1      forward_pass       0.000us    0.000us         3.554s   11.355ms   
2  loss_computation      26.500ms  195.251us        0.000us    5.508us   
3  loss_computation       0.000us    0.000us       18.459ms   58.973us   
4     backward_pass        3.988s   12.794ms        0.000us    1.577us   
5     backward_pass       0.000us    0.000us      493.559us    1.577us   
6    optimizer_step      17.460ms    3.076ms        0.000us  729.473us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396735488>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201751629824>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:06<00:00, 13.90it/s]


Epoch: 05 | Epoch Train Time: 2m 0s
	Train Loss: 1.790 | Train Acc: 31.67%
	 Val. Loss: 1.714 |  Val. Acc: 34.70%


Training: 100%|██████████| 313/313 [00:50<00:00,  6.20it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     763.982ms   11.879ms        0.000us    4.092ms   
1      forward_pass       0.000us    0.000us         3.481s   11.121ms   
2  loss_computation      26.023ms  190.956us        0.000us    4.687us   
3  loss_computation       0.000us    0.000us       17.697ms   56.541us   
4     backward_pass        3.918s   12.567ms        0.000us    1.367us   
5     backward_pass       0.000us    0.000us      427.813us    1.367us   
6    optimizer_step      17.004ms    2.948ms        0.000us  708.914us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396866560>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201752809472>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:08<00:00, 11.47it/s]


Epoch: 06 | Epoch Train Time: 1m 53s
	Train Loss: 1.765 | Train Acc: 32.47%
	 Val. Loss: 1.718 |  Val. Acc: 33.20%


Training: 100%|██████████| 313/313 [00:55<00:00,  5.64it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     769.737ms   11.847ms        0.000us    4.374ms   
1      forward_pass       0.000us    0.000us         3.468s   11.081ms   
2  loss_computation      25.967ms  191.575us        0.000us    5.202us   
3  loss_computation       0.000us    0.000us       17.997ms   57.497us   
4     backward_pass        3.945s   12.654ms        0.000us    1.503us   
5     backward_pass       0.000us    0.000us      470.333us    1.503us   
6    optimizer_step      17.086ms    2.964ms        0.000us  729.734us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396735488>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201751629824>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:07<00:00, 12.14it/s]


Epoch: 07 | Epoch Train Time: 1m 58s
	Train Loss: 1.755 | Train Acc: 33.16%
	 Val. Loss: 1.729 |  Val. Acc: 35.27%


Training: 100%|██████████| 313/313 [00:59<00:00,  5.30it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     788.343ms   12.295ms        0.000us    4.828ms   
1      forward_pass       0.000us    0.000us         3.603s   11.513ms   
2  loss_computation      27.157ms  199.526us        0.000us    5.846us   
3  loss_computation       0.000us    0.000us       18.768ms   59.961us   
4     backward_pass        4.009s   12.857ms        0.000us    1.716us   
5     backward_pass       0.000us    0.000us      537.153us    1.716us   
6    optimizer_step      17.921ms    3.148ms        0.000us  749.153us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396866560>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201752809472>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:08<00:00, 11.45it/s]


Epoch: 08 | Epoch Train Time: 2m 3s
	Train Loss: 1.741 | Train Acc: 33.52%
	 Val. Loss: 1.672 |  Val. Acc: 36.20%


Training: 100%|██████████| 313/313 [00:56<00:00,  5.55it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     771.731ms   11.866ms        0.000us    4.478ms   
1      forward_pass       0.000us    0.000us         3.475s   11.101ms   
2  loss_computation      26.176ms  192.501us        0.000us    5.366us   
3  loss_computation       0.000us    0.000us       18.039ms   57.632us   
4     backward_pass        3.969s   12.730ms        0.000us    1.551us   
5     backward_pass       0.000us    0.000us      485.530us    1.551us   
6    optimizer_step      17.205ms    2.970ms        0.000us  738.769us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396735488>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201751629824>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:07<00:00, 11.97it/s]


Epoch: 09 | Epoch Train Time: 1m 59s
	Train Loss: 1.718 | Train Acc: 34.53%
	 Val. Loss: 1.624 |  Val. Acc: 38.13%


Training: 100%|██████████| 313/313 [00:55<00:00,  5.61it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     766.669ms   11.876ms        0.000us    4.409ms   
1      forward_pass       0.000us    0.000us         3.479s   11.115ms   
2  loss_computation      26.067ms  192.040us        0.000us    5.186us   
3  loss_computation       0.000us    0.000us       18.014ms   57.552us   
4     backward_pass        3.955s   12.682ms        0.000us    1.517us   
5     backward_pass       0.000us    0.000us      474.765us    1.517us   
6    optimizer_step      16.908ms    3.039ms        0.000us  729.755us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                             0     216396866560>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -201752809472>  
5                             0                0>  
6                             0            

Validation: 100%|██████████| 94/94 [00:08<00:00, 11.25it/s]

Epoch: 10 | Epoch Train Time: 1m 58s
	Train Loss: 1.713 | Train Acc: 34.63%
	 Val. Loss: 1.639 |  Val. Acc: 37.57%
Train finished





In [29]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 39464.81 MB
Total memory: 40513.81 MB


In [30]:
print("ResNet18 with dropback")
print(f'Total Training Time: {int(total_time/60)}m {int(total_time%60)}s')

ResNet18 with dropback
Total Training Time: 19m 28s


In [31]:
torch.save(model.state_dict(), 'trained_model.pth')

In [32]:
import os
model_file_size = os.path.getsize('trained_model.pth')  # 바이트 단위
model_file_size_MB = model_file_size / (1024 ** 2)  # MB로 변환
print(f"Saved model file size: {model_file_size_MB:.2f} MB")

Saved model file size: 42.73 MB


In [33]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} - Size: {param.size()} - Number of elements: {param.numel()}")

conv1.weight - Size: torch.Size([64, 3, 7, 7]) - Number of elements: 9408
bn1.weight - Size: torch.Size([64]) - Number of elements: 64
bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.0.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.0.bn2.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.0.bn2.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv1.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn1.weight - Size: torch.Size([64]) - Number of elements: 64
layer1.1.bn1.bias - Size: torch.Size([64]) - Number of elements: 64
layer1.1.conv2.weight - Size: torch.Size([64, 64, 3, 3]) - Number of elements: 36864
layer1.1.bn2.weight - Size: torch.Size([64]) - Numbe

In [34]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 11181642


In [35]:
from torchsummary import summary

summary(model, input_size=(3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

In [36]:
from torch import profiler

dummy_input = torch.randn(32, 3, 224, 224).cuda()

# Profiling inference
with profiler.profile(
    activities=[
       profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,  # Include if using GPU
    ],
    on_trace_ready=profiler.tensorboard_trace_handler("./logs"),  # Optional logging
    record_shapes=True,
    with_stack=True
) as prof:
    with torch.no_grad():
        model(dummy_input)


# Print results
print(prof.key_averages().table(sort_by="cuda_time_total" if torch.cuda.is_available() else "cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         1.57%     111.908us        60.56%       4.319ms     215.953us       0.000us         0.00%       2.693ms     134.658us            20  
                                      aten::convolution         2.46%     175.189us        58.99%       4.207ms     210.358us       0.000us         0.00%       2.693ms     134.658us            20  
         