In [1]:
 !pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.3-py3-none-any.whl.metadata (8.8 kB)
Downloading pynvml-11.5.3-py3-none-any.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.3


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

import matplotlib.pyplot as plt
import numpy as np

import copy
from collections import namedtuple
import time
import os
import random
import re

import cv2
from torch.utils.data import Dataset, DataLoader, Subset
from PIL import Image

from tqdm import tqdm
from pynvml import *
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def print_gpu_utilization():
    if torch.cuda.is_available():
        device = torch.cuda.current_device()  # 현재 GPU 디바이스 정보
        allocated_memory = torch.cuda.memory_allocated(device) / 1024**3  # 메모리 사용량 (GB)
        reserved_memory = torch.cuda.memory_reserved(device) / 1024**3  # 예약된 메모리 (GB)
        print(f"Allocated Memory: {allocated_memory:.2f} GB")
        print(f"Reserved Memory: {reserved_memory:.2f} GB")
    else:
        print("No GPU available.")

In [4]:
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
batch_size = 32

In [6]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size, scale=(0.5, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [7]:
# CIFAR-10
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:13<00:00, 12.9MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [8]:
VALID_RATIO = 0.7
n_train_examples = int(len(trainset) * VALID_RATIO)
n_valid_examples = len(trainset) - n_train_examples

train_data, valid_data = data.random_split(trainset, [n_train_examples, n_valid_examples])

In [9]:
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms

In [10]:
len(train_data), len(valid_data), len(testset)

(35000, 15000, 10000)

In [11]:
sample_fraction = 0.2

# 무작위 인덱스 생성
train_indices = torch.randperm(len(trainset))[:int(len(trainset) * sample_fraction)]
valid_indices = torch.randperm(len(valid_data))[:int(len(valid_data) * sample_fraction)]
test_indices = torch.randperm(len(testset))[:int(len(testset) * sample_fraction)]

# 서브셋 생성
train_subset = Subset(trainset, train_indices)
valid_subset = Subset(valid_data, valid_indices)
test_subset = Subset(testset, test_indices)

In [12]:
len(train_subset), len(valid_subset), len(test_subset)

(10000, 3000, 2000)

In [13]:
train_iterator = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
valid_iterator = DataLoader(valid_subset, batch_size=batch_size, shuffle=False)
test_iterator = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

In [14]:
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample = False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        if downsample:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            bn = nn.BatchNorm2d(out_channels)
            downsample = nn.Sequential(conv, bn)
        else:
            downsample = None
        self.downsample = downsample

    def forward(self, x):
        i = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.downsample is not None:
            i = self.downsample(i)

        x += i
        x = self.relu(x)

        return x

In [15]:
class ResNet(nn.Module):
    def __init__(self, config, output_dim, zero_init_residual = False):
        super().__init__()

        block, n_blocks, channels = config
        self.in_channels = channels[0]
        assert len(n_blocks) == len(channels) == 4

        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self.get_resnet_layer(block, n_blocks[0], channels[0])
        self.layer2 = self.get_resnet_layer(block, n_blocks[1], channels[1], stride=2)
        self.layer3 = self.get_resnet_layer(block, n_blocks[2], channels[2], stride=2)
        self.layer4 = self.get_resnet_layer(block, n_blocks[3], channels[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)
                #elif isinstance(m, Bottleneck):
                    #nn.init.constant_(m.bn3.weight, 0)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        layers = []
        if self.in_channels != block.expansion * channels:
            downsample = True
        else:
            downsample = False
        layers.append(block(self.in_channels, channels, stride, downsample))
        for i in range(1, n_blocks):
            layers.append(block(block.expansion * channels, channels))

        self.in_channels = block.expansion * channels
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        #x = self.layer1(x)
        x = checkpoint.checkpoint(self.layer1, x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        h = x.view(x.shape[0], -1)
        x = self.fc(h)
        return x, h

In [16]:
ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])

In [17]:
resnet18_config = ResNetConfig(block = BasicBlock, n_blocks = [2, 2, 2, 2], channels = [64, 128, 256, 512])

In [18]:
model = ResNet(resnet18_config, 10)

In [19]:
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kerne

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

pretrained_model = model.to(device)
criterion = criterion.to(device)

In [22]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [23]:
pattern = re.compile(r'key=(?P<key>\S+)\s+'
                     r'self_cpu_time=(?P<self_cpu_time>\S+)\s+'
                     r'cpu_time=(?P<cpu_time>\S+)\s+'
                     r'self_cuda_time=(?P<self_cuda_time>\S+)\s+'
                     r'cuda_time=(?P<cuda_time>\S+)\s+'
                     r'input_shapes=(?P<input_shapes>\S*)\s*'
                     r'cpu_memory_usage=(?P<cpu_memory_usage>\S*)\s*'
                     r'cuda_memory_usage=(?P<cuda_memory_usage>\S*)')

In [24]:
import torch.utils.checkpoint as checkpoint

def gradient_checkpoint(model, *inputs):

  return model(*inputs)


In [25]:
def train(model, train_loader, criterion, optimizer, device):
    start_time = time.monotonic()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,  # 메모리 사용량 추적
        record_shapes=True  # 텐서 크기 기록
    ) as prof:
        for inputs, labels in tqdm(train_loader, desc="Training"):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # 기울기 초기화
            memory_usage = torch.cuda.memory_allocated(device) / (1024 ** 2)
            #print(f"Memory usage: {memory_usage:.2f} MB")
            with record_function("forward_pass"):
              outputs = model(inputs)
            del inputs
            torch.cuda.empty_cache()

            with record_function("loss_computation"):
                loss = criterion(outputs[0], labels)
            with record_function("backward_pass"):
                loss.backward()
            with record_function("optimizer_step"):
                optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    end_time = time.monotonic()
    selected_keys = ["forward_pass", "loss_computation", "backward_pass", "optimizer_step"]

    # key_averages()로부터 얻은 평균값을 필터링
    filtered_averages = [avg for avg in prof.key_averages() if avg.key in selected_keys]
    extracted_data = []

    for avg in filtered_averages:
      avg_str = str(avg)
      match = pattern.search(avg_str)
      if match:
        extracted_data.append(match.groupdict())
    df = pd.DataFrame(extracted_data)
    print(df)
    free_memory, total_memory = torch.cuda.mem_get_info()
    print(f"Free memory: {free_memory / 1024**2:.2f} MB")
    print(f"Total memory: {total_memory / 1024**2:.2f} MB")
    # 훈련 후 평균 손실과 정확도 계산
    epoch_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total

    return epoch_loss, accuracy, start_time, end_time

In [26]:
def evaluate(model, data_loader, criterion, device, phase="Validation"):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc=f"{phase}"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs[0], labels)

            running_loss += loss.item()
            _, predicted = torch.max(outputs[0], 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(data_loader)
    accuracy = 100 * correct / total
    # print(f"{phase} Loss: {epoch_loss:.4f}, {phase} Accuracy: {accuracy:.2f}%")

    return epoch_loss, accuracy

In [27]:
torch.cuda.empty_cache()

In [28]:
memory_usage = torch.cuda.memory_allocated(device) / (1024 ** 2)
print(memory_usage)

42.70654296875


In [29]:
free_memory, total_memory = torch.cuda.mem_get_info()
print(f"Free memory: {free_memory / 1024**2:.2f} MB")
print(f"Total memory: {total_memory / 1024**2:.2f} MB")

Free memory: 40026.81 MB
Total memory: 40513.81 MB


In [30]:
EPOCHS = 10
best_valid_loss = float('inf')
total_time = 0
for epoch in range(EPOCHS):

    train_loss, train_acc, start_time, end_time = train(model, train_iterator, criterion, optimizer, device)

    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    #if valid_loss < best_valid_loss:
        #best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'vgg19-model.pt')

    # end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    total_time += end_time - start_time

    print(f'Epoch: {epoch+1:02} | Epoch Train Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

print("Train finished")

  return fn(*args, **kwargs)
Training: 100%|██████████| 313/313 [00:31<00:00, 10.06it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     567.399ms   11.854ms        0.000us    4.682ms   
1      forward_pass       0.000us    0.000us         3.040s    9.682ms   
2  loss_computation      35.415ms  524.609us        0.000us    6.429us   
3  loss_computation       0.000us    0.000us       51.121ms  163.327us   
4     backward_pass        5.541s   17.812ms        0.000us    1.628us   
5     backward_pass       0.000us    0.000us      509.557us    1.628us   
6    optimizer_step      14.479ms    3.438ms        0.000us  699.914us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     162229720064>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -153761382400>  
5                             0                0>  
6                           248         912

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:04<00:00, 19.74it/s]


Epoch: 01 | Epoch Train Time: 1m 4s
	Train Loss: 1.999 | Train Acc: 25.18%
	 Val. Loss: 1.836 |  Val. Acc: 31.97%


Training: 100%|██████████| 313/313 [00:29<00:00, 10.75it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     580.760ms    8.518ms        0.000us    4.685ms   
1      forward_pass       0.000us    0.000us         2.579s    8.240ms   
2  loss_computation      35.282ms  243.577us        0.000us    7.065us   
3  loss_computation       0.000us    0.000us       19.101ms   61.026us   
4     backward_pass        5.085s   16.298ms        0.000us    1.641us   
5     backward_pass       0.000us    0.000us      513.507us    1.641us   
6    optimizer_step      14.621ms    2.984ms        0.000us  699.229us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     161009734656>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -152701632512>  
5                             0                0>  
6                             0         472

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:05<00:00, 17.50it/s]


Epoch: 02 | Epoch Train Time: 1m 1s
	Train Loss: 1.787 | Train Acc: 33.18%
	 Val. Loss: 1.592 |  Val. Acc: 40.77%


Training: 100%|██████████| 313/313 [00:27<00:00, 11.40it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     590.568ms    8.284ms        0.000us    4.421ms   
1      forward_pass       0.000us    0.000us         2.525s    8.066ms   
2  loss_computation      31.284ms  226.212us        0.000us    6.627us   
3  loss_computation       0.000us    0.000us       18.249ms   58.302us   
4     backward_pass        4.812s   15.426ms        0.000us    1.513us   
5     backward_pass       0.000us    0.000us      473.631us    1.513us   
6    optimizer_step      14.791ms    3.004ms        0.000us  692.878us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:05<00:00, 17.55it/s]


Epoch: 03 | Epoch Train Time: 0m 59s
	Train Loss: 1.656 | Train Acc: 38.93%
	 Val. Loss: 1.504 |  Val. Acc: 44.80%


Training: 100%|██████████| 313/313 [00:27<00:00, 11.57it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     592.210ms    8.277ms        0.000us    4.332ms   
1      forward_pass       0.000us    0.000us         2.522s    8.059ms   
2  loss_computation      31.492ms  227.290us        0.000us    6.482us   
3  loss_computation       0.000us    0.000us       18.083ms   57.774us   
4     backward_pass        4.844s   15.527ms        0.000us    1.473us   
5     backward_pass       0.000us    0.000us      461.179us    1.473us   
6    optimizer_step      14.154ms    2.876ms        0.000us  690.810us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 15.05it/s]


Epoch: 04 | Epoch Train Time: 0m 59s
	Train Loss: 1.549 | Train Acc: 43.74%
	 Val. Loss: 1.466 |  Val. Acc: 45.97%


Training: 100%|██████████| 313/313 [00:32<00:00,  9.60it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     572.382ms    8.438ms        0.000us    4.888ms   
1      forward_pass       0.000us    0.000us         2.568s    8.204ms   
2  loss_computation      32.389ms  235.107us        0.000us    6.528us   
3  loss_computation       0.000us    0.000us       19.534ms   62.409us   
4     backward_pass        4.866s   15.601ms        0.000us    1.714us   
5     backward_pass       0.000us    0.000us      536.509us    1.714us   
6    optimizer_step      14.976ms    3.145ms        0.000us  700.231us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 14.32it/s]


Epoch: 05 | Epoch Train Time: 1m 4s
	Train Loss: 1.413 | Train Acc: 48.61%
	 Val. Loss: 1.237 |  Val. Acc: 55.30%


Training: 100%|██████████| 313/313 [00:33<00:00,  9.27it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     576.176ms    8.571ms        0.000us    4.885ms   
1      forward_pass       0.000us    0.000us         2.609s    8.334ms   
2  loss_computation      32.684ms  235.449us        0.000us    6.523us   
3  loss_computation       0.000us    0.000us       19.559ms   62.489us   
4     backward_pass        4.862s   15.586ms        0.000us    1.714us   
5     backward_pass       0.000us    0.000us      536.456us    1.714us   
6    optimizer_step      14.377ms    3.136ms        0.000us  699.896us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 14.72it/s]


Epoch: 06 | Epoch Train Time: 1m 6s
	Train Loss: 1.294 | Train Acc: 53.73%
	 Val. Loss: 1.224 |  Val. Acc: 55.73%


Training: 100%|██████████| 313/313 [00:30<00:00, 10.18it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     561.776ms    8.269ms        0.000us    4.863ms   
1      forward_pass       0.000us    0.000us         2.516s    8.039ms   
2  loss_computation      32.006ms  232.914us        0.000us    6.490us   
3  loss_computation       0.000us    0.000us       19.602ms   62.628us   
4     backward_pass        4.833s   15.495ms        0.000us    1.704us   
5     backward_pass       0.000us    0.000us      533.446us    1.704us   
6    optimizer_step      14.450ms    2.910ms        0.000us  699.149us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 14.97it/s]


Epoch: 07 | Epoch Train Time: 1m 2s
	Train Loss: 1.200 | Train Acc: 56.95%
	 Val. Loss: 1.084 |  Val. Acc: 61.27%


Training: 100%|██████████| 313/313 [00:34<00:00,  9.19it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     574.155ms    8.471ms        0.000us    4.886ms   
1      forward_pass       0.000us    0.000us         2.578s    8.235ms   
2  loss_computation      32.747ms  237.862us        0.000us    6.523us   
3  loss_computation       0.000us    0.000us       19.924ms   63.657us   
4     backward_pass        4.901s   15.708ms        0.000us    1.715us   
5     backward_pass       0.000us    0.000us      536.725us    1.715us   
6    optimizer_step      14.465ms    3.213ms        0.000us  699.742us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:05<00:00, 15.95it/s]


Epoch: 08 | Epoch Train Time: 1m 7s
	Train Loss: 1.142 | Train Acc: 59.00%
	 Val. Loss: 0.959 |  Val. Acc: 66.57%


Training: 100%|██████████| 313/313 [00:29<00:00, 10.58it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     563.600ms    8.257ms        0.000us    4.809ms   
1      forward_pass       0.000us    0.000us         2.513s    8.027ms   
2  loss_computation      32.420ms  232.257us        0.000us    6.396us   
3  loss_computation       0.000us    0.000us       19.367ms   61.874us   
4     backward_pass        4.856s   15.570ms        0.000us    1.681us   
5     backward_pass       0.000us    0.000us      526.259us    1.681us   
6    optimizer_step      14.734ms    3.042ms        0.000us  698.714us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:07<00:00, 12.76it/s]


Epoch: 09 | Epoch Train Time: 1m 1s
	Train Loss: 1.062 | Train Acc: 62.32%
	 Val. Loss: 0.911 |  Val. Acc: 68.60%


Training: 100%|██████████| 313/313 [00:36<00:00,  8.68it/s]


                key self_cpu_time   cpu_time self_cuda_time  cuda_time  \
0      forward_pass     583.334ms    8.535ms        0.000us    4.885ms   
1      forward_pass       0.000us    0.000us         2.595s    8.292ms   
2  loss_computation      32.711ms  237.030us        0.000us    6.522us   
3  loss_computation       0.000us    0.000us       19.827ms   63.346us   
4     backward_pass        4.911s   15.744ms        0.000us    1.713us   
5     backward_pass       0.000us    0.000us      536.148us    1.713us   
6    optimizer_step      14.646ms    3.202ms        0.000us  700.161us   

  input_shapes cpu_memory_usage cuda_memory_usage  
0                       1587536     160334255104>  
1                             0                0>  
2                             0           641024>  
3                             0                0>  
4                             0    -151925161984>  
5                             0                0>  
6                             0            

  return fn(*args, **kwargs)
Validation: 100%|██████████| 94/94 [00:06<00:00, 14.33it/s]

Epoch: 10 | Epoch Train Time: 1m 8s
	Train Loss: 1.013 | Train Acc: 63.97%
	 Val. Loss: 0.943 |  Val. Acc: 68.27%
Train finished



