<a href="https://colab.research.google.com/github/jeongin7103/CalCheck/blob/main/train_with_checkpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM so files under /content/drive are reachable.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Directory added to the import search path so the project's `model`, `utils`
# and `datasets` modules can be imported by the following cells.
project_dir = '/content/drive/MyDrive/SSD_detection'

# Notebook cells get re-run; only append when the entry is not already present
# so sys.path does not accumulate duplicates.
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [None]:
import time
import torch.backends.cudnn as cudnn
import torch.optim
from model import SSD300, MultiBoxLoss
import torch.utils.data
from utils import *
from datasets import CustomDataset

In [None]:
# Folder holding the training data; passed to CustomDataset when the loader is built.
data_folder = '/content/drive/MyDrive/SSD_detection/train'

In [None]:
# NOTE(review): a `global` statement at cell (module) top level has no effect --
# names assigned here are already module-level. It only matters inside a function body.
global start_epoch, label_map, epoch, checkpoint, decay_lr_at

In [None]:
# ---- Model configuration ----
# Number of classes is derived from `label_map` (expected to come from the
# star-import of `utils`).
n_classes = len(label_map)
print(n_classes)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# ---- Training configuration ----
# Path of the checkpoint to resume from; set to None to start from scratch.
checkpoint = "/content/drive/MyDrive/SSD_detection/checkpoints/checkpoint_ssd300_epoch_16.pth.tar"

batch_size = 4      # samples per batch fed to the DataLoader
iterations = 3      # not referenced by the other visible cells
# workers = 4
print_freq = 100    # log training status every `print_freq` batches

# Optimizer settings (SGD).
lr = 0.001
momentum = 0.9
weight_decay = 0.0005

# Learning-rate schedule: at each milestone the current rate is multiplied by
# `decay_lr_to`. These milestones are replaced by epoch numbers in a later cell.
decay_lr_at = [80000, 100000]
decay_lr_to = 0.1

# Gradient clipping is skipped while this is None.
grad_clip = None

# Enable cuDNN's benchmark mode.
cudnn.benchmark = True

21
cuda


In [None]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    Run one epoch of training.

    Reads the module-level settings `device`, `grad_clip` and `print_freq`.

    :param train_loader: iterable yielding (images, boxes, labels) batches, where
                         boxes and labels are per-image sequences
    :param model: called as model(images); must return (predicted_locs, predicted_scores)
    :param criterion: loss callable taking (predicted_locs, predicted_scores, boxes, labels)
    :param optimizer: optimizer; zero_grad() and step() are called once per batch
    :param epoch: epoch number, used only in the progress log line
    :return: None
    """
    model.train()  # switch the model to training mode

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    start = time.time()

    # Batches
    for i, (images, boxes, labels) in enumerate(train_loader):
        # Time spent waiting for this batch to be produced by the loader
        data_time.update(time.time() - start)

        # Move the batch to the configured device
        images = images.to(device)  # (batch_size (N), 3, 300, 300) per the original author's note
        boxes = [b.to(device) for b in boxes]
        labels = [l.to(device) for l in labels]

        # Forward prop.: `images` is what the model's forward() receives
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients, if necessary
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_loader),
                                                                  batch_time=batch_time,
                                                                  data_time=data_time, loss=losses))
    # No explicit `del` of the per-batch locals here: they are released when the
    # function returns, and deleting them unconditionally raised an error when
    # the loader produced no batches (the names were never bound).

In [None]:
if checkpoint is None:
    # Fresh run: build the model and its optimizer from scratch.
    start_epoch = 0
    # Use the class count derived from the label map instead of a hard-coded 21
    # so the model stays consistent with the dataset's labels.
    model = SSD300(n_classes=n_classes)

    # Split trainable parameters so biases can use twice the base learning rate.
    biases = list()
    not_biases = list()
    for param_name, param in model.named_parameters():
        if param.requires_grad:
            if param_name.endswith('.bias'):
                biases.append(param)
            else:
                not_biases.append(param)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                lr=lr, momentum=momentum, weight_decay=weight_decay)

else:
    # `checkpoint` starts out as a path and is replaced by the loaded dict.
    # Only load when it has not been loaded yet, so re-running this cell does
    # not try to torch.load() a dict.
    if not isinstance(checkpoint, dict):
        # SECURITY: torch.load unpickles arbitrary objects -- only load checkpoints you trust.
        # map_location=device lets a checkpoint saved on GPU be resumed on a CPU-only runtime.
        # NOTE(review): recent PyTorch releases default to weights_only=True, which rejects
        # whole pickled model objects like this one; pass weights_only=False there -- confirm
        # against the installed version.
        checkpoint = torch.load(checkpoint, map_location=device)
    start_epoch = checkpoint['epoch'] + 1
    print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']


Loaded checkpoint from epoch 17.



In [None]:
# Place the model on the selected device.
model = model.to(device)

# Loss function, built from the model's prior boxes and moved to the same device.
criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy)
criterion = criterion.to(device)

# Training dataset and its batch loader. The dataset supplies its own
# collate function, which is handed to the DataLoader.
train_dataset = CustomDataset(data_folder, split='train')
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
    pin_memory=True,
)

In [None]:
# Total number of epochs to train for, and the epoch numbers at which the
# learning rate is decayed (epoch-based, replacing the iteration-based values
# assigned in the hyper-parameter cell).
epochs, decay_lr_at = 20, [10, 18]
print(epochs, decay_lr_at, sep='\n')

20
[10, 18]


In [None]:
# Epochs
for epoch in range(start_epoch, epochs):
    # Decay learning rate at particular epochs
    if epoch in decay_lr_at:
        adjust_learning_rate(optimizer, decay_lr_to)

    # One epoch's train, train 함수로 학습 진행
    train(train_loader=train_loader, model=model, criterion=criterion,
          optimizer=optimizer,
          epoch=epoch)
    # Save checkpoint
    save_checkpoint(epoch, model, optimizer)

Epoch: [17][0/2000]	Batch Time 11.134 (11.134)	Data Time 4.621 (4.621)	Loss 1.6977 (1.6977)	
Epoch: [17][100/2000]	Batch Time 3.068 (3.361)	Data Time 2.818 (3.051)	Loss 2.2388 (2.1518)	
Epoch: [17][200/2000]	Batch Time 3.138 (3.301)	Data Time 2.887 (3.022)	Loss 2.0402 (2.1335)	
Epoch: [17][300/2000]	Batch Time 3.129 (3.266)	Data Time 2.877 (2.998)	Loss 0.9781 (2.1136)	
Epoch: [17][400/2000]	Batch Time 3.246 (3.270)	Data Time 3.019 (3.006)	Loss 1.1402 (2.0933)	
Epoch: [17][500/2000]	Batch Time 3.446 (3.263)	Data Time 3.212 (3.004)	Loss 1.2110 (2.0769)	
Epoch: [17][600/2000]	Batch Time 3.213 (3.257)	Data Time 2.979 (2.999)	Loss 2.5041 (2.0551)	
Epoch: [17][700/2000]	Batch Time 3.211 (3.257)	Data Time 2.959 (3.001)	Loss 2.7339 (2.0424)	
Epoch: [17][800/2000]	Batch Time 3.403 (3.255)	Data Time 3.172 (3.000)	Loss 1.8217 (2.0508)	
Epoch: [17][900/2000]	Batch Time 3.571 (3.256)	Data Time 3.316 (3.003)	Loss 1.2730 (2.0459)	
Epoch: [17][1000/2000]	Batch Time 3.262 (3.254)	Data Time 3.027 (3.001