In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [3]:
from collections import defaultdict
import gc
from pathlib import Path
import time

import numpy as np
import torch

from matplotlib import pyplot as plt
from torch.utils import data
from torchvision import transforms
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.tensorboard import SummaryWriter
import tqdm

from detection import create_detection_model, DetectionDataset, Flip, PerspectiveTransform

%matplotlib inline

# Single seed shared by NumPy's global RNG and PyTorch's default generator,
# so both libraries start from the same reproducible state.
SEED = 2205
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f8728303dd0>

In [4]:
def _to_tensor_steps():
    """Return fresh (transform, target) pairs converting an image to a tensor via PIL."""
    return [
        (transforms.ToPILImage(), 'image'),
        (transforms.ToTensor(), 'image'),
    ]


# Each entry is a (transform, target) pair; the second element is presumably
# read by DetectionDataset to decide whether the transform receives the whole
# sample or only the image -- confirm against the `detection` module.

# Training: augmentations first, then the image -> tensor conversion.
train_transformations = [
    (Flip(), 'sample'),
    (PerspectiveTransform(), 'sample'),
    *_to_tensor_steps(),
]

# Validation: no augmentation, only the image -> tensor conversion.
val_transformations = _to_tensor_steps()

In [6]:
# One dataset per split, both rooted at 'data', each with its own transform list.
train_dataset = DetectionDataset('data', train_transformations, 'train')
val_dataset = DetectionDataset('data', val_transformations, 'val')

# Loader options shared by both splits; only batch size and shuffling differ.
_loader_options = dict(
    num_workers=4,
    pin_memory=True,
    drop_last=True,
    collate_fn=DetectionDataset.collate_fn,
)

# Training batches are shuffled; validation is read one sample at a time, in order.
train_dataloader = data.DataLoader(train_dataset, batch_size=2, shuffle=True, **_loader_options)
val_dataloader = data.DataLoader(val_dataset, batch_size=1, shuffle=False, **_loader_options)

In [7]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the detection network through the project's factory function.
model = create_detection_model()

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /Users/happyhooter/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

In [8]:
# The experiment name doubles as the TensorBoard run directory and as the
# prefix of the best-checkpoint file written by the training loop.
experiment_name = 'SGD_lr_3e-4_plateau'
writer = SummaryWriter(log_dir=f'tb_logs/{experiment_name}')

# Plain SGD with momentum and weight decay over all model parameters.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=3e-4,
    momentum=0.9,
    weight_decay=0.0005,
)

# Multiply the learning rate by 1/sqrt(10) (two reductions == one order of
# magnitude) when the monitored metric has not improved by `threshold` for
# more than `patience` epochs.
# The `verbose` argument is intentionally not passed: False was its default
# and the parameter is deprecated in recent PyTorch releases.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=1 / np.sqrt(10),
    patience=2,
    threshold=1e-3,
)

In [9]:
# Move the model to the selected device before training starts.
model.to(device)

# Train for 20 epochs.
num_epochs = 20

# Iterations (1-based, within an epoch) after which the running batch losses
# are written to TensorBoard: 23 roughly evenly spaced points per epoch.
sub_epochs = np.round(np.linspace(0, len(train_dataloader), 24)).astype(int)[1:]
# Set copy of the same values for O(1) membership tests inside the batch loop.
sub_epoch_steps = set(sub_epochs.tolist())

steps_per_epoch = len(train_dataloader)

# Scalars written to TensorBoard, in logging order: the summed loss followed
# by the individual loss terms (R-CNN head, mask head, RPN).
LOSS_NAMES = (
    'overall_loss',
    'loss_classifier',    # r-cnn
    'loss_box_reg',       # r-cnn
    'loss_mask',          # mask
    'loss_objectness',    # rpn
    'loss_rpn_box_reg',   # rpn
)


def _record_losses(loss_dict, total_loss, *histories):
    """Append every loss in ``loss_dict`` plus ``total_loss`` to each history.

    Each tensor is converted with ``.item()`` exactly once, so logging into
    several histories does not cost additional device synchronisations.
    """
    values = {name: value.item() for name, value in loss_dict.items()}
    values['overall_loss'] = total_loss.item()
    for history in histories:
        for name, value in values.items():
            history[name].append(value)


def _log_mean_losses(tag_prefix, history, step):
    """Write the mean of every tracked loss to TensorBoard under ``tag_prefix``."""
    for name in LOSS_NAMES:
        writer.add_scalar(f'{tag_prefix}/{name}', np.mean(history[name]), step)


best_loss = np.inf
prev_lr = optimizer.param_groups[0]['lr']

for epoch in range(num_epochs):
    # Drop references to the previous epoch's tensors and release cached memory.
    loss_dict = None
    losses = None
    gc.collect()
    torch.cuda.empty_cache()

    # If the scheduler lowered the learning rate, restart from the best checkpoint.
    if optimizer.param_groups[0]['lr'] < prev_lr:
        prev_lr = optimizer.param_groups[0]['lr']
        with open(f'{experiment_name}_best.pth', 'rb') as fp:
            state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(state_dict)
        model.to(device)

    model.train()
    mask_rcnn_losses = defaultdict(list)   # losses over the whole epoch
    sub_epoch_losses = defaultdict(list)   # losses since the last batch-level log

    for i, (images, targets) in enumerate(tqdm.tqdm(train_dataloader, desc=f'Train epoch {epoch + 1}')):
        num_iteration = steps_per_epoch * epoch + i

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        _record_losses(loss_dict, losses, mask_rcnn_losses, sub_epoch_losses)

        optimizer.zero_grad()
        losses.backward()

        # Global L2 norm of all gradients. Trainable parameters that took no
        # part in this forward pass have ``grad is None`` and are skipped
        # instead of raising AttributeError.
        squared_norm = 0.0
        for p in model.parameters():
            if p.requires_grad and p.grad is not None:
                squared_norm += p.grad.detach().norm(2).item() ** 2
        total_norm = squared_norm ** 0.5
        # NOTE: the tag spelling 'GradNorn' is kept as-is so the scalar stays
        # comparable with runs that were already logged under this name.
        writer.add_scalar('Detection/Train/Batches/GradNorn',
                          total_norm, num_iteration)

        optimizer.step()

        # Periodically log the mean losses accumulated since the previous log.
        if i + 1 in sub_epoch_steps:
            _log_mean_losses('Detection/Train/Batches', sub_epoch_losses, num_iteration)
            sub_epoch_losses = defaultdict(list)

    _log_mean_losses('Detection/Train/Epochs', mask_rcnn_losses, epoch)

    print(f'Train_loss: {np.mean(mask_rcnn_losses["overall_loss"])}')
    time.sleep(0.5)
    loss_dict = None
    losses = None
    gc.collect()
    torch.cuda.empty_cache()

    # Validation. The model deliberately stays in train mode: torchvision
    # detection models return the loss dict only in training mode. Gradients
    # are not needed here, so autograd is disabled to save memory and time.
    mask_rcnn_losses = defaultdict(list)
    with torch.no_grad():
        for images, targets in tqdm.tqdm(val_dataloader, desc=f'Validation epoch {epoch + 1}'):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            _record_losses(loss_dict, losses, mask_rcnn_losses)

    valid_loss = np.mean(mask_rcnn_losses['overall_loss'])

    # Keep the weights with the lowest validation loss seen so far.
    if valid_loss < best_loss:
        best_loss = valid_loss
        with open(f'{experiment_name}_best.pth', 'wb') as fp:
            torch.save(model.state_dict(), fp)

    _log_mean_losses('Detection/Valid', mask_rcnn_losses, epoch)

    # The plateau scheduler monitors the validation loss.
    lr_scheduler.step(valid_loss)

    print(f'Valid_loss: {valid_loss}')
    time.sleep(0.5)

Train epoch 1:   0%|          | 46/11535 [22:56<95:29:03, 29.92s/it] 


KeyboardInterrupt: 

In [9]:
# Restore the weights saved under this experiment's best-checkpoint file.
# Tensors are first mapped to the CPU, then the model is moved back to the
# working device.
best_checkpoint_path = Path(f'{experiment_name}_best.pth')
with best_checkpoint_path.open('rb') as checkpoint_file:
    state_dict = torch.load(checkpoint_file, map_location="cpu")
model.load_state_dict(state_dict)
model.to(device)

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequentia