In [1]:
import os
import random
import math

import numpy as np
import pandas as pd
from PIL import Image, ImageDraw
import cv2



import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

import torchvision
from torchvision import datasets, models, transforms
from torchvision.models.detection.retinanet import RetinaNet
from torchvision.models.detection.faster_rcnn import FasterRCNN
from torchvision.models.detection.anchor_utils import AnchorGenerator

import  torchvision.transforms.functional as F

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import xml.etree.ElementTree as ET
import collections
from torchvision.datasets.voc import VisionDataset

from functions import *
from functions_torch import *


import albumentations as A
from albumentations.pytorch import ToTensorV2



In [2]:
# Hyper-parameters shared by the cells below.
params = {
    'target_size': (2000, 1500),  # fed to A.Resize and to the model's min_size / max_size
    'batch_size': 4,              # batch size of the training DataLoader
    'lr': 0.001,                  # initial learning rate of the SGD optimizers
}

# Dataset root (Pascal VOC directory layout) and weights directory,
# both relative to the notebook's working directory.
voc_root = '../../ladd-and-weights/dataset/full_train_ds'
weights_root = '../../ladd-and-weights/weights/torch'



In [3]:
# Reworked class from pytorch (see https://pytorch.org/vision/0.8/_modules/torchvision/datasets/voc.html#VOCDetection)

class LADDDataSET(torchvision.datasets.VisionDataset):
    """Single-class detection dataset stored in a Pascal VOC directory layout.

    Images are read from ``<root>/JPEGImages/<name>.jpg``, annotations from
    ``<root>/Annotations/<name>.xml`` and the sample names from
    ``<root>/ImageSets/Main/<image_set>.txt`` (one name per line).

    Args:
        root: Dataset root directory.
        image_set: Name of the split file without the ``.txt`` extension.
        transforms: Callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` that returns a
            mapping with the keys ``'image'``, ``'bboxes'`` and ``'labels'``.
            Used by ``__getitem__``.
        transforms_wo_norm: Callable with the same calling convention, used by
            ``get_wo_norm`` (e.g. the same pipeline without normalization, for
            visualisation).

    Raises:
        RuntimeError: If ``root`` is not an existing directory.
    """

    def __init__(
            self,
            root: str,
            image_set: str,
            transforms: Callable,
            transforms_wo_norm: Callable
            ):
        # The base class is relied upon to expose the callable as
        # ``self.transforms`` (read back in ``__getitem__``).
        super(LADDDataSET, self).__init__(root, transforms=transforms)
        self.image_set = image_set

        voc_root = root
        if not os.path.isdir(voc_root):
            raise RuntimeError('Dataset not found or corrupted.')

        image_dir = os.path.join(voc_root, 'JPEGImages')
        annotation_dir = os.path.join(voc_root, 'Annotations')
        splits_dir = os.path.join(voc_root, 'ImageSets/Main')
        split_f = os.path.join(splits_dir, image_set.rstrip('\n') + '.txt')

        # Skip blank lines so a stray empty line in the split file does not
        # turn into a bogus ".jpg" / ".xml" entry.
        with open(split_f, "r") as f:
            file_names = [line.strip() for line in f if line.strip()]

        self.images = [os.path.join(image_dir, x + ".jpg") for x in file_names]
        self.annotations = [os.path.join(annotation_dir, x + ".xml") for x in file_names]
        self.transforms_wo_norm = transforms_wo_norm
        assert (len(self.images) == len(self.annotations))

    def get_data(self, index: int, transforms: Callable):
        """Load sample ``index`` and run it through ``transforms``.

        Args:
            index: Position of the sample in the split.
            transforms: Callable invoked as
                ``transforms(image=..., bboxes=..., labels=...)``.

        Returns:
            Tuple ``(image, target)``: ``image`` is whatever the transform
            returns under ``'image'``; ``target`` is a dict with ``"boxes"``
            (float32 tensor of shape ``(N, 4)``, coordinates in the order
            xmin, ymin, xmax, ymax) and ``"labels"`` (int64 tensor; every
            object is labelled ``1``).

        Raises:
            OSError: If the image file cannot be read by OpenCV.
        """
        image_path = self.images[index]
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising, so check explicitly for a clear error message.
        image = cv2.imread(image_path)
        if image is None:
            raise OSError('Could not read image: ' + image_path)
        # By default OpenCV uses BGR color space for color images,
        # so we need to convert the image to RGB color space.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        description = LADDDataSET.parse_voc_xml(
            ET.parse(self.annotations[index]).getroot())

        # Collect bounding box coordinates; the dataset has a single
        # foreground class, so every box gets label 1.
        boxes = []
        labels = []
        for obj in description['annotation']['object']:
            bb = obj['bndbox']
            boxes.append([int(bb['xmin']), int(bb['ymin']), int(bb['xmax']), int(bb['ymax'])])
            labels.append(1)

        augmented = transforms(image=image, bboxes=boxes, labels=labels)
        image = augmented['image']
        target = {}
        # An image without objects produces an empty list, which as_tensor
        # turns into shape (0,). Reshape so the tensor always has the
        # (N, 4) layout, including N == 0.
        target["boxes"] = torch.as_tensor(
            augmented['bboxes'], dtype=torch.float32).reshape(-1, 4)
        target["labels"] = torch.as_tensor(augmented['labels'], dtype=torch.int64)

        return image, target

    def __getitem__(self, index: int):
        """Return ``(image, target)`` for ``index`` using the main transforms."""
        return self.get_data(index, self.transforms)

    def get_wo_norm(self, index: int):
        """Return ``(image, target)`` for ``index`` using ``transforms_wo_norm``."""
        return self.get_data(index, self.transforms_wo_norm)

    def __len__(self) -> int:
        """Number of samples listed in the split file."""
        return len(self.images)

    @staticmethod
    def parse_voc_xml(node: ET.Element) -> Dict[str, Any]:
        """Recursively convert an XML element into nested dictionaries.

        A leaf element with text becomes ``{tag: stripped_text}``. An element
        with children becomes ``{tag: {child_tag: value, ...}}``; a child tag
        that occurs several times is collected into a list. The ``object``
        entry directly under ``annotation`` is always a list (empty when the
        annotation contains no objects).

        Args:
            node: Element to convert.

        Returns:
            Dictionary representation of ``node``.
        """
        voc_dict: Dict[str, Any] = {}
        children = list(node)
        if children:
            def_dic: Dict[str, Any] = collections.defaultdict(list)
            for dc in map(LADDDataSET.parse_voc_xml, children):
                for ind, v in dc.items():
                    def_dic[ind].append(v)
            if node.tag == 'annotation':
                # Wrap once more so the single-element unwrapping below keeps
                # 'object' as a list regardless of the number of objects.
                def_dic['object'] = [def_dic['object']]
            voc_dict = {
                node.tag:
                    {ind: v[0] if len(v) == 1 else v
                     for ind, v in def_dic.items()}
            }
        if node.text:
            text = node.text.strip()
            if not children:
                voc_dict[node.tag] = text
        return voc_dict

In [4]:
# # The PyTorch implementation of RetinaNet doesn't support training on images without any objects (which probably needs to be fixed)
# # see https://github.com/pytorch/vision/blob/master/torchvision/models/detection/retinanet.py#L475
# # As a temporary solution, we just filter out the empty images

# splits_dir = os.path.join(voc_root, 'ImageSets/Main') 
# annotation_dir = os.path.join(voc_root, 'Annotations')

# with open(os.path.join(splits_dir,'train.txt'), "r") as f:
#     file_names = [x.strip() for x in f.readlines()]

# non_empty = []
# for a in file_names:
#     description = LADDDataSET.parse_voc_xml(
#         ET.parse(os.path.join(annotation_dir, a + ".xml")).getroot()
#     )
#     num_objs = len(description['annotation']['object'])
#     if num_objs > 0:
#         non_empty.append(a+'\n')
        
# with open(os.path.join(splits_dir,'train_non_empty.txt'), "w") as f:
#     f.writelines(non_empty)

# print('Total images '+str(len(file_names)), ' non empty: '+str(len(non_empty)))
                                                
                                    
        

In [5]:
# Each pipeline gets its own freshly constructed steps, so the small
# factories below are called once per pipeline rather than sharing instances.

def _resize():
    """Resize step built from params['target_size'].

    NOTE(review): A.Resize takes (height, width) positionally, so
    target_size is interpreted as (height, width) here — confirm this is
    the intended orientation.
    """
    target = params['target_size']
    return A.Resize(target[0], target[1])


def _normalize():
    """Normalization step with the standard ImageNet mean / std."""
    return A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )


def _bbox_params():
    """Boxes are in pascal_voc format; class ids travel in the 'labels' field."""
    return A.BboxParams(format='pascal_voc', label_fields=['labels'])


# Training: resize + random flip / channel shuffle + normalization.
albumentations_transform_train = A.Compose(
    [
        _resize(),
        A.HorizontalFlip(p=0.5),
        A.ChannelShuffle(p=0.5),
        _normalize(),
        ToTensorV2(),
    ],
    bbox_params=_bbox_params(),
)

# Validation: resize + normalization only.
albumentations_transform_val = A.Compose(
    [
        _resize(),
        _normalize(),
        ToTensorV2(),
    ],
    bbox_params=_bbox_params(),
)

# Viewing variant of the training pipeline: augmentations always applied
# (p=1.0) and no normalization, so the result can be displayed directly.
albumentations_transform_train_view = A.Compose(
    [
        _resize(),
        A.HorizontalFlip(p=1.0),
        A.ChannelShuffle(p=1.0),
        ToTensorV2(),
    ],
    bbox_params=_bbox_params(),
)

# Viewing variant of the validation pipeline: resize only, no normalization.
albumentations_transform_val_view = A.Compose(
    [
        _resize(),
        ToTensorV2(),
    ],
    bbox_params=_bbox_params(),
)

In [None]:
# Visual sanity check of the dataset: load one sample through the
# un-normalized training pipeline and draw its boxes in red.
im_idx = 99

dataset = LADDDataSET(
    voc_root,
    'test',
    albumentations_transform_train,
    albumentations_transform_train_view,
)
image, target = dataset.get_wo_norm(im_idx)
im = F.to_pil_image(image)
draw = ImageDraw.Draw(im)

for bb in target['boxes']:
    # Closed rectangle outline: top-left -> bottom-left -> bottom-right
    # -> top-right -> back to top-left.
    corners = [
        (bb[0], bb[1]),
        (bb[0], bb[3]),
        (bb[2], bb[3]),
        (bb[2], bb[1]),
        (bb[0], bb[1]),
    ]
    draw.line(corners, width=4, fill=(255, 0, 0))

im.show()

In [7]:
# Datasets: the training split uses the 'train_non_empty' image set,
# validation uses 'val'. Each dataset also carries its un-normalized
# "view" pipeline for visualisation.
dataset_train = LADDDataSET(
    root=voc_root,
    image_set='train_non_empty',
    transforms=albumentations_transform_train,
    transforms_wo_norm=albumentations_transform_train_view,
)
dataset_val = LADDDataSET(
    root=voc_root,
    image_set='val',
    transforms=albumentations_transform_val,
    transforms_wo_norm=albumentations_transform_val_view,
)

# Training loader: shuffled batches of params['batch_size'] samples.
data_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=params['batch_size'],
    shuffle=True,
    num_workers=4,
    collate_fn=collate_fn,
)

# Validation loader: one image at a time, in dataset order.
data_loader_val = torch.utils.data.DataLoader(
    dataset_val,
    batch_size=1,
    shuffle=False,
    num_workers=16,
    collate_fn=collate_fn,
)


In [8]:
# For every base size x build three anchor sizes: x, x * 2^(1/3) and
# x * 2^(2/3), truncated to integers (e.g. 16 -> (16, 20, 25)).
anchor_sizes = tuple(
    (base, int(base * 2 ** (1.0 / 3)), int(base * 2 ** (2.0 / 3)))
    for base in (16, 32, 64, 128, 256)
)
# The same four aspect ratios are used for every group of sizes.
aspect_ratios = tuple((0.5, 1.0, 2.0, 3.0) for _ in anchor_sizes)
anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)

In [9]:
# Stage 1 model: RetinaNet (ResNet-50 FPN) with 2 classes, the custom anchor
# generator and trainable_backbone_layers=0.
# pretrained_backbone needs to be set for trainable_backbone_layers to take
# effect, otherwise it is ignored.
# NOTE(review): with params['target_size'] == (2000, 1500) this passes
# min_size=2000 and max_size=1500, i.e. min_size > max_size — confirm that the
# resulting internal rescaling is intended.
model = torchvision.models.detection.retinanet_resnet50_fpn(
    pretrained=False,
    num_classes=2,
    pretrained_backbone=True,
    min_size=params['target_size'][0],
    max_size=params['target_size'][1],
    trainable_backbone_layers=0,
    anchor_generator=anchor_generator,
)

# Deserialize the checkpoint onto the CPU: the model itself still lives on the
# CPU at this point, so this avoids a temporary copy of the weights in GPU
# memory and also works on machines without CUDA.
# strict=False: missing / unexpected keys are reported instead of raising.
model.load_state_dict(
    torch.load(
        os.path.join(weights_root, 'pretrain', 'resnet50_SDD.pth'),
        map_location='cpu',
    ),
    strict=False,
)

<All keys matched successfully>

In [10]:
# model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2, pretrained_backbone=True, 
#                                                            min_size=params['target_size'][0], max_size = params['target_size'][1],
#                                                            trainable_backbone_layers = 0)
# # Need to set pretrained_backbone to use trainable_backbone_layers, otherwise it's ignored
# model.load_state_dict(torch.load('/app/host/lacmus/weights/resnet50_FRCNN_SDD_epoch_4.pth'), strict=False)



In [11]:
# the computation device
device = torch.device('cuda')
# device = torch.device('cpu')

model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], momentum=0.9, weight_decay=0.0005) 
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

In [12]:
# Stage 1: 10 epochs with the model built with trainable_backbone_layers=0.
for epoch in range(10): # train without backbone
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=100)
    print ("Train done, evaluating.")

    # One scheduler step per epoch updates the learning rate.
    lr_scheduler.step()

    # Run inference on the validation loader and report the metric at two
    # IoU thresholds, keeping detections with score >= 0.05.
    inference_res = evaluate(model,data_loader_val)
    print('Inference done, computing mAp : ')
    for iou_threshold in (0.5, 0.6):
        print(evaluate_res(inference_res, iou_threshold = iou_threshold, score_threshold = 0.05))
    print('Epoch Done')

# Persist the weights once, after the last epoch; stage 2 starts from this file.
torch.save(model.state_dict(), 'resnet50_RetinaNet_LADD_head.pth')

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: [0]  [  0/295]  eta: 0:11:47  lr: 0.000004  loss: 2.2853 (2.2853)  classification: 1.5119 (1.5119)  bbox_regression: 0.7735 (0.7735)  time: 2.3986  data: 1.2279  max mem: 3469
Epoch: [0]  [100/295]  eta: 0:03:47  lr: 0.000344  loss: 1.4946 (1.8446)  classification: 0.8052 (1.1239)  bbox_regression: 0.6467 (0.7208)  time: 1.1675  data: 0.0234  max mem: 3547
Epoch: [0]  [200/295]  eta: 0:01:50  lr: 0.000684  loss: 1.2573 (1.6019)  classification: 0.6739 (0.9456)  bbox_regression: 0.5788 (0.6563)  time: 1.1734  data: 0.0235  max mem: 3577
Epoch: [0]  [294/295]  eta: 0:00:01  lr: 0.001000  loss: 1.2312 (1.4785)  classification: 0.6612 (0.8508)  bbox_regression: 0.5538 (0.6277)  time: 1.1752  data: 0.0240  max mem: 8164
Epoch: [0] Total time: 0:05:45 (1.1712 s / it)
Train done, evaluating.
Inference done, computing mAp : 
(0.21985391095081178, 0.026562345902232158)
(0.0741508144717369, 0.016831585522206514)
Epoch Done
Epoch: [1]  [  0/295]  eta: 0:13:21  lr: 0.001000  loss: 1.6292 (1

In [13]:
import gc

# Release the stage-1 objects before building the stage-2 model.
# The scheduler keeps a reference to the optimizer, and the optimizer keeps
# references to the model parameters and its momentum buffers, so all three
# names must be dropped; deleting only `model` and `optimizer` would leave
# those tensors alive and nothing would actually be freed.
del model, optimizer, lr_scheduler
gc.collect()
# Hand the now-unused cached blocks back to the CUDA driver.
torch.cuda.empty_cache()

In [14]:
# Rebuild the anchor generator with the same configuration as in stage 1:
# sizes x, x * 2^(1/3), x * 2^(2/3) (truncated to int) for each base size x,
# and four aspect ratios for every group of sizes.
anchor_sizes = tuple((x, int(x * 2 ** (1.0 / 3)), int(x * 2 ** (2.0 / 3))) for x in [16, 32, 64, 128, 256])
aspect_ratios = ((0.5, 1.0, 2.0, 3.0),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(
    anchor_sizes, aspect_ratios
)

# Stage 2 model: same architecture as stage 1 but built with
# trainable_backbone_layers=5.
# NOTE(review): with params['target_size'] == (2000, 1500) this passes
# min_size=2000 and max_size=1500, i.e. min_size > max_size — confirm intended.
model = torchvision.models.detection.retinanet_resnet50_fpn(
    pretrained=False,
    num_classes=2,
    pretrained_backbone=True,
    min_size=params['target_size'][0],
    max_size=params['target_size'][1],
    trainable_backbone_layers=5,
    anchor_generator=anchor_generator,
)

# Start from the weights saved at the end of stage 1. The checkpoint is
# deserialized onto the CPU because the model still lives on the CPU here;
# this avoids a temporary copy of the weights in GPU memory and works without
# CUDA. strict=True: every key must match.
model.load_state_dict(
    torch.load('resnet50_RetinaNet_LADD_head.pth', map_location='cpu'),
    strict=True,
)



<All keys matched successfully>

In [15]:
# model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=2, pretrained_backbone=True, 
#                                                            min_size=params['target_size'][0], max_size = params['target_size'][1],
#                                                            trainable_backbone_layers = 5)
# model.load_state_dict(torch.load('/app/host/lacmus/weights/resnet50_FRCNN_LADD_head.pth'), strict=True)


In [16]:
# the computation device
device = torch.device('cuda')
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], momentum=0.9, weight_decay=0.0005) 
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=4,
                                               gamma=0.1)


In [17]:
# Stage 2: 12 epochs with the model built with trainable_backbone_layers=5.
for epoch in range(12): # train with backbone now
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=100)
    print ("Train done, evaluating.")

    # One scheduler step per epoch updates the learning rate.
    lr_scheduler.step()

    # Run inference on the validation loader and report the metric at two
    # IoU thresholds, keeping detections with score >= 0.05.
    inference_res = evaluate(model,data_loader_val)
    print('Inference done, computing mAp : ')
    for iou_threshold in (0.5, 0.6):
        print(evaluate_res(inference_res, iou_threshold = iou_threshold, score_threshold = 0.05))
    print('Epoch Done')

    # Save a checkpoint after every epoch; file numbering is offset by 10,
    # so the first stage-2 epoch is written as epoch 10.
    checkpoint_name = 'resnet50_RN_LADD_epoch_%i.pth' % (epoch + 10)
    torch.save(model.state_dict(), checkpoint_name)

# was    
# Epoch: [9] Total time: 0:09:23 (0.4774 s / it)
# Train done, evaluating.
# Inference done, computing mAp : 
# 0.8940549518273289
# 0.8334125499913912

Epoch: [0]  [  0/295]  eta: 0:17:08  lr: 0.000004  loss: 0.5877 (0.5877)  classification: 0.2064 (0.2064)  bbox_regression: 0.3813 (0.3813)  time: 3.4848  data: 1.5528  max mem: 8543
Epoch: [0]  [100/295]  eta: 0:06:13  lr: 0.000344  loss: 0.9916 (0.9058)  classification: 0.5213 (0.4497)  bbox_regression: 0.4703 (0.4561)  time: 1.9013  data: 0.0215  max mem: 13375
Epoch: [0]  [200/295]  eta: 0:03:01  lr: 0.000684  loss: 0.7527 (0.8682)  classification: 0.3347 (0.4161)  bbox_regression: 0.4344 (0.4521)  time: 1.9040  data: 0.0218  max mem: 13375
Epoch: [0]  [294/295]  eta: 0:00:01  lr: 0.001000  loss: 0.7914 (0.8501)  classification: 0.3440 (0.4005)  bbox_regression: 0.4364 (0.4496)  time: 1.9008  data: 0.0212  max mem: 13375
Epoch: [0] Total time: 0:09:22 (1.9080 s / it)
Train done, evaluating.
Inference done, computing mAp : 
(0.6454312257489907, 0.06352670182457416)
(0.47089293577780883, 0.05322179790264897)
Epoch Done
Epoch: [1]  [  0/295]  eta: 0:17:18  lr: 0.001000  loss: 0.7814 (

In [None]:
# # # uncomment to test evaluation model and show detections

# dataset_test = LADDDataSET(voc_root,'test',albumentations_transform_val, albumentations_transform_val_view) 
# data_loader_test = torch.utils.data.DataLoader(
#     dataset_val, batch_size=1, shuffle=False, num_workers=1
#      ,collate_fn=collate_fn
# )

# image_idx = 0

# cpu_device = torch.device("cpu")
# model.eval()
# for images, targets in data_loader_test:
#     g_images = list(img.to(device) for img in images)

#     if torch.cuda.is_available():
#         torch.cuda.synchronize()
#     outputs = model(g_images)

#     outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
#     res = targets, outputs
#     break

# (image,target) = dataset_test.get_wo_norm(im_idx) 
# im = F.to_pil_image(image)
# draw = ImageDraw.Draw(im)

# for idx in range(len(outputs[image_idx]['boxes'])):
#     width = math.ceil(outputs[image_idx]['scores'][idx]*10)
#     bb = outputs[0]['boxes'][idx]
#     draw.line([(bb[0], bb[1]), (bb[0], bb[3]), (bb[2], bb[3]),
#                (bb[2], bb[1]), (bb[0], bb[1])], width=width, fill=(255, 0, 0))

# for bb in targets[image_idx]['boxes'][:10]:
#     draw.line([(bb[0], bb[1]), (bb[0], bb[3]), (bb[2], bb[3]),
#                (bb[2], bb[1]), (bb[0], bb[1])], width=4, fill=(0,255, 0))
# im.show()



In [None]:
# img =  Image.open('..\').convert('RGB')
# g_images = list(img.to(device) for img in images)

#     if torch.cuda.is_available():
#         torch.cuda.synchronize()
#     outputs = model(g_images)

#     outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
#     res = targets, outputs
#     break
