In [None]:
import sys
sys.path.insert(0, "timm-efficientdet-pytorch")
sys.path.insert(0, "omegaconf")

import torch
import os
from datetime import datetime
import time
import random
import cv2
import pandas as pd
import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from glob import glob

SEED = 42

def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy, and PyTorch (CPU and every CUDA device), then puts cuDNN
    into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one; this is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN benchmarking auto-selects kernels per run, which can pick different
    # algorithms between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [None]:
# Load the annotation table and expand its serialized 'bbox' column into four
# numeric columns named x, y, w and h.
marking = pd.read_csv('/home/hy/dataset/gwd/train.csv')

# Each 'bbox' value is a string whose first and last characters are stripped
# (presumably the enclosing brackets) before the comma-separated numbers are parsed.
bboxs = np.stack([np.fromstring(raw[1:-1], sep=',') for raw in marking['bbox']])
for column, values in zip(['x', 'y', 'w', 'h'], bboxs.T):
    marking[column] = values
marking.drop(columns=['bbox'], inplace=True)

In [None]:
# Assign every image to one of five validation folds, stratified by the image
# source combined with a coarse bucket of its bounding-box count.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One row per image_id with the number of annotated boxes on that image.
df_folds = marking[['image_id']].assign(bbox_count=1).groupby('image_id').count()
df_folds['source'] = marking.groupby('image_id')['source'].min()

# Stratification key: "<source>_<bbox_count // 15>".
source_part = df_folds['source'].values.astype(str)
count_part = df_folds['bbox_count'].apply(lambda x: f'_{x // 15}').values.astype(str)
df_folds['stratify_group'] = np.char.add(source_part, count_part)
df_folds['fold'] = 0

fold_splits = skf.split(X=df_folds.index, y=df_folds['stratify_group'])
for fold_number, (train_index, val_index) in enumerate(fold_splits):
    # Mark the images held out in this split with the split's number.
    df_folds.loc[df_folds.index[val_index], 'fold'] = fold_number

## Albumentations

In [None]:
def get_train_transforms():
    """Build the Albumentations pipeline applied to training samples.

    The pipeline crops, jitters colour, occasionally converts to grayscale,
    flips, resizes to 1024x1024, cuts out square holes and finally converts
    the image to a tensor. Boxes are expected in ``pascal_voc`` format with
    their classes supplied through the ``labels`` field.
    """
    # Exactly one of the three colour perturbations is picked when OneOf fires.
    colour_jitter = A.OneOf([
        A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit= 0.2,
                             val_shift_limit=0.2, p=0.9),
        A.RandomBrightnessContrast(brightness_limit=0.2,
                                   contrast_limit=0.2, p=0.9),
        A.RandomGamma(p=0.9),
    ],p=0.9)

    augmentations = [
        A.RandomSizedCrop(min_max_height=(800, 800), height=1024, width=1024, p=0.5),
        colour_jitter,
        A.ToGray(p=0.01),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(height=1024, width=1024, p=1),
        A.Cutout(num_holes=8, max_h_size=64, max_w_size=64, fill_value=0, p=0.5),
        ToTensorV2(p=1.0),
    ]

    box_settings = A.BboxParams(
        format='pascal_voc',
        min_area=0,
        min_visibility=0,
        label_fields=['labels']
    )
    return A.Compose(augmentations, p=1.0, bbox_params=box_settings)

def get_valid_transforms():
    """Build the Albumentations pipeline applied to validation samples.

    Only resizes to 1024x1024 and converts the image to a tensor. Boxes are
    expected in ``pascal_voc`` format with classes in the ``labels`` field.
    """
    steps = [
        A.Resize(height=1024, width=1024, p=1.0),
        ToTensorV2(p=1.0),
    ]
    box_settings = A.BboxParams(
        format='pascal_voc',
        min_area=0,
        min_visibility=0,
        label_fields=['labels']
    )
    return A.Compose(steps, p=1.0, bbox_params=box_settings)

## Dataset

In [None]:
TRAIN_ROOT_PATH = '/home/hy/dataset/gwd/train'

class DatasetRetriever(Dataset):
    """Dataset yielding ``(image, target, image_id)`` triples for detection.

    Args:
        marking: DataFrame with one row per box and the columns ``image_id``,
            ``x``, ``y``, ``w`` and ``h``.
        image_ids: Sequence (list or array) of image ids served by this dataset.
        transforms: Optional callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` that returns a
            mapping with at least ``'image'`` and ``'bboxes'``.
        test: When True every sample is a plain image; otherwise roughly half
            of the samples are four-image cutmix mosaics.
    """

    def __init__(self, marking, image_ids, transforms=None, test=False):
        super().__init__()

        self.image_ids = image_ids
        self.marking = marking
        self.transforms = transforms
        self.test = test

    def __getitem__(self, index: int):
        """Return ``(image, target, image_id)`` for the sample at ``index``.

        ``target`` holds ``'boxes'``, ``'labels'`` (all ones, single class) and
        ``'image_id'`` (the integer index wrapped in a tensor). When transforms
        are applied successfully the boxes are returned in yxyx order.
        """
        image_id = self.image_ids[index]

        if self.test or random.random() > 0.5:
            image, boxes = self.load_image_and_boxes(index)
        else:
            image, boxes = self.load_cutmix_image_and_boxes(index)

        # there is only one class
        labels = torch.ones((boxes.shape[0],), dtype=torch.int64)

        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id': torch.tensor([index]),
        }

        if self.transforms:
            # Retry up to 10 times until the augmentation keeps at least one box.
            for _ in range(10):
                sample = self.transforms(image=image, bboxes=target['boxes'], labels=labels)
                if len(sample['bboxes']) > 0:
                    image = sample['image']
                    target['boxes'] = torch.stack(tuple(map(torch.tensor, zip(*sample['bboxes'])))).permute(1, 0)
                    target['boxes'][:,[0,1,2,3]] = target['boxes'][:,[1,0,3,2]]  #yxyx: be warning
                    # Augmentations may drop boxes; rebuild the labels so their
                    # count always matches the surviving boxes.
                    target['labels'] = torch.ones((len(sample['bboxes']),), dtype=torch.int64)
                    break
            # NOTE(review): if no attempt keeps a box, the untransformed image and
            # xyxy boxes fall through unchanged -- confirm callers can cope.

        return image, target, image_id

    def __len__(self) -> int:
        # len() works for lists as well as arrays (``.shape`` required an ndarray).
        return len(self.image_ids)

    def load_image_and_boxes(self, index):
        """Load one image scaled to [0, 1] RGB float32 and its boxes as xyxy.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_id = self.image_ids[index]
        image_path = f'{TRAIN_ROOT_PATH}/{image_id}.jpg'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0
        records = self.marking[self.marking['image_id'] == image_id]
        # Copy so the in-place xywh -> xyxy conversion below can never write
        # into (or be blocked by a read-only view of) the DataFrame's storage.
        boxes = records[['x', 'y', 'w', 'h']].values.copy()
        boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
        boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
        return image, boxes

    def load_cutmix_image_and_boxes(self, index, imsize=1024):
        """ 
        This implementation of cutmix author:  https://www.kaggle.com/nvnnghia 
        Refactoring and adaptation: https://www.kaggle.com/shonenkov

        Builds an ``imsize`` x ``imsize`` mosaic from the image at ``index`` and
        three randomly chosen images, arranged around a random centre point.
        Returns the mosaic and the int32 xyxy boxes that keep a positive area
        after shifting and clipping.
        """
        w, h = imsize, imsize
        s = imsize // 2

        xc, yc = [int(random.uniform(imsize * 0.25, imsize * 0.75)) for _ in range(2)]  # center x, y
        indexes = [index] + [random.randint(0, len(self.image_ids) - 1) for _ in range(3)]

        result_image = np.full((imsize, imsize, 3), 1, dtype=np.float32)
        result_boxes = []

        # ``source_index`` (not ``index``) so the method argument is not shadowed.
        for i, source_index in enumerate(indexes):
            image, boxes = self.load_image_and_boxes(source_index)
            if i == 0:
                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
            elif i == 1:  # top right
                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
            elif i == 2:  # bottom left
                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, max(xc, w), min(y2a - y1a, h)
            elif i == 3:  # bottom right
                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
            result_image[y1a:y2a, x1a:x2a] = image[y1b:y2b, x1b:x2b]
            # Offset that maps source-image coordinates into mosaic coordinates.
            padw = x1a - x1b
            padh = y1a - y1b

            boxes[:, 0] += padw
            boxes[:, 1] += padh
            boxes[:, 2] += padw
            boxes[:, 3] += padh

            result_boxes.append(boxes)

        result_boxes = np.concatenate(result_boxes, 0)
        np.clip(result_boxes[:, 0:], 0, 2 * s, out=result_boxes[:, 0:])
        result_boxes = result_boxes.astype(np.int32)
        # Discard boxes that collapsed to zero area after clipping.
        result_boxes = result_boxes[np.where((result_boxes[:,2]-result_boxes[:,0])*(result_boxes[:,3]-result_boxes[:,1]) > 0)]
        return result_image, result_boxes

In [None]:
fold_number = 0

print("Fold_number:", fold_number)

# Images assigned to `fold_number` are held out for validation; all remaining
# images are used for training.
is_validation = df_folds['fold'] == fold_number

train_dataset = DatasetRetriever(
    marking=marking,
    image_ids=df_folds[~is_validation].index.values,
    transforms=get_train_transforms(),
    test=False,
)

validation_dataset = DatasetRetriever(
    marking=marking,
    image_ids=df_folds[is_validation].index.values,
    transforms=get_valid_transforms(),
    test=True,
)

In [None]:
class AverageMeter(object):
    """Tracks the most recent value of a metric and its weighted running mean.

    Attributes:
        val: Last value passed to ``update``.
        sum: Weighted sum of all values seen since the last ``reset``.
        count: Total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    ``[(a1, b1), (a2, b2)]`` becomes ``((a1, a2), (b1, b2))``; nothing is
    stacked, so fields of varying size per sample are preserved as-is.
    """
    per_field = zip(*batch)
    return tuple(per_field)

## TrainGlobalConfig

In [None]:
class TrainGlobalConfig:
    """Static hyper-parameters and scheduler settings for the training run."""

    num_workers = 6                 # DataLoader worker processes
    batch_size = 3                  # samples per DataLoader batch
    n_epochs = 80
    lr = 0.0002*2
    grad_accumulation_steps = 8     # micro-batches accumulated per optimizer step
    folder = '0716_effdet7-cutmix-augmix_1024_ddp_grad_'  # output directory name

    # -------------------
    verbose = True
    verbose_step = 1                # print progress every `verbose_step` batches
    # -------------------

    # --------------------
    step_scheduler = False # do scheduler.step after optimizer.step
    validation_scheduler = True  # do scheduler.step after validation stage loss

    # Alternative previously tried here: torch.optim.lr_scheduler.OneCycleLR
    # (max_lr=0.001, pct_start=0.1, anneal_strategy='cos', final_div_factor=10**5),
    # which is stepped per optimizer step rather than per validation.

    SchedulerClass = torch.optim.lr_scheduler.ReduceLROnPlateau
    # `verbose` is intentionally not passed: it is deprecated in recent PyTorch
    # releases and its historical default (False) matches the old explicit value.
    scheduler_params = dict(
        mode='min',
        factor=0.5,
        patience=1,
        threshold=0.0001,
        threshold_mode='abs',
        cooldown=0,
        min_lr=1e-8,
        eps=1e-08
    )
# --------------------

## Model

In [None]:
import argparse
from apex.parallel import DistributedDataParallel
from apex.parallel import convert_syncbn_model
from apex import amp

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch. Launchers that export the
# LOCAL_RANK environment variable instead are honoured through the default.
parser.add_argument("--local_rank", default=int(os.environ.get('LOCAL_RANK', 0)), type=int)
# parse_known_args tolerates unrelated command-line entries (for example the
# connection-file option a Jupyter kernel is started with) instead of exiting.
args, _unknown_args = parser.parse_known_args()

# FOR DISTRIBUTED:  If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = int(os.environ.get('WORLD_SIZE', '1')) > 1

if args.distributed:
    # FOR DISTRIBUTED:  Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

# NOTE(review): this re-enables cuDNN auto-tuning for speed and therefore
# overrides any earlier request for fully deterministic kernels.
torch.backends.cudnn.benchmark = True

In [None]:
from effdet import get_efficientdet_config, EfficientDet, DetBenchTrain
from effdet.efficientdet import HeadNet
from torchtools.lr_scheduler import DelayerScheduler
from over9000 import RangerLars 

# Build EfficientDet-D7, load the released weights, then swap in a fresh
# single-class classification head for 1024x1024 inputs.
config = get_efficientdet_config('tf_efficientdet_d7')
net = EfficientDet(config, pretrained_backbone=False)
# map_location='cpu' keeps each process from materialising the checkpoint on
# whatever device it was saved from; the model is moved to the GPU below.
checkpoint = torch.load('efficientdet_d7-f05bf714.pth', map_location='cpu')
net.load_state_dict(checkpoint)
config.num_classes = 1
config.image_size = 1024
net.class_net = HeadNet(config, num_outputs=config.num_classes, norm_kwargs=dict(eps=.001, momentum=.01))
model = DetBenchTrain(net, config)
model.cuda()
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
] 
# NOTE(review): optimizer_grouped_parameters is built but never handed to the
# optimizer, so the per-group weight decay above has no effect. Confirm which
# behaviour is intended before switching, as it changes training results.
#optimizer = RangerLars(model.parameters(),lr=TrainGlobalConfig.lr)
optimizer = torch.optim.AdamW(model.parameters(), lr=TrainGlobalConfig.lr)
if args.distributed:
    # Synchronised batch-norm is only meaningful with an initialised process group.
    model = convert_syncbn_model(model)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
if args.distributed:
    # Wrapping requires the process group created during distributed setup.
    model = DistributedDataParallel(model, delay_allreduce=True)

scheduler = TrainGlobalConfig.SchedulerClass(optimizer, **TrainGlobalConfig.scheduler_params)
#delay_epochs = 15
#total_epochs = TrainGlobalConfig.n_epochs
#base_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, delay_epochs) # delay the scheduler for 15 steps
#scheduler = DelayerScheduler(optimizer, total_epochs - delay_epochs, base_scheduler)

In [None]:
# Pick samplers that match the run mode. DistributedSampler needs an
# initialised process group, so single-process runs use the plain samplers.
if args.distributed:
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    # Validation must not be shuffled; DistributedSampler shuffles by default.
    val_sampler = torch.utils.data.distributed.DistributedSampler(validation_dataset, shuffle=False)
else:
    train_sampler = RandomSampler(train_dataset)
    val_sampler = SequentialSampler(validation_dataset)

In [None]:
# The samplers are passed to the loaders directly. Wrapping them in
# RandomSampler / SequentialSampler would keep only their *length* and draw
# indices from range(len(sampler)), i.e. from the first slice of the dataset,
# instead of the indices the wrapped sampler actually assigns to this process.
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        sampler=train_sampler,
        pin_memory=False,
        drop_last=True,
        num_workers=TrainGlobalConfig.num_workers,
        collate_fn=collate_fn,
    )
val_loader = torch.utils.data.DataLoader(
        validation_dataset, 
        batch_size=TrainGlobalConfig.batch_size,
        num_workers=TrainGlobalConfig.num_workers,
        shuffle=False,
        sampler=val_sampler,
        pin_memory=False,
        collate_fn=collate_fn,
    )

## Logger

In [None]:
# Output directory for checkpoints and the text log.
base_dir = f'./{TrainGlobalConfig.folder}'
# exist_ok avoids the check-then-create race when several processes start at once.
os.makedirs(base_dir, exist_ok=True)

log_path = f'{base_dir}/log.txt'
# Lowest validation loss seen so far; starts high so the first epoch always wins.
best_summary_loss = 10**5

In [None]:
def log(message):
    """Print ``message`` and append it to the log file when verbose mode is on.

    Does nothing when ``TrainGlobalConfig.verbose`` is false.
    """
    if not TrainGlobalConfig.verbose:
        return
    print(message)
    with open(log_path, 'a+') as logger:
        logger.write(f'{message}\n')

# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [None]:
def save(path):
    """Write a training checkpoint to ``path``.

    Reads the notebook-level ``model``, ``optimizer``, ``amp``,
    ``best_summary_loss`` and ``epoch`` at call time (this is a plain
    function, so there is no ``self`` to read them from) and stores their
    state in a single dictionary via ``torch.save``.

    Side effect: switches ``model`` to evaluation mode.

    Args:
        path: Destination file path.
    """
    model.eval()
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'amp': amp.state_dict(),
        'best_summary_loss': best_summary_loss,
        'epoch': epoch,
    }, path)

In [None]:
# Running epoch counter: used in the log messages and best-checkpoint file
# names, and incremented at the end of every training-loop iteration.
epoch = 0
import torch.nn as nn

In [None]:
import torch.distributed as dist

def reduce_tensor(tensor):
    """Return the mean of ``tensor`` across all distributed processes.

    The input is never modified; a clone is reduced and returned. When no
    process group is initialised (single-process run) the clone is returned
    unchanged.

    Args:
        tensor: Tensor to average; every process must pass the same shape.

    Returns:
        A new tensor holding the element-wise mean over all processes.
    """
    rt = tensor.clone()
    if not (dist.is_available() and dist.is_initialized()):
        return rt
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    # Divide by the real number of processes rather than a hard-coded 2.
    rt /= dist.get_world_size()
    return rt

In [None]:
# Main loop: one training pass, a "last" checkpoint, one validation pass, an
# optional "best" checkpoint and a scheduler update per epoch.
for e in range(TrainGlobalConfig.n_epochs):
    # Only the header logging depends on `verbose`; training itself always runs.
    if TrainGlobalConfig.verbose:
        lr = optimizer.param_groups[0]['lr']
        timestamp = datetime.utcnow().isoformat()
        log(f'\n{timestamp}\nLR: {lr}')

    # Samplers that expose set_epoch (e.g. DistributedSampler) derive their
    # shuffle order from it; without this every epoch repeats the same order.
    if hasattr(train_loader.sampler, 'set_epoch'):
        train_loader.sampler.set_epoch(e)

    ##train##
    t = time.time()
    model.train()
    summary_loss = AverageMeter()
    # Start each epoch with clean gradients. They are cleared again only after
    # an optimizer step, so the micro-batches in between really accumulate; a
    # trailing incomplete accumulation window is discarded here.
    optimizer.zero_grad()
    for step, (images, targets, image_ids) in enumerate(train_loader):
        if TrainGlobalConfig.verbose:
            if step % TrainGlobalConfig.verbose_step == 0:
                print(
                    f'Train Step {step}/{len(train_loader)}, ' + \
                    f'summary_loss: {summary_loss.avg:.5f}, ' + \
                    f'time: {(time.time() - t):.5f}', end='\r'
                )
        images = torch.stack(images)
        images = images.cuda().float()
        batch_size = images.shape[0]
        boxes = [target['boxes'].cuda().float() for target in targets]
        labels = [target['labels'].cuda().float() for target in targets]

        loss_ori, _, _ = model(images, boxes, labels)
        # Scale so the accumulated gradient is the mean over the micro-batches.
        loss = loss_ori / TrainGlobalConfig.grad_accumulation_steps
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        # Track the unscaled loss, averaged over processes.
        summary_loss.update(reduce_tensor(loss_ori.detach()).item(), batch_size)
        if (step + 1) % TrainGlobalConfig.grad_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

            if TrainGlobalConfig.step_scheduler:
                scheduler.step()

    log(f'[RESULT]: Train. Epoch: {epoch}, summary_loss: {summary_loss.avg:.5f}, time: {(time.time() - t):.5f}')

    ##train & save##
    model.eval()
    if args.local_rank == 0:
        # Only local rank 0 writes, so processes do not clobber the same file.
        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'amp': amp.state_dict(),
            'best_summary_loss': best_summary_loss,
            'epoch': epoch,
        }, f'{base_dir}/last-checkpoint.bin')

    ##valid##
    summary_loss_valid = AverageMeter()
    t = time.time()
    for step, (images, targets, image_ids) in enumerate(val_loader):
        if TrainGlobalConfig.verbose:
            if step % TrainGlobalConfig.verbose_step == 0:
                print(
                    f'Val Step {step}/{len(val_loader)}, ' + \
                    f'summary_loss: {summary_loss_valid.avg:.5f}, ' + \
                    f'time: {(time.time() - t):.5f}', end='\r'
                )
        with torch.no_grad():
            images = torch.stack(images)
            batch_size = images.shape[0]
            images = images.cuda().float()
            boxes = [target['boxes'].cuda().float() for target in targets]
            labels = [target['labels'].cuda().float() for target in targets]

            loss, _, _ = model(images, boxes, labels)
            summary_loss_valid.update(reduce_tensor(loss).detach().item(), batch_size)

    log(f'[RESULT]: Val. Epoch: {epoch}, summary_loss: {summary_loss_valid.avg:.5f}, time: {(time.time() - t):.5f}')
    if summary_loss_valid.avg < best_summary_loss:
        best_summary_loss = summary_loss_valid.avg
        if args.local_rank == 0:
            torch.save({
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'amp': amp.state_dict(),
                'best_summary_loss': best_summary_loss,
                'epoch': epoch,
            }, f'{base_dir}/best-checkpoint-{str(epoch).zfill(3)}epoch.bin')

            # Keep only the three most recent "best" checkpoints on disk.
            for path in sorted(glob(f'{base_dir}/best-checkpoint-*epoch.bin'))[:-3]:
                os.remove(path)

    if TrainGlobalConfig.validation_scheduler:
        print('summary_loss_valid.avg:',summary_loss_valid.avg)
        scheduler.step(metrics=summary_loss_valid.avg)

    epoch += 1