In [None]:
!pip install jupyterlab
!pip install wandb pytorch-lightning
!pip install scikit-learn

In [1]:
# Weights & Biases
import wandb
from pytorch_lightning.loggers import WandbLogger

# Pytorch modules
import torch
from torch.nn import functional as F
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, random_split

# Pytorch-Lightning
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
import pytorch_lightning as pl

# Dataset
from torchvision import transforms
# create local file path 
!python tracking/create_default_local_file.py --workspace_dir . --data_dir ./data --save_dir .

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from lib.utils.misc import NestedTensor
from lib.utils.box_ops import box_cxcywh_to_xyxy, box_xywh_to_xyxy
from lib.utils.merge import merge_template_search

import argparse
from lib.train.admin.environment import env_settings
import os
import numpy as np
import random
import importlib
import cv2 as cv
from lib.train.base_functions import *

https://wandb.ai/wandb_fc/korean/reports/Weights-Biases-Pytorch-Lightning---VmlldzozNzAxOTg

https://pytorch-lightning.readthedocs.io/en/latest/cli/lightning_cli_advanced.html

https://pytorch-lightning.readthedocs.io/en/latest/starter/converting.html

https://pytorch-lightning.readthedocs.io/en/1.4.0/advanced/multi_gpu.html



In [3]:
def parse_args(args):
    """
    args for training.
    """
    parser = argparse.ArgumentParser(description='Parse args for training')
    # for train
    parser.add_argument('--script', type=str, help='training script name')
    parser.add_argument('--config', type=str, default='baseline', help='yaml configure file name')
    parser.add_argument('--save_dir', type=str, help='root directory to save checkpoints, logs, and tensorboard')
    parser.add_argument('--mode', type=str, choices=["single", "multiple"], default="multiple",
                        help="train on single gpu or multiple gpus")
    parser.add_argument('--cudnn_benchmark', type=bool, default=True, help='Set cudnn benchmark on (1) or off (0) (default is on).')
    parser.add_argument('--local_rank', default=-1, type=int, help='node rank for distributed training')

    parser.add_argument('--seed', type=int, default=42, help='seed for random numbers')
    parser.add_argument('--dry_run', type=int, default=1, help='0: wandb activate, 1: wandb off')
    parser.add_argument('--nproc_per_node', type=int, help="number of GPUs per node")  # specify when mode is multiple
    parser.add_argument('--use_lmdb', type=int, choices=[0, 1], default=0)  # whether datasets are in lmdb format
    parser.add_argument('--script_prv', type=str, help='training script name')
    parser.add_argument('--config_prv', type=str, default='baseline', help='yaml configure file name')

    args = parser.parse_args(args)

    return args

class Settings:
    """ Training settings, e.g. the paths to datasets and networks."""
    def __init__(self):
        self.set_default()       

    def set_default(self):
        self.env = env_settings()
        self.use_gpu = True

    def set_args(self, args):
        # self.args = args
        self.script_name = args.script
        self.config_name = args.config
        self.dry_run = args.dry_run
        self.project_path = 'train/{}/{}'.format(self.script_name, self.config_name)
        if args.script_prv is not None and args.config_prv is not None:
            self.project_path_prv = 'train/{}/{}'.format(args.script_prv, args.config_prv)
        self.local_rank = args.local_rank
        self.save_dir = os.path.abspath(args.save_dir)
        self.use_lmdb = args.use_lmdb
        prj_dir = os.path.abspath('')
        # prj_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
        self.cfg_file = os.path.join(prj_dir, 'experiments/%s/%s.yaml' % (self.script_name, self.config_name))

        self.description = 'Training script for STARK-S, STARK-ST stage1, and STARK-ST stage2'

        # update the default configs with config file
        if not os.path.exists(self.cfg_file):
            raise ValueError("%s doesn't exist." % self.cfg_file)
        config_module = importlib.import_module("lib.config.%s.config" % self.script_name)
        cfg = config_module.cfg
        config_module.update_config_from_file(self.cfg_file)     

        # Record the training log
        log_dir = os.path.join(self.save_dir, 'logs')
        if self.local_rank in [-1, 0]:
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
        self.log_file = os.path.join(log_dir, "%s-%s.log" % (self.script_name, self.config_name))
        return cfg

def init_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [4]:
class LitEXOTActor(LightningModule):
    def __init__(self, cfg, settings, loss_type, lr =0.0001):
        '''method used to define our model parameters'''
        super().__init__()
        
        self.settings = settings
        self.bs = self.settings.batchsize  # batch size
        self.exit_flag = loss_type

        # optimizer parameters
        self.cfg = cfg
        self.lr = lr

        # metrics
        # self.accuracy = pl.metrics.Accuracy()

        # optional - save hyper-parameters to self.hparams
        # they will also be automatically logged as config parameters in W&B
        self.save_hyperparameters()

    def construct(self, net, objective, loss_weight):
        self.net = net
        self.objective = objective
        self.loss_weight = loss_weight


    def forward(self, data, run_box_head=True, run_cls_head=False):
        feat_dict_list = []
        # process the templates
        for i in range(self.settings.num_template):
            template_img_i = data['template_images'][i].view(-1, *data['template_images'].shape[2:])  # (batch, 3, 128, 128)
            template_att_i = data['template_att'][i].view(-1, *data['template_att'].shape[2:])  # (batch, 128, 128)
            feat_dict_list.append(self.net(img=NestedTensor(template_img_i, template_att_i), mode='backbone'))

        # process the search regions (t-th frame)
        search_img = data['search_images'].view(-1, *data['search_images'].shape[2:])  # (batch, 3, 320, 320)
        search_att = data['search_att'].view(-1, *data['search_att'].shape[2:])  # (batch, 320, 320)
        feat_dict_list.append(self.net(img=NestedTensor(search_img, search_att), mode='backbone'))

        # run the transformer and compute losses
        seq_dict = merge_template_search(feat_dict_list)
        
        template_bboxes = box_xywh_to_xyxy(data['template_anno'])  #(N_t, batch, 4)

        # search_joint = data['search_joint'] #(N_s, batch, 6)
        # print('joint flag', data['joint_flag'])
        # if data['joint_flag'][0] != 'None':
        #     template_joint = data['template_joint'] #(N_t, batch, 6)
        # else:
        template_joint = None
        joint_annot = (template_bboxes, template_joint)  # template anno, template joint
        out_dict, _, _, flagFeat = self.net(seq_dict=seq_dict, annot = joint_annot, mode="transformer", run_box_head=run_box_head, run_cls_head=run_cls_head)
        # out_dict: (B, N, C), outputs_coord: (1, B, N, C), target_query: (1, B, N, C)
        return out_dict, flagFeat

    def training_step(self, data, batch_idx):
        '''needs to return a loss from a single batch'''
        # data, y = batch

        out_dict, flagFeat = self(data, run_box_head=True, run_cls_head=False)
        gt_exit, gt_package, gt_bboxes = self.process_gt(data)

        # compute losses
        if flagFeat == None:
            flagFeat = (gt_exit, gt_package) #, data['epoch'])
        else:
            exitflag, feature = flagFeat
            flagFeat = (gt_exit, gt_package, feature, exitflag) #, data['epoch'])
        loss, status = self.compute_losses(out_dict, gt_bboxes, flag_feat = flagFeat)
        
        # return loss, status

        # Log training loss
        self.log('Loss/train_total', loss)
        self.log('train_batch_stepidx', batch_idx)

        # Log metrics

        self.log('Loss/train_giou', status['Loss/giou'])
        self.log('Loss/train_l1', status['Loss/l1'])
        self.log('train_IoU', status['IoU'])


        return loss

    def validation_step(self, data, batch_idx):
        '''used for logging metrics'''
        # data, y = batch
        # data['epoch'] = self.epoch
        # data['settings'] = self.settings

        out_dict, flagFeat = self(data, run_box_head=True, run_cls_head=False)
        gt_exit, gt_package, gt_bboxes = self.process_gt(data)

        # compute losses
        if flagFeat == None:
            flagFeat = (gt_exit, gt_package) #, data['epoch'])
        else:
            exitflag, feature = flagFeat
            flagFeat = (gt_exit, gt_package, feature, exitflag) #, data['epoch'])
        loss, status = self.compute_losses(out_dict, gt_bboxes, flag_feat = flagFeat)

        # Log validation loss (will be automatically averaged over an epoch)
        # Log training loss
        self.log('Loss/valid_total', loss)
        self.log('val_batch_stepidx', batch_idx)

        # Log metrics

        self.log('Loss/valid_giou', status['Loss/giou'])
        self.log('Loss/valid_l1', status['Loss/l1'])
        self.log('valid_IoU', status['IoU'])
    
    def test_step(self, data, batch_idx):
        '''used for logging metrics'''
        # data, y = batch

        out_dict, flagFeat = self(data, run_box_head=True, run_cls_head=False)
        gt_exit, gt_package, gt_bboxes = self.process_gt(data)

        # compute losses
        if flagFeat == None:
            flagFeat = (gt_exit, gt_package) #, data['epoch'])
        else:
            exitflag, feature = flagFeat
            flagFeat = (gt_exit, gt_package, feature, exitflag) #, data['epoch'])
        loss, status = self.compute_losses(out_dict, gt_bboxes, flag_feat = flagFeat)

        # Log test loss
        self.log('Loss/test_total', loss)
        self.log('test_batch_stepidx', batch_idx)

        # Log metrics
        self.log('Loss/test_giou', status['Loss/giou'])
        self.log('Loss/test_l1', status['Loss/l1'])
        self.log('test_IoU', status['IoU'])

        # Log metrics
        #self.log('test_acc', self.accuracy(logits, y))
    
    def configure_optimizers(self):
        '''defines model optimizer'''
        optimizer, lr_scheduler = self.get_optimizer_scheduler(self.cfg)
        return [optimizer], [lr_scheduler]

    def process_gt(self, data):
        # process the groundtruth
        gt_bboxes = data['search_anno']  # (Ns, batch, 4) (x1,y1,w,h)

        # if data['joint_flag'][0] != 'None':            
        #     gt_joints = torch.squeeze(data['search_joint'])
        #     gt_depth = torch.squeeze(data['search_depth'])
        #     template_depth = data['template_depth']
        #     template_bboxes = box_xywh_to_xyxy(data['template_anno']) 
        #     gt_package = [gt_joints, gt_depth, template_depth, template_bboxes]
        # else:
        template_bboxes = box_xywh_to_xyxy(data['template_anno']) 
        gt_package = [template_bboxes]


        gt_exit = torch.squeeze(data['search_exit'])
        # compute losses
        if gt_bboxes.dim() ==3 and gt_bboxes.shape[0]==1:
            gt_bboxes = gt_bboxes[0]
        return gt_exit, gt_package, gt_bboxes

    def compute_losses(self, pred_dict, gt_bbox, flag_feat = None, return_status=True):
        if len(flag_feat) == 2:
            gt_exit, gt_package = flag_feat #, epoch
            exitflag = None; feature = None
        else:
            gt_exit, gt_package, feature, exitflag = flag_feat # , epoch
        
        # Get boxes
        pred_boxes = pred_dict['pred_boxes']
        if torch.isnan(pred_boxes).any():
            raise ValueError("Network outputs is NAN! Stop Training")
        num_queries = pred_boxes.size(1)

        pred_bboxes_vec = box_cxcywh_to_xyxy(pred_boxes)
        pred_boxes_vec = pred_bboxes_vec.view(-1, 4) # (B,N,4) --> (BN,4) (x1,y1,x2,y2)
        if gt_bbox.dim() ==3:
            gt_bboxes_vec = box_xywh_to_xyxy(gt_bbox).clamp(min=-1.0, max=1.0)  # (B,4) --> (B,1,4) --> (B,N,4)
            gt_boxes_vec = gt_bboxes_vec.view(-1,4).clamp(min=0.0, max=1.0)
            
            n, b, _ = gt_bbox.shape
            neg_flags = torch.zeros(n, b).detach() #.cuda()
            nonzero = torch.nonzero(gt_bbox<0, as_tuple=True)
            for i in range(len(nonzero[0])):
                neg_flags[nonzero[0][i]][nonzero[1][i]] = 1
        else:
            tmp = box_xywh_to_xyxy(gt_bbox)
            gt_boxes_vec = box_xywh_to_xyxy(gt_bbox)[:, None, :]
            gt_bboxes_vec = gt_boxes_vec.repeat((1, num_queries, 1)).clamp(min=-1.0, max=1.0)
            gt_boxes_vec = gt_bboxes_vec.view(-1, 4).clamp(min=0.0, max=1.0)              # (B,4) --> (B,1,4) --> (B,N,4)
            
            neg_flags = torch.zeros(self.bs).detach() #.cuda()
            nonzero = torch.nonzero(gt_bbox<0, as_tuple=True)

            for i in range(len(nonzero[0])):
                # print("GT NEG", gt_bboxes_vec)
                neg_flags[nonzero[0][i]] = 1

        pred_boxes_vec = box_cxcywh_to_xyxy(pred_boxes).view(-1, 4)  # (B,N,4) --> (BN,4) (x1,y1,x2,y2)
        gt_boxes_vec = box_xywh_to_xyxy(gt_bbox)[:, None, :].repeat((1, num_queries, 1)).view(-1, 4).clamp(min=0.0, max=1.0)  # (B,4) --> (B,1,4) --> (B,N,4)
        # compute giou and iou
        try:
            giou_loss, iou = self.objective['giou'](pred_boxes_vec, gt_boxes_vec)  # (BN,4) (BN,4)
        except:
            giou_loss, iou = torch.tensor(0.0), torch.tensor(0.0) #.cuda()
        # compute l1 loss
        l1_loss = self.objective['l1'](pred_boxes_vec, gt_boxes_vec)  # (BN,4) (BN,4)
        
        # compute exit loss
        if self.exit_flag == 'None':
            exit_loss = torch.tensor(0.0) #.cuda()
            reid_loss = torch.tensor(0.0)
            joint_loss = torch.tensor(0.0)
        else:
            exit_loss = self.compute_exit_loss(self, exitflag, neg_flags, gt_bboxes_vec, gt_exit)
            joint_loss, l1_loss = self.compute_joint_loss(self, gt_bbox, pred_dict, gt_package)
            reid_loss = self.compute_reid_loss(gt_bbox, pred_bboxes_vec, feature)


        # weighted sum

        # if self.exit_flag == 'None':
        loss = self.loss_weight['giou'] * giou_loss + self.loss_weight['l1'] * l1_loss
        # else:
        #     if epoch<3:
        #         loss = self.loss_weight['exit']*exit_loss
        #         #loss = self.loss_weight['giou'] * giou_loss + self.loss_weight['l1'] * l1_loss
        #     elif epoch<5:
        #         loss = self.loss_weight['giou'] * giou_loss + self.loss_weight['l1'] * l1_loss + self.loss_weight['exit']*exit_loss                 
        #     else:
        #         loss = self.loss_weight['giou'] * giou_loss + self.loss_weight['l1'] * l1_loss \
        #             + self.loss_weight['exit']*exit_loss + self.loss_weight['reId']*reid_loss

        
        if return_status:
            # status for log
            mean_iou = iou.detach().mean()
            # status = {"Loss/total": loss.item(),
            #           "Loss/giou": giou_loss.item(),
            #           "Loss/l1": l1_loss.item(),
            #           "IoU": mean_iou.item()}

            status = {"Loss/total": loss.item(),
                          "Loss/giou": giou_loss.item(),
                          "Loss/l1": l1_loss.item(),
                          "IoU": mean_iou.item(),
                          "Loss/joint": joint_loss.item(),
                          "Loss/reId": reid_loss.item(),
                          "Loss/exit": exit_loss.item()}
            return loss, status
        else:
            return loss

    def compute_joint_loss(self, gt_bbox, pred_dict, gt_package):
        if len(gt_package)>1:
            # print(gt_package[0].shape, gt_package[1].shape, gt_package[2].shape)
            # torch.Size([3, 6]) torch.Size([3, 480, 640]) torch.Size([2, 3, 480, 640])
            [gt_joints, gt_depth, template_depth, template_bboxes] = gt_package
            if gt_bbox.dim()==3:
                gt_joints = gt_joints[:,:,:2].view(-1, 2)
            else:
                gt_joints = gt_joints[:, :2]
            pred_joints = pred_dict['pred_joint']
            # print(gt_joints.shape, pred_joints.shape)
            if pred_joints !=None:
                joint_loss = self.objective['joint'](gt_joints, pred_joints)
                l1_loss = l1_loss + joint_loss/2
            else:
                joint_loss = torch.tensor(0.0)
            template_bboxes = template_bboxes.clamp(min=-1.0, max=1.0) 
            #reid_loss += self.compute_reid_depth(pred_bboxes_vec, template_bboxes, gt_depth, template_depth)
            #reid_loss = reid_loss/2
        else:
            joint_loss = torch.tensor(0.0)
        return joint_loss, l1_loss

    def compute_exit_loss(self, exitflag, neg_flags, gt_bboxes_vec, gt_exit):
        if exitflag == None:
            exit_loss = torch.tensor(0.0) #.cuda()
            return exit_loss
        if self.exit_flag == "BCE":
            exit_flag12 = torch.squeeze(exitflag[-1][2])
            exit_flag13 = torch.squeeze(exitflag[-1][3])
            exit_loss = (self.objective['exit_top'](exit_flag12, gt_exit) + self.objective['exit_top'](exit_flag13, gt_exit))/2
            #exit_loss = self.objective['exit_bottom'](neg_flags, exitflag[-1][0]) + self.objective['exit_bottom'](neg_flags, exitflag[-1][1])
        elif self.exit_flag == 'MATRIX_BCE':
            exit_loss = self.cal_exit_prob(gt_bboxes_vec, exitflag[0], exitflag[1])
        elif self.exit_flag == 'LAMBDA':

            tmppos_tl = exitflag[-1][0][neg_flags==0]
            tmppos_br = exitflag[-1][1][neg_flags==0]
            if tmppos_tl.shape[0] ==0:
                exit_loss = torch.tensor(0.0) #.cuda()
            else:
                exit_loss = torch.mean(tmppos_tl)
            if tmppos_br.shape[0] ==0:
                exit_loss += torch.tensor(0.0) #.cuda()
            else:
                exit_loss += torch.mean(tmppos_br)

            tmpneg_tl = exitflag[-1][0][neg_flags==1] 
            tmpneg_br = exitflag[-1][1][neg_flags==1]
            if tmpneg_tl.shape[0] ==0:
                exit_loss -= torch.tensor(0.0) #.cuda()
            else:
                exit_loss -= torch.mean(tmpneg_tl)
            if tmpneg_br.shape[0] ==0:
                exit_loss -= torch.tensor(0.0) #.cuda()
            else:
                exit_loss -= torch.mean(tmpneg_br)
        else:
            raise Exception('Invalid exit loss')
        return exit_loss

    def make_gt_matrix(self, feat_sz, tgt_idx):
        gt_matrix = torch.zeros(feat_sz*feat_sz).view(feat_sz, feat_sz) #.cuda()
        for i in range(feat_sz):
            for j in range(feat_sz):
                gt_matrix[i][j] = torch.exp(-(j-tgt_idx[0])**2-(i-tgt_idx[1])**2)
        gt_matrix = torch.squeeze(gt_matrix.view(-1, feat_sz*feat_sz))
        return gt_matrix

    def make_uni_matrix(self, feat_sz):
        gt_matrix = torch.ones(feat_sz*feat_sz)/(feat_sz*feat_sz) #.cuda()

        return gt_matrix
    
    def cal_exit_prob(self, gt_bboxes, prob_vec_tl, prob_vec_br):
        feat_sz = 20
        gt_bboxes = torch.squeeze(gt_bboxes)
        index = torch.round(gt_bboxes*feat_sz).int()
        ent_matrix_loss = 0
        neg_tup = torch.nonzero(gt_bboxes<0, as_tuple=True)
        neg_n = torch.unique(neg_tup[0])
        neg_b = torch.unique(neg_tup[1])

        if gt_bboxes.dim()==3:
            n, b, _ = gt_bboxes.shape
            for i in range(n):
                for j in range(b):
                    if i in neg_n and j in neg_b:
                        gt_matrix = self.make_uni_matrix(feat_sz)
                        ent_matrix_loss += self.objective['exit_top'](prob_vec_tl[i,j], gt_matrix)
                        # print("GT UNI matrix", gt_matrix)
                        gt_matrix = self.make_uni_matrix(feat_sz)
                        ent_matrix_loss += self.objective['exit_top'](prob_vec_br[i,j], gt_matrix)
                    else:
                        gt_matrix = self.make_gt_matrix(feat_sz, [index[i,j,0], index[i,j,1]])
                        ent_matrix_loss += self.objective['exit_top'](prob_vec_tl[i,j], gt_matrix)
                        # print("TL gaussian matrix", gt_matrix)
                        gt_matrix = self.make_gt_matrix(feat_sz, [index[i,j,2], index[i,j,3]])
                        ent_matrix_loss += self.objective['exit_top'](prob_vec_br[i,j], gt_matrix)
            ent_matrix_loss = ent_matrix_loss/(n*b)
        else:
            b, _ = gt_bboxes.shape

            for j in range(b):
                if j in neg_n:
                    gt_matrix = self.make_uni_matrix(feat_sz)
                    ent_matrix_loss += self.objective['exit_top'](prob_vec_tl[j], gt_matrix)
                    gt_matrix = self.make_uni_matrix(feat_sz)
                    ent_matrix_loss += self.objective['exit_top'](prob_vec_br[j], gt_matrix)
                else:
                    gt_matrix = self.make_gt_matrix(feat_sz, [index[j,0], index[j,1]])
                    ent_matrix_loss += self.objective['exit_top'](prob_vec_tl[j], gt_matrix)
                    gt_matrix = self.make_gt_matrix(feat_sz, [index[j,2], index[j,3]])
                    ent_matrix_loss += self.objective['exit_top'](prob_vec_br[j], gt_matrix)
            ent_matrix_loss = ent_matrix_loss/(b)
        
        return ent_matrix_loss

    def compute_reid_loss(self, gt_bbox, pred_bboxes_vec, feature):
        #compute re-id loss
        if feature == None:
            reid_loss = torch.tensor(0.0) #.cuda()
            return reid_loss
        if gt_bbox.dim()==3:
            pred_bboxes_vec = torch.round(pred_bboxes_vec*20).int()
            
            #feat_sz
            reid_loss = torch.tensor(0.0) #.cuda()

            assert gt_bbox.dim() ==3
            n, b, c, _, _= feature.shape

            vacant1 = torch.zeros(1, 20, 20) #.cuda()
            vacant2 = torch.zeros(1, 20, 20) #.cuda()        
            for j in range(self.bs):
                for i in range(n-1):            
                    vacant11 = vacant1.repeat(c, 1, 1)
                    vacant22 = vacant2.repeat(c, 1, 1)

                    if gt_bbox[i, j, 1]>=0 and gt_bbox[i+1, j, 1]>=0:
                        vacant11[:, pred_bboxes_vec[j, i+1, 1]:pred_bboxes_vec[j, i+1, 3],pred_bboxes_vec[j, i+1, 0]:pred_bboxes_vec[j, i+1, 2]] = 1  
                        vacant22[:, pred_bboxes_vec[j, i, 1]:pred_bboxes_vec[j, i, 3],pred_bboxes_vec[j, i, 0]:pred_bboxes_vec[j, i, 2]] = 1  

                        predN_Bbox = feature[i+1, j] * vacant11
                        pred_Bbox = feature[i,j] * vacant22
                        reid_loss += self.objective['reId'](pred_Bbox, predN_Bbox)
                    else:
                        reid_loss += torch.tensor(0.0) #.cuda()

            return reid_loss/(n*b)
        else:
            reid_loss = torch.tensor(0.0) #.cuda()
            return reid_loss

    def get_optimizer_scheduler(self, cfg):
        train_cls = getattr(cfg.TRAIN, "TRAIN_CLS", False)
        # Adam(self.parameters(), lr=self.lr)
        if train_cls:
            # print("Only training classification head. Learnable parameters are shown below.")
            param_dicts = [
                {"params": [p for n, p in self.net.named_parameters() if "cls" in n and p.requires_grad]}
            ]

            for n, p in self.net.named_parameters():
                if "cls" not in n:
                    p.requires_grad = False
                # else:
                #     print(n)
        else:
            param_dicts = [
                {"params": [p for n, p in self.net.named_parameters() if "backbone" not in n and p.requires_grad]},
                {
                    "params": [p for n, p in self.net.named_parameters() if "backbone" in n and p.requires_grad],
                    "lr": self.lr * cfg.TRAIN.BACKBONE_MULTIPLIER,
                },
            ]
            # if is_main_process():
            #     print("Learnable parameters are shown below.")
            #     for n, p in self.net.named_parameters():
            #         if p.requires_grad:
            #             print(n)

        if cfg.TRAIN.OPTIMIZER == "ADAMW":
            optimizer = torch.optim.AdamW(param_dicts, lr=self.lr,
                                        weight_decay=cfg.TRAIN.WEIGHT_DECAY)
            ## weight decay pick it out.
        elif cfg.TRAIN.OPTIMIZER == "SGD":
            optimizer = torch.optim.SGD(param_dicts, lr=self.lr,
                                        weight_decay=cfg.TRAIN.WEIGHT_DECAY)
        else:
            raise ValueError("Unsupported Optimizer")
        if cfg.TRAIN.SCHEDULER.TYPE == 'step':
            lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, cfg.TRAIN.LR_DROP_EPOCH)
        elif cfg.TRAIN.SCHEDULER.TYPE == "Mstep":
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                                milestones=cfg.TRAIN.SCHEDULER.MILESTONES,
                                                                gamma=cfg.TRAIN.SCHEDULER.GAMMA)
        else:
            raise ValueError("Unsupported scheduler")
        return optimizer, lr_scheduler


https://gist.github.com/ashleve/ac511f08c0d29e74566900fd3efbb3ec

In [7]:
# from lib.test.evaluation import get_dataset
# from lib.test.evaluation.running import run_dataset
# from lib.test.evaluation.tracker import Tracker
from sklearn.model_selection import KFold

class RobotDataModule(LightningDataModule):

    def __init__(self, data_dir='./', k=1, split_seed=123, num_splits=10, batch_size=256, num_workers=8, pin_memory=False):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size  
        self.k = k
        self.num_splits = num_splits
        self.split_seed = split_seed


        # self.transform = transforms.ToTensor()

    def fill_state(self, cfg, settings):
        self.cfg = cfg
        self.settings = settings

        transform_joint = tfm.Transform(tfm.ToGrayscale(probability=0.05),
                                        tfm.RandomHorizontalFlip(probability=0.5))

        transform_train = tfm.Transform(tfm.ToTensorAndJitter(0.2),
                                        tfm.RandomHorizontalFlip_Norm(probability=0.5),
                                        tfm.Normalize(mean=cfg.DATA.MEAN, std=cfg.DATA.STD))

        # transform_val = tfm.Transform(tfm.ToTensor(),
        #                             tfm.Normalize(mean=cfg.DATA.MEAN, std=cfg.DATA.STD))

        # The tracking pairs processing module
        output_sz = settings.output_sz
        search_area_factor = settings.search_area_factor

        self.data_processing_train = processing.STARKProcessing(search_area_factor=search_area_factor,
                                                        output_sz=output_sz,
                                                        center_jitter_factor=settings.center_jitter_factor,
                                                        scale_jitter_factor=settings.scale_jitter_factor,
                                                        mode='sequence',
                                                        transform=transform_train,
                                                        joint_transform=transform_joint,
                                                        settings=settings)

        # self.data_processing_val = processing.STARKProcessing(search_area_factor=search_area_factor,
        #                                                 output_sz=output_sz,
        #                                                 center_jitter_factor=settings.center_jitter_factor,
        #                                                 scale_jitter_factor=settings.scale_jitter_factor,
        #                                                 mode='sequence',
        #                                                 transform=transform_val,
        #                                                 joint_transform=transform_joint,
        #                                                 settings=settings)

        # Train sampler and loader
        settings.num_template = getattr(cfg.DATA.TEMPLATE, "NUMBER", 1)
        settings.num_search = getattr(cfg.DATA.SEARCH, "NUMBER", 1)
        self.sampler_mode = getattr(cfg.DATA, "SAMPLER_MODE", "causal")
        self.train_cls = getattr(cfg.TRAIN, "TRAIN_CLS", False)
        
    # def prepare_data(self):
    #     '''called only once and on 1 GPU'''
    #     # download data
    #     MNIST(self.data_dir, train=True, download=True)
    #     MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        '''called on each GPU separately - stage defines if we are at fit or test step'''
        # we set up only relevant datasets when stage is specified (automatically set by Pytorch-Lightning)
        cfg = self.cfg
        settings = self.settings
        kfold = KFold(n_splits = self.num_splits, shuffle = True, random_state = self.split_seed)
        
        if stage == 'fit' or stage is None:
            self.dataset_full = sampler.TrackingSampler(datasets=names2datasets(cfg.DATA.TRAIN.DATASETS_NAME, settings, opencv_loader),
                                            p_datasets=cfg.DATA.TRAIN.DATASETS_RATIO,
                                            samples_per_epoch=cfg.DATA.TRAIN.SAMPLE_PER_EPOCH,
                                            max_gap=cfg.DATA.MAX_SAMPLE_INTERVAL, num_search_frames=settings.num_search,
                                            num_template_frames=settings.num_template, processing=self.data_processing_train,
                                            frame_sample_mode=self.sampler_mode, batch_size=self.batch_size, train_cls=self.train_cls)
            
            # self.dataset_val = sampler.TrackingSampler(datasets=names2datasets(cfg.DATA.VAL.DATASETS_NAME, settings, opencv_loader),
            #                               p_datasets=cfg.DATA.VAL.DATASETS_RATIO,
            #                               samples_per_epoch=cfg.DATA.VAL.SAMPLE_PER_EPOCH,
            #                               max_gap=cfg.DATA.MAX_SAMPLE_INTERVAL, num_search_frames=settings.num_search,
            #                               num_template_frames=settings.num_template, processing=self.data_processing_val,
            #                               frame_sample_mode=self.sampler_mode, batch_size=self.batch_size, train_cls=self.train_cls)

            all_splits = [k for k in kfold.split(self.dataset_full)]
            train_indexes, val_indexes = all_splits[self.k]
            self.train_subsampler = torch.utils.data.SubsetRandomSampler(train_indexes)
            self.val_subsampler = torch.utils.data.SubsetRandomSampler(val_indexes)

        # if stage == 'test' or stage is None:
        #     self.dataset_test = get_dataset(cfg.DATA.TEST.DATASETS_NAME)
        #     pass

    def train_dataloader(self):
        '''returns training dataloader'''
        cfg = self.cfg        
        loader_train = LTRLoader('train', self.dataset_full, training=True, sampler=self.train_subsampler, batch_size=self.batch_size, 
                             num_workers=cfg.TRAIN.NUM_WORKER, drop_last=True, stack_dim=1)
        return loader_train

    def val_dataloader(self):
        '''returns validation dataloader'''
        cfg = self.cfg
        loader_val = LTRLoader('val', self.dataset_full, training=False, sampler=self.val_subsampler, batch_size=self.batch_size,
                           num_workers=cfg.TRAIN.NUM_WORKER, drop_last=True, stack_dim=1, 
                           epoch_interval=cfg.TRAIN.VAL_EPOCH_INTERVAL)

        return loader_val

    # def test_dataloader(self):
    #     '''returns test dataloader'''
    #     mnist_test = DataLoader(self.mnist_test, batch_size=self.batch_size)
    #     return mnist_test

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[K     |████████████████████████████████| 24.8 MB 3.9 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 5.5 MB/s eta 0:00:01
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.1.0 scikit-learn-1.0.2 threadpoolctl-3.1.0


In [8]:
from lib.utils.box_ops import giou_loss
from torch.nn.functional import l1_loss
from torch.nn import BCEWithLogitsLoss
# train pipeline related
from lib.train.trainers import LTRTrainer
from lib.train.base_functions import *

from lib.models.exot import build_exotst

# forward propagation related
from lib.train.actors import EXOTActor, EXOTSTActor
import importlib



In [12]:
torch.multiprocessing.set_start_method('spawn')# good solution !!!!

In [14]:
args = parse_args(args = ['--script', 'exot_st1', '--config', 'baseline_mix', '--save_dir', '.', '--mode', 'single'])
cv.setNumThreads(0)
torch.backends.cudnn.benchmark = args.cudnn_benchmark

print('script_name: {}.py  config_name: {}.yaml'.format(args.script, args.config))

'''2021.1.5 set seed for different process'''
if args.seed is not None:
    if args.local_rank != -1:
        init_seeds(args.seed + args.local_rank)
    else:
        init_seeds(args.seed)

settings = Settings()
cfg = settings.set_args(args)
# update settings based on cfg
if settings.local_rank in [-1, 0]:
    print("New configuration is shown below.")
    for key in cfg.keys():
        print("%s configuration:" % key, cfg[key])
        print('\n')  
update_settings(settings, cfg)

'''
results = []
nums_folds = 10
split_seed = 12345

for k in range(nums_folds):
    datamodule = ProteinsKFoldDataModule(k=k, num_folds=num_folds, split_seed=split_seed, ...)
    datamodule.prepare_data()
    datamodule.setup()

    ...
    # here we train the model on given split...
    ...
    results.append(score)

score = sum(results) / num_folds
'''



objective = {'giou': giou_loss, 'l1': l1_loss, 'joint': l1_loss, 'reId': l1_loss, 'exit_top': BCEWithLogitsLoss(), 'exit_bottom': l1_loss}

loss_weight = {'giou': cfg.TRAIN.GIOU_WEIGHT, 'l1': cfg.TRAIN.L1_WEIGHT, 'reId': cfg.TRAIN.REID_WEIGHT, 'exit': cfg.TRAIN.EXIT_WEIGHT}


script_name: exot_st1.py  config_name: baseline_mix.yaml
New configuration is shown below.
MODEL configuration: {'HEAD_TYPE': 'CORNER', 'NLAYER_HEAD': 3, 'HEAD_ABS': False, 'HIDDEN_DIM': 256, 'NUM_OBJECT_QUERIES': 1, 'POSITION_EMBEDDING': 'sine', 'PREDICT_MASK': False, 'LOSS_TYPE': 'None', 'BACKBONE': {'TYPE': 'resnet50', 'OUTPUT_LAYERS': ['layer3'], 'STRIDE': 16, 'DILATION': False}, 'TRANSFORMER': {'NHEADS': 8, 'DROPOUT': 0.1, 'DIM_FEEDFORWARD': 2048, 'ENC_LAYERS': 6, 'DEC_LAYERS': 6, 'PRE_NORM': False, 'DIVIDE_NORM': False}}


TRAIN configuration: {'LR': 0.0001, 'WEIGHT_DECAY': 0.0001, 'EPOCH': 100, 'LR_DROP_EPOCH': 95, 'BATCH_SIZE': 16, 'NUM_WORKER': 0, 'OPTIMIZER': 'ADAMW', 'BACKBONE_MULTIPLIER': 0.1, 'REID_WEIGHT': 3.0, 'EXIT_WEIGHT': 3.0, 'GIOU_WEIGHT': 2.0, 'L1_WEIGHT': 5.0, 'DEEP_SUPERVISION': False, 'FREEZE_BACKBONE_BN': True, 'FREEZE_LAYERS': ['conv1', 'layer1'], 'PRINT_INTERVAL': 50, 'VAL_EPOCH_INTERVAL': 5, 'GRAD_CLIP_NORM': 0.1, 'SCHEDULER': {'TYPE': 'step', 'DECAY_RATE': 

In [15]:
nums_folds = 5
for k in range(nums_folds):
    wandb_logger = WandbLogger(project='EXOT', name=f"fold_{k}")
    trainer = Trainer(logger=wandb_logger, accelerator='gpu', devices=-1, max_epochs=50)
    robot_data = RobotDataModule(data_dir='data/robot-data/', k=k, num_splits=nums_folds, batch_size=16)
    robot_data.fill_state(cfg, settings)
    net = build_exotst(cfg)
    model = LitEXOTActor(cfg, settings, loss_type=cfg.MODEL.LOSS_TYPE, lr=cfg.TRAIN.LR)
    model.construct(net, objective, loss_weight)
    trainer.fit(model, datamodule=robot_data)
    model.save(os.path.join(wandb.run.dir, "exot_st1_mix_model_%d.pth"%k))
    #self.log(..., batch_size=batch_size)
    wandb.finish()
    ## ODIN batchwise - classification diff objects possible?

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


head channel: 256


Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------



ProcessRaisedException: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 129, in _wrapping_function
    results = function(*args, **kwargs)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1104, in _run
    self.strategy.setup_environment()
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 130, in setup_environment
    self.accelerator.setup_environment(self.root_device)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/pytorch_lightning/accelerators/cuda.py", line 43, in setup_environment
    torch.cuda.set_device(root_device)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/torch/cuda/__init__.py", line 311, in set_device
    torch._C._cuda_setDevice(device)
  File "/home/hskim/miniconda/envs/deformable_detr/lib/python3.7/site-packages/torch/cuda/__init__.py", line 205, in _lazy_init
    "Cannot re-initialize CUDA in forked subprocess. To use CUDA with "
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method


: 