In [40]:
%load_ext autoreload
%autoreload 1
%matplotlib inline

import matplotlib
matplotlib.rcParams['figure.figsize'] = [14, 10]

import torch
from torch.utils.data import Dataset
from retinanet.encoder import DataEncoder
import imgaug as ia
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import numpy as np
from retinanet.retinanet import RetinaNet
from retinanet.loss import FocalLoss
from utils.pikachu_dataset import load_data_pikachu
from torch import nn
from torch.utils.data import Dataset
from livelossplot import PlotLosses

device  = 'cuda' if torch.cuda.is_available() else 'cpu'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
# %load retinanet/encoder.py
'''Encode object boxes and labels.'''
import math
import torch

from retinanet.utils import meshgrid, box_iou, box_nms, change_box_order


class DataEncoder:
    def __init__(self, anchor_areas=[32*32., 64*64., 128*128., 256*256., 512*512.]):
        self.anchor_areas = anchor_areas   # p3 -> p7
        self.aspect_ratios = [1/2., 1/1., 2/1.]
        self.scale_ratios = [1., pow(2,1/3.), pow(2,2/3.)]
        self.anchor_wh = self._get_anchor_wh()

    def _get_anchor_wh(self):
        '''Compute anchor width and height for each feature map.

        Returns:
          anchor_wh: (tensor) anchor wh, sized [#fm, #anchors_per_cell, 2].
        '''
        anchor_wh = []
        for s in self.anchor_areas:
            for ar in self.aspect_ratios:  # w/h = ar
                h = math.sqrt(s/ar)
                w = ar * h
                for sr in self.scale_ratios:  # scale
                    anchor_h = h*sr
                    anchor_w = w*sr
                    anchor_wh.append([anchor_w, anchor_h])
        num_fms = len(self.anchor_areas)
        return torch.Tensor(anchor_wh).view(num_fms, -1, 2)

    def _get_anchor_boxes(self, input_size):
        '''Compute anchor boxes for each feature map.

        Args:
          input_size: (tensor) model input size of (w,h).

        Returns:
          boxes: (list) anchor boxes for each feature map. Each of size [#anchors,4],
                        where #anchors = fmw * fmh * #anchors_per_cell
        '''
        num_fms = len(self.anchor_areas)
        fm_sizes = [(input_size/pow(2.,i+3)).ceil() for i in range(num_fms)]  # p3 -> p7 feature map sizes

        boxes = []
        for i in range(num_fms):
            fm_size = fm_sizes[i]
            grid_size = input_size / fm_size
            fm_w, fm_h = int(fm_size[0]), int(fm_size[1])
            xy = meshgrid(fm_w,fm_h) + 0.5 # [fm_h*fm_w, 2]
            xy = (xy.float()*grid_size.float()).view(fm_h,fm_w,1,2).expand(fm_h,fm_w,9,2)
            wh = self.anchor_wh[i].view(1,1,9,2).expand(fm_h,fm_w,9,2)
           
            
            box = torch.cat([xy,wh], 3)  # [x,y,w,h]
            boxes.append(box.view(-1,4))

        return torch.cat(boxes, 0), boxes

    def encode(self, boxes, labels, input_size):
        '''Encode target bounding boxes and class labels.

        We obey the Faster RCNN box coder:
          tx = (x - anchor_x) / anchor_w
          ty = (y - anchor_y) / anchor_h
          tw = log(w / anchor_w)
          th = log(h / anchor_h)

        Args:
          boxes: (tensor) bounding boxes of (xmin,ymin,xmax,ymax), sized [#obj, 4].
          labels: (tensor) object class labels, sized [#obj,].
          input_size: (int/tuple) model input size of (w,h).

        Returns:
          loc_targets: (tensor) encoded bounding boxes, sized [#anchors,4].
          cls_targets: (tensor) encoded class labels, sized [#anchors,].
        '''
        input_size = torch.Tensor([input_size,input_size]) if isinstance(input_size, int) \
                     else torch.Tensor(input_size)
        anchor_boxes, _ = self._get_anchor_boxes(input_size)
        boxes = change_box_order(boxes, 'xyxy2xywh')

        ious = box_iou(anchor_boxes, boxes, order='xywh')
        max_ious, max_ids = ious.max(1)
        
        boxes = boxes[max_ids]

        loc_xy = (boxes[:,:2]-anchor_boxes[:,:2]) / anchor_boxes[:,2:]
        loc_wh = torch.log(boxes[:,2:]/anchor_boxes[:,2:])
        loc_targets = torch.cat([loc_xy,loc_wh], 1)
        cls_targets = 1 + labels[max_ids]

        cls_targets[max_ious<0.5] = 0
        ignore = (max_ious>0.4) & (max_ious<0.5)  # ignore ious between [0.4,0.5]
        cls_targets[ignore] = -1  # for now just mark ignored to -1
        return loc_targets, cls_targets

    def decode(self, loc_preds, cls_preds, input_size):
        '''Decode outputs back to bouding box locations and class labels.

        Args:
          loc_preds: (tensor) predicted locations, sized [#anchors, 4].
          cls_preds: (tensor) predicted class labels, sized [#anchors, #classes].
          input_size: (int/tuple) model input size of (w,h).

        Returns:
          boxes: (tensor) decode box locations, sized [#obj,4].
          labels: (tensor) class labels for each box, sized [#obj,].
        '''
        CLS_THRESH = 0.3
        NMS_THRESH = 0.6

        input_size = torch.Tensor([input_size,input_size]) if isinstance(input_size, int) \
                     else torch.Tensor(input_size)
        anchor_boxes, _ = self._get_anchor_boxes(input_size)

        loc_xy = loc_preds[:,:2]
        loc_wh = loc_preds[:,2:]

        xy = loc_xy * anchor_boxes[:,2:] + anchor_boxes[:,:2]
        wh = loc_wh.exp() * anchor_boxes[:,2:]
        boxes = torch.cat([xy-wh/2, xy+wh/2], 1)  # [#anchors,4]

        score, labels = cls_preds.sigmoid().max(1)          # [#anchors,]
        ids = score > CLS_THRESH
        ids = ids.nonzero().squeeze()             # [#obj,]
        keep = box_nms(boxes[ids], score[ids],labels, threshold=NMS_THRESH)
        return boxes[ids][keep], labels[ids][keep], score[ids][keep]


In [42]:
from retinanet.loss import  * 

class FocalLoss(nn.Module):
    def __init__(self, num_classes=20):
        super(FocalLoss, self).__init__()
        self.num_classes = num_classes

    def focal_loss(self, x, y):
        '''Focal loss.
        Args:
          x: (tensor) sized [N,D].
          y: (tensor) sized [N,].
        Return:
          (tensor) focal loss.
        '''
        alpha = 0.25
        gamma = 2

        t = one_hot_embedding(y.data.cpu(), 1+self.num_classes)  # [N,21]
        import pdb
        pdb.set_trace()
        t = t[:,1:]  # exclude background
        t = Variable(t).cuda()  # [N,20]

        p = x.sigmoid().detach()
        pt = p*t + (1-p)*(1-t)         # pt = p if t > 0 else 1-p
        w = alpha*t + (1-alpha)*(1-t)  # w = alpha if t > 0 else 1-alpha
        w = w * (1-pt).pow(gamma)
        return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)

    def focal_loss_alt(self, x, y):
        '''Focal loss alternative.
        Args:
          x: (tensor) sized [N,D].
          y: (tensor) sized [N,].
        Return:
          (tensor) focal loss.
        '''
        alpha = 0.25

        t = one_hot_embedding(y.data.cpu(), 1+self.num_classes)
        t = t[:,1:]
        t = Variable(t).cuda()

        xt = x*(2*t-1)  # xt = x if t > 0 else -x
        pt = (2*xt+1).sigmoid() 

        w = alpha*t + (1-alpha)*(1-t)
        loss = -w*pt.log() / 2
        return loss.sum()

    def forward(self, pred, target):

        '''Compute loss between (loc_preds, loc_targets) and (cls_preds, cls_targets).
        Args:
          loc_preds: (tensor) predicted locations, sized [batch_size, #anchors, 4].
          loc_targets: (tensor) encoded target locations, sized [batch_size, #anchors, 4].
          cls_preds: (tensor) predicted class confidences, sized [batch_size, #anchors, #classes].
          cls_targets: (tensor) encoded target labels, sized [batch_size, #anchors].
        loss:
          (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + FocalLoss(cls_preds, cls_targets).
        '''
        
        loc_preds, cls_preds = pred
        loc_targets, cls_targets = target

        import pdb; pdb.set_trace()
        
        batch_size, num_boxes = cls_targets.size()
        pos = cls_targets > 0  # [N,#anchors]
        num_pos = pos.data.long().sum()

        ################################################################
        # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets)
        ################################################################
        mask = pos.unsqueeze(2).expand_as(loc_preds)       # [N,#anchors,4]
        masked_loc_preds = loc_preds[mask].view(-1,4)      # [#pos,4]
        masked_loc_targets = loc_targets[mask].view(-1,4)  # [#pos,4]
        loc_loss = F.smooth_l1_loss(masked_loc_preds, masked_loc_targets, size_average=False)

        ################################################################
        # cls_loss = FocalLoss(loc_preds, loc_targets)
        ################################################################
        pos_neg = cls_targets > -1  # exclude ignored anchors
        num_peg = pos_neg.data.long().sum()
        mask = pos_neg.unsqueeze(2).expand_as(cls_preds)
        masked_cls_preds = cls_preds[mask].view(-1,self.num_classes)
        
#         fl = FL()
#         cls_loss = self.focal_loss_alt(masked_cls_preds, cls_targets[pos_neg])

        import pdb
        pdb.set_trace()
        cls_loss = self.focal_loss(masked_cls_preds, cls_targets[pos_neg])
#         print("clsloss: {} locloss: {}".format(cls_loss.item(), loc_loss.item()))
        pos = cls_targets > 0  # [N,#anchors]
        num_pos = pos.data.long().sum()
        num_pos_neg = pos_neg.data.long().sum()

        if num_pos > 0:
            loss = (cls_loss + loc_loss) / num_pos
        elif num_pos_neg > 0:
            loss = cls_loss
        else:
            raise Exception('num_pos_neg == 0')
            
        loss = loc_loss + cls_loss
        return loss
    
class PikachuDataset(Dataset):
    def __init__(self, anchor_areas=None):
        
        self.anchor_areas = anchor_areas
        self.train, self.val = load_data_pikachu(1)
        if anchor_areas is not None:
            encoder = DataEncoder(anchor_areas=anchor_areas)
        else:
            encoder = DataEncoder()            
        self.encoder = encoder 
        
    def __len__(self):
        return 900
        
    def __getitem__(self, i):
        try:
            b = self.train.next()
        except StopIteration:
            self.train.reset()
            b = self.train.next()
            
        image = b.data[0].asnumpy()
        bbox = b.label[0].asnumpy()[:, 0, 1:] * 256
        label = b.label[0].asnumpy()[:, 0, 0]
    
        image, bbox, label = torch.from_numpy(image), torch.from_numpy(bbox), torch.from_numpy(label)   
        
        encoded = self.encoder.encode(bbox, label, torch.Tensor([256, 256]))
        
        loc_target, cls_target = encoded
        
        return (image / 255)[0], (loc_target, cls_target)
        
    
    def collate_func(self, batch):

        images = [b[0][0] for b in batch]
        bbox = [b[1] for b in batch]
        labels = [b[2] for b in batch]
        
        
        encoded = [self.encoder.encode(bb, l, torch.Tensor([256, 256])) for bb, l in zip(bbox, labels)]
        
        
        loc_target = [l[0] for l in encoded]
        cls_target = [l[1] for l in encoded]
        

        return torch.stack(images) / 255, torch.stack(loc_target), torch.stack(cls_target)
        
        
def down_sample(in_channels,out_channels):
    
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1), 
        nn.BatchNorm2d(out_channels), 
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1), 
        nn.BatchNorm2d(out_channels), 
        nn.ReLU(),
        nn.MaxPool2d(2,2)       
    )

class SimpleSSD(nn.Module):
    
    
    def __init__(self, n_cls=1, num_anchors=9):
        super(SimpleSSD, self).__init__()
        
        self.n_cls = n_cls
        self.num_anchors = num_anchors
        
        
        # Base CNN (think resnet/vgg or other base network)
        self.step1 = down_sample(3, 128)
        self.step2 = down_sample(128, 128)
        self.step3 = down_sample(128, 128)
        self.step4 = down_sample(128, 128)
        self.step5 = down_sample(128, 128)


        self.cls_head1 = nn.Conv2d(128, self.num_anchors * self.n_cls , 3, padding=1)
        self.bbox_head1 = nn.Conv2d(128, self.num_anchors *4, 3, padding=1)
        
        self.cls_head2 = nn.Conv2d(128, self.num_anchors * self.n_cls , 3, padding=1)
        self.bbox_head2 = nn.Conv2d(128,self.num_anchors * 4,3, padding=1)
        
        self.cls_head3 = nn.Conv2d(128, self.num_anchors * self.n_cls , 3, padding=1)
        self.bbox_head3 = nn.Conv2d(128, self.num_anchors * 4,3, padding=1)
        
        
    def forward(self, x):

        step1=self.step1(x)
        step2=self.step2(step1)

        step3=self.step3(step2)
        step4 = self.step4(step3)
        step5 = self.step5(step4)
                
        cls1 = self.cls_head1(step3)
        bbox1 = self.bbox_head1(step3)
        
        cls2 = self.cls_head2(step4)
        bbox2 = self.bbox_head2(step4)
        
        cls3 = self.cls_head3(step5)
        bbox3 = self.bbox_head3(step5) 

    
        cls1 = cls1.permute(0,2,3,1).contiguous().view(x.size(0), -1, self.n_cls)
        cls2 = cls2.permute(0,2,3,1).contiguous().view(x.size(0), -1, self.n_cls)
        cls3 = cls3.permute(0,2,3,1).contiguous().view(x.size(0), -1, self.n_cls)
        
        bbox1 = bbox1.permute(0,2,3,1).contiguous().view(x.size(0), -1, 4)
        bbox2 = bbox2.permute(0,2,3,1).contiguous().view(x.size(0), -1, 4)
        bbox3 = bbox3.permute(0,2,3,1).contiguous().view(x.size(0), -1, 4)
      
        cls_pred = torch.cat([cls1, cls2, cls3], dim=1)
        bbox_pred = torch.cat([bbox1, bbox2, bbox3], dim=1)
        
        return (bbox_pred, cls_pred)
                

    

In [43]:
pikachu_ds =PikachuDataset(anchor_areas=[30*30, 50*50, 90*90])
pikachu_dl = DataLoader(pikachu_ds, batch_size=2, collate_fn=pikachu_ds.collate_func)

# for b in pikachu_dl:
#     break
# import pandas as pd  
# b[0].shape
# b[1].shape
# pd.DataFrame(b[2][0].cpu().numpy())[0].value_counts()
# # b[0].shape

# for b in pikachu_dl:
#     break
    
# # b[0].shape
# pikachu_ds[0]



In [44]:
model= SimpleSSD(1)
model = model.to(device)
criterion = FocalLoss(num_classes=1)



In [45]:
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# collect = []


# for epoch in range(2): 
#     for i, b in enumerate(pikachu_dl):
#         logs = {}
#         optimizer.zero_grad()
#         image, bounding_boxes, labels = b
#         image = image.to(device)
#         bounding_boxes = bounding_boxes.to(device)
#         labels = labels.to(device)
#         loc_pred, cls_pred = model(image)
#         total_loss = criterion(loc_pred, bounding_boxes, cls_pred, labels)
#         total_loss.backward()
#         optimizer.step()
#         collect.append([total_loss.detach().cpu().numpy()])
#         logs['loss'] = total_loss.item()
#         if i % 10 == 0:
#             plot_losses.update(logs)
#             plot_losses.draw()


# Visualize

In [46]:
from skorch.net import NeuralNet

net = NeuralNet(model,
                criterion=FocalLoss, 
                criterion__num_classes=1,
                optimizer=torch.optim.Adam,
                lr=0.0001,
                batch_size=8, device='cuda')

net.fit(pikachu_ds)


> <ipython-input-42-e207751f2a84>(69)forward()
-> batch_size, num_boxes = cls_targets.size()
(Pdb) l
 64  	        loc_preds, cls_preds = pred
 65  	        loc_targets, cls_targets = target
 66  	
 67  	        import pdb; pdb.set_trace()
 68  	
 69  ->	        batch_size, num_boxes = cls_targets.size()
 70  	        pos = cls_targets > 0  # [N,#anchors]
 71  	        num_pos = pos.data.long().sum()
 72  	
 73  	        ################################################################
 74  	        # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets)
(Pdb) loc_targets
tensor([[[ 8.8488,  2.8443,  0.5010, -0.1376],
         [ 7.0233,  2.2575,  0.2699, -0.3687],
         [ 5.5744,  1.7918,  0.0389, -0.5997],
         ...,
         [-0.2851, -1.6236, -1.2908, -0.5431],
         [-0.2263, -1.2887, -1.5218, -0.7741],
         [-0.1796, -1.0228, -1.7529, -1.0052]],

        [[ 1.8082,  2.8577,  0.8967,  0.3911],
         [ 1.4352,  2.2681,  0.6656,  0.1601],
         [ 1.1391,  1.8002, 

(Pdb) loc_preds
tensor([[[-0.0456, -0.1506,  0.2617, -0.5339],
         [-0.0064,  0.1060, -0.1955, -0.0417],
         [ 0.2040,  0.3076, -0.0858, -0.1872],
         ...,
         [-0.0357, -0.2670,  0.0439,  0.0329],
         [-0.0967, -0.0485, -0.0843, -0.0971],
         [-0.0953,  0.0049,  0.2119,  0.1706]],

        [[ 0.2547, -0.4716,  0.3697, -0.4373],
         [ 0.0782,  0.0124, -0.1051, -0.0122],
         [-0.0499,  0.1857, -0.0480, -0.1715],
         ...,
         [ 0.0014, -0.1792,  0.0294, -0.1873],
         [-0.1598, -0.3353,  0.0073, -0.1370],
         [-0.2038,  0.0189,  0.3149,  0.0304]],

        [[ 0.1455, -0.3014,  0.2963, -0.6725],
         [ 0.0826, -0.0351, -0.5331, -0.2374],
         [ 0.3603,  0.0526,  0.1720, -0.1059],
         ...,
         [ 0.1299, -0.2406,  0.1025, -0.0623],
         [-0.3832, -0.1602, -0.4046, -0.2097],
         [-0.1315,  0.2797,  0.3770,  0.0726]],

        ...,

        [[ 0.1398, -0.2267,  0.3478, -0.3984],
         [-0.1103,  0.1563, -

(Pdb) cls_targets.shape
torch.Size([8, 12096])
(Pdb) cls_targets[0]
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0')
(Pdb) pos.shape
torch.Size([8, 12096])
(Pdb) pos[0]
tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0', dtype=torch.uint8)
(Pdb) l
 71  	        num_pos = pos.data.long().sum()
 72  	
 73  	        ################################################################
 74  	        # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets)
 75  	        ################################################################
 76  ->	        mask = pos.unsqueeze(2).expand_as(loc_preds)       # [N,#anchors,4]
 77  	        masked_loc_preds = loc_preds[mask].view(-1,4)      # [#pos,4]
 78  	        masked_loc_targets = loc_targets[mask].view(-1,4)  # [#pos,4]
 79  	        loc_loss = F.smooth_l1_loss(masked_loc_preds, masked_loc_targets, size_average=False)
 80  	
 81  	        ################################################################
(Pdb) loc_preds.shape
torch.Size([8, 12096

BdbQuit: 

In [None]:
# preds = net.forward(pikachu_ds)

preds

In [38]:
from torchvision.transforms import Resize, ToPILImage, ToTensor
it = iter(pikachu_dl)
b = next(it)

im = ToTensor()(Resize((256, 256))(ToPILImage()(torch.cat([b[0][0], b[0][0]], dim=1))))
image = im.unsqueeze(0)

enc = DataEncoder(anchor_areas=[30*30, 50*50, 90*90])
loc_pred, cls_pred = model(image.cuda())

# i = 0
# bbspred, labelpred, score  = enc.decode(
#     loc_pred[i].float().cpu(), 
#     cls_pred[i].float().cpu(), 
#     torch.Tensor([256, 256]).float().cpu()
# )

# image_to_show = np.moveaxis(
#     image[i].detach().cpu().numpy(),0, 2)

# matched_anchors_on_image = ia.BoundingBoxesOnImage(
#     [ia.BoundingBox(*b) for b in bbspred.detach().cpu().numpy()], shape=(256, 256))

# image_to_show = matched_anchors_on_image.draw_on_image(image_to_show, thickness=2)
# plt.imshow(image_to_show)
# plt.title('score ' + str(score))

IndexError: tuple index out of range

### Exercises

-  Load a custom image with pikachu( or many ) and try to make predictiosn with the network and visualize the result
-  Can you think of anything that could confuse our detector? yellow dots ?
- Currently the code is not really modular, try to make it nice by splitting it into logical parts
    - Base feature extractor Module
    - Head Creator module
- Currently the detection/cls HEADS are very simple (just one CONV layer) they can be more complex. Try using more convolutions, check other architectures how its done

- Can you use our network to train using some new data for instance:
    - https://www.kaggle.com/tomluther/ships-in-google-earth
    - https://www.kaggle.com/aruchomu/data-for-yolo-v3-kernel
    - https://www.kaggle.com/dataturks/face-detection-in-images
    - https://www.kaggle.com/dataturks/face-dataset-with-age-emotion-ethnicity  
 You will need to create a data loader/data sets similar as we did for the pikachu loader. 


In [272]:
def py_sigmoid_focal_loss(pred,
                          target,
                          weight,
                          gamma=2.0,
                          alpha=0.25,
                          reduction='mean'):
    pred_sigmoid = pred.sigmoid()
    target = target.type_as(pred)
    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
    weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
    weight = weight * pt.pow(gamma)
    loss = F.binary_cross_entropy_with_logits(
        pred, target, reduction='none') * weight
    reduction_enum = F._Reduction.get_enum(reduction)
    # none: 0, mean:1, sum: 2
    if reduction_enum == 0:
        return loss
    elif reduction_enum == 1:
        return loss.mean()
    elif reduction_enum == 2:
        return loss.sum()
    
    
def focal_loss(x, y, num_class=2):
    '''Focal loss.
    Args:
      x: (tensor) sized [N,D].
      y: (tensor) sized [N,].
    Return:
      (tensor) focal loss.
    '''
    alpha = 0.25
    gamma = 2

    t = one_hot_embedding(y.data.cpu(), num_class)  # [N,21]
#     t = t[:,1:]  # exclude background
#     t = Variable(t).cuda()  # [N,20]


    p = x.sigmoid().detach()
    pt = p*t + (1-p)*(1-t)         # pt = p if t > 0 else 1-p
    w = alpha*t + (1-alpha)*(1-t)  # w = alpha if t > 0 else 1-alpha
    w = w * (1-pt).pow(gamma)
    return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)

In [214]:
pred = torch.tensor([0, 0.1, 0, 1.0])
target = torch.tensor([0,1,1,0])

py_sigmoid_focal_loss(pred, target, weight=torch.tensor([1.0,1,1,1]))

tensor(0.1840)

In [215]:
x = torch.rand([10, 2])
y = torch.rand([10, 2]).round().int().float()
x

x = torch.tensor([[10000.0, 0.0, 10]])
y = torch.tensor([[1, 0.0, 1]])

py_sigmoid_focal_loss(x, y, 1)

tensor(0.0433)

In [216]:
pred = torch.stack([torch.ones_like(pred) - pred, pred]).view(-1, 2)

In [222]:
target = target.reshape(-1, 1)
focal_loss(pred, target)

RuntimeError: The size of tensor a (4) must match the size of tensor b (0) at non-singleton dimension 1

torch.Size([1, 3]) torch.Size([1])
> <ipython-input-254-ff27f5bc0c68>(41)focal_loss()
-> p = x.sigmoid().detach()
(Pdb) l
 36  	    t = t[:,1:]  # exclude background
 37  	#     t = Variable(t).cuda()  # [N,20]
 38  	    import pdb
 39  	    pdb.set_trace()
 40  	
 41  ->	    p = x.sigmoid().detach()
 42  	    pt = p*t + (1-p)*(1-t)         # pt = p if t > 0 else 1-p
 43  	    w = alpha*t + (1-alpha)*(1-t)  # w = alpha if t > 0 else 1-alpha
 44  	    w = w * (1-pt).pow(gamma)
 45  	    return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)
[EOF]
(Pdb) x.shape
torch.Size([1, 3])
(Pdb) y.shape
torch.Size([1])
(Pdb) n
> <ipython-input-254-ff27f5bc0c68>(42)focal_loss()
-> pt = p*t + (1-p)*(1-t)         # pt = p if t > 0 else 1-p
(Pdb) n
> <ipython-input-254-ff27f5bc0c68>(43)focal_loss()
-> w = alpha*t + (1-alpha)*(1-t)  # w = alpha if t > 0 else 1-alpha
(Pdb) n
> <ipython-input-254-ff27f5bc0c68>(44)focal_loss()
-> w = w * (1-pt).pow(gamma)
(Pdb) n
> <ipython-input-254-ff27f



ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 3]))
> <ipython-input-254-ff27f5bc0c68>(45)focal_loss()
-> return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)
(Pdb) n
--Return--
> <ipython-input-254-ff27f5bc0c68>(45)focal_loss()->None
-> return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)
(Pdb) n
ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 3]))
> <ipython-input-255-fb104846a17c>(5)<module>()
-> focal_loss(x, y, n=3)
(Pdb) n
--Return--
> <ipython-input-255-fb104846a17c>(5)<module>()->None
-> focal_loss(x, y, n=3)
(Pdb) n
ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 3]))
> /home/i008/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py(2961)run_code()
-> exec(code_obj, self.user_global_ns, self.user_ns)
(Pdb) n
> /home/i008/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py(29

ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 3]))

In [None]:
(Pdb) masked_cls_preds.shape
torch.Size([96382, 1])
(Pdb) cls_targets[pos_neg].shape
torch.Size([96382])

In [274]:
x = torch.tensor([[0.9, 0, 0]])
y = torch.tensor([0])


# t = one_hot_embedding(torch.tensor(y), 3) 
# t = one_hot_embedding(y, 1+1) # [N,21]
# print(t)
# t = t[:,1:]  # exclude background

In [275]:
focal_loss(x, y, num_class=3)

> <ipython-input-272-2febcda2fe0e>(41)focal_loss()
-> p = x.sigmoid().detach()
(Pdb) c




tensor(0.2671)

In [8]:

def box_iou(box1, box2, order='xyxy'):
    '''Compute the intersection over union of two set of boxes.

    The default box order is (xmin, ymin, xmax, ymax).

    Args:
      box1: (tensor) bounding boxes, sized [N,4].
      box2: (tensor) bounding boxes, sized [M,4].
      order: (str) box order, either 'xyxy' or 'xywh'.

    Return:
      (tensor) iou, sized [N,M].

    Reference:
      https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py
    '''
    if order == 'xywh':
        box1 = change_box_order(box1, 'xywh2xyxy')
        box2 = change_box_order(box2, 'xywh2xyxy')

    N = box1.size(0)
    M = box2.size(0)

    lt = torch.max(box1[:,None,:2], box2[:,:2])  # [N,M,2]
    rb = torch.min(box1[:,None,2:], box2[:,2:])  # [N,M,2]

    wh = (rb-lt+1).clamp(min=0)      # [N,M,2]
    inter = wh[:,:,0] * wh[:,:,1]  # [N,M]

    area1 = (box1[:,2]-box1[:,0]+1) * (box1[:,3]-box1[:,1]+1)  # [N,]
    area2 = (box2[:,2]-box2[:,0]+1) * (box2[:,3]-box2[:,1]+1)  # [M,]
    iou = inter / (area1[:,None] + area2 - inter)
    return iou

In [32]:
b1 = torch.tensor([[10.0,10.0,20,20],[8,8,15,15], [3,3,10,10], [9,9,20,20], [30,30,40,40]])
b2 = torch.tensor([[10.0, 10,23, 20],[12,12,30,30], [28,28,35, 35]])

In [34]:
ious = box_iou(b1, b2)

In [35]:
ious


tensor([[0.7857, 0.2020, 0.0000],
        [0.1978, 0.0391, 0.0000],
        [0.0046, 0.0000, 0.0000],
        [0.6836, 0.1910, 0.0000],
        [0.0000, 0.0021, 0.2416]])

In [37]:
m, ix = ious.max(1)

In [38]:
b2[ix]

tensor([[10., 10., 23., 20.],
        [10., 10., 23., 20.],
        [10., 10., 23., 20.],
        [10., 10., 23., 20.],
        [28., 28., 35., 35.]])

In [47]:
ix

tensor([0, 0, 0, 0, 2])

In [51]:
b1[ix]


tensor([[10., 10., 20., 20.],
        [10., 10., 20., 20.],
        [10., 10., 20., 20.],
        [10., 10., 20., 20.],
        [ 3.,  3., 10., 10.]])