In [1]:
import json
import os
import random
import numpy as np
import torch
import torch.utils.data
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from torch.autograd import Variable

In [2]:
carpath = "nuimages/train/car.json"
humanpath = "nuimages/train/human.json"

This is a sample data loader

In [3]:
def load_image(path, classes):
    with open(path, 'r', encoding='utf8') as fp:
            json_data = json.load(fp)
    imgs = []
    for sample_data in json_data['smaple_data']:
        img = {}
        img['height'] = sample_data['height']
        img['width'] = sample_data['width']
        for i in range(len(sample_data['filename'])):
                    if sample_data['filename'][i] == 'n' and sample_data['filename'][i+1] == '0':
                        break
        sample_path = sample_data['filename'][i:]
        if(classes == 0):
            img['label'] = 0 #car
            img['path'] = 'nuimages/train/car/' + sample_path
        else:
            img['label'] = 1 #human
            img['path'] = 'nuimages/train/human/' + sample_path
        img['annotations'] = []
        for annotation in json_data['annotations']:
            if(annotation['sample_data_token'] == sample_data['token']):
                img['annotations'].append(annotation)
        imgs.append(img)
    return imgs
car_imgs = []
car_imgs = load_image(carpath, 0)
human_imgs = []
human_imgs = load_image(humanpath, 0)
car_imgs

[{'height': 900,
  'width': 1600,
  'label': 0,
  'path': 'nuimages/train/car/n008-2018-05-24-12-02-23-0400__CAM_FRONT_RIGHT__1527178879169956.jpg',
  'annotations': [{'token': '00004dd38b934690965746970fc39e1f',
    'category_token': 'fd69059b62a3469fbaef25340c0eab7f',
    'bbox': [515, 425, 629, 467],
    'mask': {'size': [900, 1600],
     'counts': 'blFSPzFTbDAwMDBPMTAxTjEwMDAwTzJPMDAwMDFPMDAwMDAwMU8wMDAwMDAxTzFPMU8wMDFPMU8yTjNNNGVUT0BqajBtMDAwMU8wMDAwMDAwMDAwMDFPTzJPMDAwMDAwME8xMDFPMDAwMDBPMTAwMDFPMDAwTzEwMDAwMDFPMDAwMDBPMTAwMDAwMDAwMDFPMDAwTzEwMDAwMDAwME8xMDAwMU8yTjJON0gzTTRMW29kajA='},
    'attribute_tokens': ['9f65c1eaa74e4d5db46e87a34811e994'],
    'sample_data_token': '6f458d474f824067a850187b5ebb5f0c'},
   {'token': '06f87d245cae4ac9b8f1855de3b140db',
    'category_token': 'fd69059b62a3469fbaef25340c0eab7f',
    'bbox': [1472, 405, 1600, 596],
    'mask': {'size': [900, 1600],
     'counts': 'XldeWDE6ZGswOUg3TDNNNE0zTDRjVU9uTmNpMFAySTRLMk8yTjEwMU4xTzJOMTAwMDFPMDAxTjEwMDAwMDBP

## Custom Dataset

In [4]:
def Myloader(path):
    return Image.open(path).convert('RGB')

In [5]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None, loader=Myloader):
        self.data = data #data is image list
        self.transform = transform
        self.loader = loader

    def __getitem__(self, idx): #data is imgs[index]
        img_path = self.data[idx]['path']
        img = self.loader(img_path)
        obj_ids = self.data[idx]['annotations']
        boxes = []
        labels = []
        # read bbox and lable
        for obj in obj_ids: #bbox <int> [4] -- Annotated amodal bounding box. Given as [xmin, ymin, xmax, ymax].
            boxes.append(obj['bbox'])
            labels.append(self.data[idx]['label'])
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)
        area = (boxes[:, 3]-boxes[:,1]) * (boxes[:, 2]-boxes[:, 0])
        image_id = torch.tensor([idx])
        
        boxes = transforms.Pad([20,4], fill=0, padding_mode='constant')(boxes)
        #labels = transforms.Pad(20, fill=4, padding_mode='constant')(labels) #fill with 4, 4 is invalid value means there is no object
        target = {}
        target["boxes"] = boxes
        target["lables"] = labels
        target["image_id"] = image_id
        target["area"] = area

        img = self.transform(img)

        return img, target

    def __len__(self):
        return len(self.data)

## Extract features

We use pretrained Net to extract features for each images [resnext](https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/)

Firstly Normalization our image. All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225].

In [6]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
resnext_model = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl').to(device)
resnext_model.eval()

Using cache found in C:\Users\Jiarun/.cache\torch\hub\facebookresearch_WSL-Images_main


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1

Use Pytorch DataLoader to put our images to our Net

In [8]:
batch_size = 40
car_train = Dataset(car_imgs, transform=transform, loader=Myloader)
human_train = Dataset(human_imgs, transform=transform, loader=Myloader)
car_train_data =  DataLoader(dataset=car_train, batch_size=batch_size, shuffle=True, num_workers=0)
human_train_data =  DataLoader(dataset=human_train, batch_size=batch_size, shuffle=True, num_workers=0)

In [9]:

#device = torch.device("cpu")
Cuda = True
#list(enumerate(car_train_data, 0))
#
for i, image in enumerate(car_train_data, 0):
    image = Variable(image.to(device))
    with torch.no_grad():
        output = resnext_model(image).cuda()
    print(output[0])

RuntimeError: stack expects each tensor to be equal size, but got [21, 44] at entry 0 and [14, 44] at entry 1