In [94]:
import os
import numpy as np
import torch
from PIL import Image
import torch.nn.functional as nnf
import torchvision.transforms.functional as F
sample_mask = "/mnt/hdd/eric/.tmp_ipy/15.Lab_Detection/05.Training/Segmentation/00.data_penn/PennFudanPed/PedMasks/FudanPed00001_mask.png"
mask = Image.open(sample_mask)
mask = np.array(mask)
mask.shape

(536, 559)

In [95]:
obj_ids = np.unique(mask)

In [96]:
obj_ids

array([0, 1, 2], dtype=uint8)

In [97]:
obj_ids = obj_ids[1:]

In [98]:
masks = mask == obj_ids[:,None,None]

In [99]:
obj_ids[:,None,None]

array([[[1]],

       [[2]]], dtype=uint8)

In [100]:
obj_ids[:,None,None]

array([[[1]],

       [[2]]], dtype=uint8)

In [101]:
masks.shape

(2, 536, 559)

In [102]:
import torchvision
torchvision.__version__

'0.14.1+cu117'

In [103]:
torch.__version__

'1.13.1+cu117'

In [104]:
class PennFudanDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    def __getitem__(self, idx):
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = Image.open(mask_path)
        # convert the PIL Image into a numpy array
        mask = np.array(mask)
        
        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # first id is the background, so remove it
        obj_ids = obj_ids[1:]

        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.nonzero(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        # convert everything into a torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)


        # if self.transforms is not None:
        # img, target = self.transforms(img, target)
        if self.transforms is not None:
            img = self.transforms(img)

        #-- mask resize 
        resize = torchvision.transforms.Resize((256,256))
        masks = resize(masks)
        masks = torch.argmax(masks, dim=0)
            
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        return img, target

    def __len__(self):
        return len(self.imgs)

In [105]:
root_ = "/mnt/hdd/eric/.tmp_ipy/15.Lab_Detection/05.Training/Segmentation/00.data_penn/PennFudanPed"
from torchvision import transforms as T

def get_transform(train):
    transforms = []
    transforms.append(T.Resize((256,256)))
    transforms.append(T.PILToTensor())
    transforms.append(T.ConvertImageDtype(torch.float))
    
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    
    return T.Compose(transforms)

#--------------
dataset_ = PennFudanDataset(root_, get_transform(train=True))

In [106]:
dataset_.__getitem__(9)[0].shape

torch.Size([3, 256, 256])

In [107]:
dataset_.__getitem__(5)[1]['masks'].shape

torch.Size([256, 256])

In [108]:
m1 = dataset_.__getitem__(15)[1]['masks']
m2 = dataset_.__getitem__(10)[1]['masks']
m3 = dataset_.__getitem__(3)[1]['masks']

print(m1.shape,m2.shape,m3.shape)

torch.Size([256, 256]) torch.Size([256, 256]) torch.Size([256, 256])


In [109]:
#dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))

def collate_fn_v(batch):
    images = [ i[0] for i in batch]
    masks = [ i[1]['masks'] for i in batch]

    struct_ = {
        'images' : torch.stack(images),
        'masks' : torch.stack(masks)
    }

    return struct_

import utils
collate_fn_z = utils.collate_fn

data_loader = torch.utils.data.DataLoader(
dataset_, batch_size=2, shuffle=True, num_workers=4,
collate_fn=collate_fn_z)
# For Training

In [112]:
data_  = next(iter(data_loader))

In [119]:
data_[0][0].shape
data_[0][1].shape

torch.Size([3, 256, 256])

In [121]:
data_[1]

({'boxes': tensor([[164.,  52., 303., 345.]]),
  'labels': tensor([1]),
  'masks': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]),
  'image_id': tensor([32]),
  'area': tensor([40727.]),
  'iscrowd': tensor([0])},
 {'boxes': tensor([[  5.,  37., 101., 323.],
          [100.,  25., 205., 322.]]),
  'labels': tensor([1, 1]),
  'masks': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]),
  'image_id': tensor([169]),
  'area': tensor([27456., 31185.]),
  'iscrowd': tensor([0, 0])})

In [122]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 2  # 1 class (person) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# load a pre-trained model for classification and return
# only the features
backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
# ``FasterRCNN`` needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 1280

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# ``OrderedDict[Tensor]``, and in ``featmap_names`` you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(
    featmap_names=['0'],
    output_size=7,
    sampling_ratio=2
)

# put the pieces together inside a Faster-RCNN model
model = FasterRCNN(
    backbone,
    num_classes=2,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler
)
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /mnt/hdd/eric/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /mnt/hdd/eric/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth


  0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [135]:
import utils


model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
# dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
# data_loader = torch.utils.data.DataLoader(
#     dataset,
#     batch_size=2,
#     shuffle=True,
#     num_workers=4,
#     collate_fn=utils.collate_fn
# )

# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
print(output)

# For inference
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)  # Returns predictions
print(predictions[0])

{'loss_classifier': tensor(0.1991, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.0080, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.1421, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0506, grad_fn=<DivBackward0>)}
{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>)}


In [138]:
import segmentation_models_pytorch as smp

model = smp.DeepLabV3Plus(
    encoder_name="resnet152",        # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",     # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,                  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=7,                      # model output channels (number of classes in your dataset)
)

Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /mnt/hdd/eric/.cache/torch/hub/checkpoints/resnet152-b121ed2d.pth


  0%|          | 0.00/230M [00:00<?, ?B/s]