In [None]:
# Computer Vision HW6
# Author: Kai Liao
# Trained on Google Cloud

In [1]:
import os
import numpy as np
from glob import glob
from PIL import Image
from scipy.io import loadmat
from matplotlib.path import Path

import torch
import torchvision.transforms as transforms

In [2]:
class EgoHands(torch.utils.data.Dataset):
    """EgoHands frames paired with per-hand bounding boxes for detection.

    ``path`` is scanned for sub-folders; every sub-folder that contains a
    ``*.mat`` file contributes its ``*.jpg`` frames (sorted by name) and the
    rows of the ``'polygons'`` entry of that ``.mat`` file, one row per frame.

    Each item is a tuple ``(img, target)``:

    * ``img``    -- the frame after ``transforms.ToTensor()``.
    * ``target`` -- dict with ``"boxes"`` (float32, one
      ``[xmin, ymin, xmax, ymax]`` row per hand outline) and ``"labels"``
      (int64, all ones).

    Args:
        path: root directory holding the per-video folders.
        transform: accepted for interface compatibility but currently NOT
            used; the image is always converted with ``ToTensor`` only.
            TODO: use suitable (image, target) transformations.
    """

    def __init__(self, path, transform=None):
        self.path = path
        self.imgs = []
        self.polygons = []
        for folder in sorted(glob(os.path.join(self.path, "*"))):
            # Entries without an annotation file (stray files, empty folders)
            # cannot contribute samples, so skip them instead of crashing.
            mat_files = sorted(glob(os.path.join(folder, "*.mat")))
            if not mat_files:
                continue

            frames = sorted(glob(os.path.join(folder, "*.jpg")))
            annotations = loadmat(mat_files[0])['polygons'][0]

            # Images and polygons are matched purely by position, so a count
            # mismatch would silently shift every later sample.
            if len(frames) != len(annotations):
                raise ValueError(
                    "%s: found %d images but %d polygon entries"
                    % (folder, len(frames), len(annotations)))

            self.imgs += frames
            self.polygons.extend(annotations)

        # TODO: use suitable transformations
        self.transform = transforms.Compose([transforms.ToTensor()])

    @staticmethod
    def _mask_to_box(mask):
        """Return ``[xmin, ymin, xmax, ymax]`` of the True pixels of a 2-D mask."""
        rows, cols = np.where(mask)
        return [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]

    def _load_sample(self, index):
        """Build ``(img, target)`` for one frame, or ``None`` if it is unusable.

        A frame is unusable when it has no hand outline at all or when one of
        its outlines covers no pixel of the image.
        """
        # Close the file handle as soon as the pixels are copied out.
        with Image.open(self.imgs[index]) as frame:
            img = np.array(frame)

        height, width = img.shape[:2]
        x, y = np.meshgrid(np.arange(width), np.arange(height))
        points = np.vstack((x.flatten(), y.flatten())).T

        boxes = []
        for polygon in self.polygons[index]:
            if polygon.size == 0:
                continue
            # Rasterise the outline: True for every pixel inside the polygon.
            inside = Path(polygon).contains_points(points).reshape(height, width)
            if not inside.any():
                return None
            boxes.append(self._mask_to_box(inside))

        if not boxes:
            return None

        target = {
            "boxes": torch.as_tensor(boxes, dtype=torch.float32),
            "labels": torch.ones((len(boxes),), dtype=torch.int64),
        }

        if self.transform:
            img = self.transform(img)

        return img, target

    def __getitem__(self, index):
        """Return the sample at ``index``, or the next usable one after it.

        Frames without a usable hand annotation are replaced by the next
        frame that has one (wrapping around at the end), so an image may be
        used multiple times during training; the effect is perhaps negligible
        given the size of the dataset. Note that the substitute is chosen in
        this dataset's own index space, so under a ``Subset`` wrapper it may
        lie outside that subset.

        Raises:
            IndexError: if ``index`` is out of range.
            RuntimeError: if no frame in the dataset is usable.
        """
        total = len(self.imgs)
        if not -total <= index < total:
            raise IndexError("EgoHands index out of range")

        for offset in range(total):
            sample = self._load_sample((index + offset) % total)
            if sample is not None:
                return sample

        raise RuntimeError("no image in the dataset has a usable hand annotation")

    def __len__(self):
        return len(self.imgs)

In [3]:
import transforms as T
import utils

def get_transform(train):
    """Assemble the ``T.Compose`` pipeline for a dataset split.

    The pipeline always starts with ``T.ToTensor()``; when ``train`` is
    truthy a ``T.RandomHorizontalFlip(0.5)`` step is appended.
    """
    pipeline = [T.ToTensor()]
    if train:
        pipeline += [T.RandomHorizontalFlip(0.5)]
    return T.Compose(pipeline)


# set up model
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Start from the pretrained detector and replace its box predictor with a
# freshly initialised head sized for our classes.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
num_classes = 2
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


# define dataset: 70/30 train/val
# Two dataset objects over the same folder so each split can carry its own
# transform; the split itself is done on a shared index permutation.
train_set = EgoHands('_LABELLED_SAMPLES/', get_transform(train=True))
val_set = EgoHands('_LABELLED_SAMPLES/', get_transform(train=False))

# Seed the permutation so re-running the notebook yields the same split.
split_seed = 0
split_rng = torch.Generator().manual_seed(split_seed)
indices = torch.randperm(len(train_set), generator=split_rng).tolist()

train_size = int(len(train_set) * 0.7)
# Slice with an explicit positive boundary: `indices[:-train_size]` would
# invert the split when train_size is 0.
val_size = len(indices) - train_size
val_set = torch.utils.data.Subset(val_set, indices[:val_size])
train_set = torch.utils.data.Subset(train_set, indices[val_size:])

# define dataloader
train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)

val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

In [4]:
from engine import train_one_epoch, evaluate
import utils

def training(num_epochs=1, print_freq=120):
    """Fine-tune the notebook-level ``model`` on ``train_loader``.

    Uses the module-level ``model``, ``train_loader`` and ``device``. Only
    parameters with ``requires_grad`` set are optimised, with SGD
    (lr 0.005, momentum 0.9, weight decay 0.0005) and a ``StepLR`` schedule
    that multiplies the learning rate by 0.1 every 3 epochs.

    Args:
        num_epochs: number of passes over ``train_loader`` (default 1).
        print_freq: value forwarded to ``train_one_epoch`` as ``print_freq``
            (default 120).
    """
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    for epoch in range(num_epochs):
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq=print_freq)
        # Advance the schedule once per epoch, after the optimizer updates.
        lr_scheduler.step()

    print("Finished Training")

In [5]:
# Run the fine-tuning loop defined in training(); progress is logged below.
training()

Epoch: [0]  [   0/1680]  eta: 4:10:42  lr: 0.000010  loss: 1.1070 (1.1070)  loss_classifier: 0.8615 (0.8615)  loss_box_reg: 0.1972 (0.1972)  loss_objectness: 0.0380 (0.0380)  loss_rpn_box_reg: 0.0104 (0.0104)  time: 8.9539  data: 7.1215  max mem: 2478
Epoch: [0]  [ 120/1680]  eta: 1:31:31  lr: 0.000609  loss: 0.3827 (0.5668)  loss_classifier: 0.1414 (0.2852)  loss_box_reg: 0.2023 (0.2162)  loss_objectness: 0.0093 (0.0450)  loss_rpn_box_reg: 0.0144 (0.0205)  time: 3.5910  data: 3.1086  max mem: 2743
Epoch: [0]  [ 240/1680]  eta: 1:25:25  lr: 0.001209  loss: 0.2114 (0.4101)  loss_classifier: 0.1006 (0.1972)  loss_box_reg: 0.0833 (0.1609)  loss_objectness: 0.0055 (0.0317)  loss_rpn_box_reg: 0.0143 (0.0203)  time: 3.3144  data: 2.8603  max mem: 2743
Epoch: [0]  [ 360/1680]  eta: 1:17:49  lr: 0.001808  loss: 0.1509 (0.3465)  loss_classifier: 0.0695 (0.1597)  loss_box_reg: 0.0594 (0.1331)  loss_objectness: 0.0030 (0.0323)  loss_rpn_box_reg: 0.0122 (0.0214)  time: 3.0920  data: 2.6411  max me

In [27]:
# evaluate
# Switch to inference mode and run the detector on 5 randomly drawn
# validation samples (drawn with replacement).
model.eval()
# `high` is exclusive in np.random.randint, so pass len(val_set) itself;
# `len(val_set) - 1` would make the last sample unreachable.
sample_inds = np.random.randint(low=0, high=len(val_set), size=5)
predictions = []

with torch.no_grad():
    for ind in sample_inds:
        # The model expects a list of image tensors and returns a list of
        # prediction dicts, one per image.
        predictions.append(model([val_set[ind][0].to(device)]))

In [36]:
# sample prediction
# Show the detector output next to the ground-truth boxes for one of the
# sampled validation images.
shown = 3
picked = sample_inds[shown]
print("sample inference on the", picked, "th image in the validation set")
print(predictions[shown])
truth = val_set[picked][1]
print(truth['boxes'])

sample inference on the 1086 th image in the validation set
[{'boxes': tensor([[640.1021, 618.8013, 898.8427, 718.0145],
        [624.5226, 219.9917, 822.4734, 431.0540],
        [304.4763, 173.6108, 502.3412, 416.6602],
        [291.0628, 690.9357, 414.8874, 717.5993],
        [630.2924, 681.4257, 716.7034, 718.1188]], device='cuda:0'), 'labels': tensor([1, 1, 1, 1, 1], device='cuda:0'), 'scores': tensor([0.9997, 0.9989, 0.9987, 0.9949, 0.0725], device='cuda:0')}]
tensor([[304., 698., 414., 718.],
        [648., 624., 894., 718.],
        [632., 240., 803., 423.],
        [325., 217., 492., 411.]])


In [37]:
# save
# Serialises the whole model object (not just its state_dict) to ./model.pth.
# NOTE(review): loading this file presumably needs the same model class
# definitions to be importable; saving model.state_dict() would be more
# portable -- confirm before relying on this checkpoint elsewhere.
torch.save(model, "./model.pth")