In [1]:
import os
import numpy as np
from glob import glob
from PIL import Image
from scipy.io import loadmat
from matplotlib.path import Path

import torch
import torchvision.transforms as transforms

In [2]:
class EgoHands(torch.utils.data.Dataset):
    def __init__(self, path, transform=None):
        self.path = path
        folders = sorted(glob(os.path.join(self.path, "*")))
        self.imgs = []
        self.polygons = []
        for folder in folders:
            # Add images
            self.imgs += sorted(glob(os.path.join(folder, "*.jpg")))

            # Add polygons
            polygon_path = glob(os.path.join(folder, "*.mat"))[0]
            polygon = loadmat(polygon_path)['polygons'][0]
            for i in range(len(polygon)):
                self.polygons.append(polygon[i])

        # TODO: use suitable transformations
        self.transform = transforms.Compose([transforms.ToTensor()])


    def __getitem__(self, index):
        # Load image
        img = np.array(Image.open(self.imgs[index]))

        # Compute mask
        polygons = self.polygons[index]
        gt_mask = []
        x, y = np.meshgrid(
            np.arange(img.shape[1]), np.arange(img.shape[0]))
        x, y = x.flatten(), y.flatten()
        points = np.vstack((x, y)).T
        for i, polygon in enumerate(polygons):
            if polygon.size == 0:
                continue
            path = Path(polygon)
            grid = path.contains_points(points)
            grid = grid.reshape((*img.shape[:2]))
            gt_mask.append(np.expand_dims(grid, axis=-1))
        gt_mask = np.concatenate(gt_mask, axis=-1)
        

        # TODO: compute minimal bounding boxes
        target = {}
        boxes = []
        for i in range(gt_mask.shape[2]):
            pos = np.where(gt_mask[:,:,i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
            
        labels = torch.ones((gt_mask.shape[2],), dtype=torch.int64)
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        target["boxes"] = boxes
        target["labels"] = labels
            
        if self.transform:
            img = self.transform(img)

        return img, target

    def __len__(self):
        return len(self.imgs)

In [3]:
# dataset = EgoHands('_LABELLED_SAMPLES/')
# print(len(dataset))
# print(dataset[0])

In [4]:
# train_size = int(len(dataset) * 0.001)
# val_size = len(dataset) - train_size
# train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])
# print(len(train_set))
# print(len(val_set) == len(train_set))

In [5]:
import transforms as T

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)

In [6]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

num_classes = 2  
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [None]:
from engine import train_one_epoch, evaluate
import utils


def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = EgoHands('_LABELLED_SAMPLES/', get_transform(train=True))
    dataset_val = EgoHands('_LABELLED_SAMPLES/', get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    train_size = int(len(dataset) * 0.7)
    dataset = torch.utils.data.Subset(dataset, indices[:-train_size])
    dataset_val = torch.utils.data.Subset(dataset_val, indices[-train_size:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)

    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size=1, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 1

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
#         evaluate(model, data_loader_test, device=device)

    print("That's it!")

main()

Epoch: [0]  [  0/720]  eta: 10:42:59  lr: 0.000012  loss: 1.1140 (1.1140)  loss_classifier: 0.7690 (0.7690)  loss_box_reg: 0.1342 (0.1342)  loss_objectness: 0.1845 (0.1845)  loss_rpn_box_reg: 0.0263 (0.0263)  time: 53.5830  data: 2.8146
Epoch: [0]  [ 10/720]  eta: 8:39:41  lr: 0.000081  loss: 0.9639 (1.0193)  loss_classifier: 0.7059 (0.6881)  loss_box_reg: 0.1756 (0.1670)  loss_objectness: 0.1049 (0.1336)  loss_rpn_box_reg: 0.0263 (0.0307)  time: 43.9180  data: 0.2625
