This project applies what I learned from the Torchvision Object Detection Finetuning Tutorial to train a model that detects my dogs, Kal and Jinse.

I started by collecting photos of my two dogs in many different lighting conditions and positions. In the end I had about 80 photos.

I used CVAT.ai to annotate the images with segmentation masks and bounding boxes.

Steps to create this model:
1. Read annotations, images and masks from the PASCAL VOC 1.1 format
2. Create a Dataset and DataLoader for the training and test data
3. Pull a pre-trained model
4. Modify the last layer so that it identifies Kal and Jinse
5. Train the model to identify Kal and Jinse for a few epochs
6. Test the functionality of the model
7. Adjust as necessary based on the results

In [None]:
# Investigate our data: load one sample photo and the mask with the same file stem.
import matplotlib.pyplot as plt
from torchvision.io import read_image

# The photo itself.
image = read_image("Data/KalAndJinseIdentifying/Images/PXL_20220822_235544436.jpg")
# The segmentation mask PNG for that photo.
mask = read_image("Data/KalAndJinseIdentifying/SegmentationClass/PXL_20220822_235544436.png")

In [None]:
# Show the sample photo and its mask side by side in one figure.
figure = plt.figure(figsize=(8, 4))
panels = (("Image", image), ("Mask", mask))
for slot, (caption, tensor) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.title(caption)
    plt.axis("off")
    # Tensors are channel-first; move the channel axis last for imshow.
    plt.imshow(tensor.permute(1, 2, 0))

`__getitem__` should return the image and its target. The target is a dictionary with the keys boxes, masks, labels, image_id, area and iscrowd:

target = {}
target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size = F.get_size(img))
target["masks"] = tv_tensors.Mask(masks)
target["labels"] = labels
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd

In [None]:
import torch
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision.transforms.functional import InterpolationMode

from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks

from torchvision import transforms

# Load one photo together with its segmentation mask.
img = read_image("Data/KalAndJinseIdentifying/Images/PXL_20220907_111402520.jpg")
mask = read_image("Data/KalAndJinseIdentifying/SegmentationClass/PXL_20220907_111402520.png")

# Resize both to the same size. NEAREST interpolation copies existing pixel
# values, so the mask keeps its exact label colours.
resized_transform = transforms.Resize((438, 567), interpolation=InterpolationMode.NEAREST)
img = resized_transform(img)
mask = resized_transform(mask)

# Summarise the distinct colours in the mask (one row per colour) instead of
# printing every pixel, which produced ~250k lines for a 438x567 mask.
print(torch.unique(mask.flatten(1), dim=1).T)

# Collapse the colour mask into a single-channel label map:
#   channel 0 == 128 -> Kal   (label 1)
#   channel 1 == 128 -> Jinse (label 2)
# Starting from zeros means any other pixel value is treated as background.
kal_mask = (mask[0] == 128)
jinse_mask = (mask[1] == 128)
label_map = torch.zeros_like(mask[0])
label_map[kal_mask] = 1
label_map[jinse_mask] = 2
# Keep a leading channel dimension, i.e. [1, H, W].
mask = label_map[None]

# Collect the distinct labels present in the mask.
obj_ids = torch.unique(mask)

# Drop the background label (0) by value rather than by position, so a mask
# without any background pixel does not lose a real label.
obj_ids = obj_ids[obj_ids != 0]
num_objs = len(obj_ids)
print(obj_ids)
print(mask.shape)

# Build one binary mask per object.
# [:, None, None] reshapes the 1D obj_ids tensor to [N, 1, 1] so it broadcasts
# against the [1, H, W] label map, giving an [N, H, W] boolean tensor that is
# then stored as 0/1 values.
masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)
print(masks.shape)

# Get the boxes that bound the objects.
boxes = masks_to_boxes(masks)

# Pick the caption and colour by label so each dog always gets the same colour,
# even when only one of them is in the photo.
dog_styles = {1: ("Kal", "green"), 2: ("Jinse", "red")}
present_ids = obj_ids.tolist()
labels = [dog_styles[obj_id][0] for obj_id in present_ids]
box_colors = [dog_styles[obj_id][1] for obj_id in present_ids]

output_image = draw_bounding_boxes(img, boxes, labels, colors=box_colors, width=5)

plt.figure(figsize=(12, 12))
plt.imshow(output_image.permute(1, 2, 0))


In [None]:
import torch
import os

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

# Load one Penn-Fudan image and its mask.
img = read_image(os.path.join('Data/PennFudanPed', 'PNGImages', 'FudanPed00046.png'))
mask = read_image(os.path.join('Data/PennFudanPed', 'PedMasks', 'FudanPed00046_mask.png'))

# Collect the distinct values labelled in the mask (in this set, one value per person).
obj_ids = torch.unique(mask)

# Remove the first element, which is taken to be the background label.
obj_ids = obj_ids[1:]
num_objs = len(obj_ids)
print(obj_ids)
print(mask.shape)

# Build one binary mask per object.
# [:, None, None] reshapes the 1D obj_ids tensor so that it can be compared
# element-wise against the pixels of the mask, giving True where a pixel
# belongs to that object and False elsewhere. The booleans are then stored
# as 1s and 0s.
masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)
print(masks.shape)

# Get the boxes that bound the objects.
boxes = masks_to_boxes(masks)

tensor([1, 2, 3, 4], dtype=torch.uint8)
torch.Size([1, 438, 567])
torch.Size([4, 438, 567])


In [None]:
import torch
import os

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

class KalAndJinseDataset(torch.utils.data.Dataset):
    """Instance-segmentation dataset of Kal and Jinse photos.

    Images and masks are paired by their position in the sorted directory
    listings, so both directories must list matching files in the same order.

    Args:
        image_dir: Directory holding the photos.
        mask_dir: Directory holding the colour-coded mask images.
        transforms: Optional callable invoked as ``transforms(img, target)``
            that returns the transformed ``(img, target)`` pair.
    """

    # Label ids written into the targets (0 is reserved for background).
    KAL_LABEL = 1
    JINSE_LABEL = 2
    # Channel value that marks a dog in the colour-coded mask.
    MASK_VALUE = 128

    def __init__(self, image_dir, mask_dir, transforms=None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        # Sorted listings keep image i and mask i aligned.
        self.image_file_list = sorted(os.listdir(image_dir))
        self.mask_file_list = sorted(os.listdir(mask_dir))
        self.transforms = transforms

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_file_list)

    @classmethod
    def _split_instances(cls, mask):
        """Decode a colour-coded mask into object ids and binary masks.

        Args:
            mask: ``[C, H, W]`` tensor with at least two channels, where
                channel 0 == 128 marks Kal and channel 1 == 128 marks Jinse.

        Returns:
            ``(obj_ids, masks)``: the labels present (background excluded)
            and a ``[N, H, W]`` uint8 tensor with one 0/1 mask per label.
        """
        # Start from zeros so that any pixel value other than the two marker
        # colours is background instead of leaking through as an object id.
        label_map = torch.zeros_like(mask[0])
        label_map[mask[0] == cls.MASK_VALUE] = cls.KAL_LABEL
        label_map[mask[1] == cls.MASK_VALUE] = cls.JINSE_LABEL

        obj_ids = torch.unique(label_map)
        # Drop background by value, not by position: a mask without any
        # background pixel must not lose its first real label.
        obj_ids = obj_ids[obj_ids != 0]

        # [N, 1, 1] ids broadcast against the [1, H, W] label map -> [N, H, W].
        masks = (label_map[None] == obj_ids[:, None, None]).to(dtype=torch.uint8)
        return obj_ids, masks

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair for sample ``idx``.

        ``target`` is a dict with the keys ``boxes``, ``masks``, ``labels``,
        ``image_id``, ``area`` and ``iscrowd``.
        """
        img = read_image(os.path.join(self.image_dir, self.image_file_list[idx]))
        mask = read_image(os.path.join(self.mask_dir, self.mask_file_list[idx]))

        obj_ids, masks = self._split_instances(mask)
        num_objs = len(obj_ids)

        # Boxes are XYXY: columns are x_min, y_min, x_max, y_max.
        boxes = masks_to_boxes(masks)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # Suppose all instances are not crowd.
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        # Wrap the image as a TVTensor.
        img = tv_tensors.Image(img)

        target = {
            "boxes": tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img)),
            "masks": tv_tensors.Mask(masks),
            "labels": obj_ids.to(dtype=torch.int64),
            "image_id": idx,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

In [None]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes):
    """Build a Mask R-CNN whose box and mask heads predict ``num_classes`` classes.

    Args:
        num_classes: Number of output classes for both replaced heads.

    Returns:
        The ``maskrcnn_resnet50_fpn`` model loaded with its default weights
        and with fresh box and mask predictors.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
    heads = model.roi_heads

    # Swap the box classifier for one sized to our classes, reusing the
    # feature width expected by the existing head.
    box_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_features, num_classes)

    # Same for the mask head; 256 is the hidden layer width.
    mask_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(mask_channels, 256, num_classes)

    return model

In [None]:
# Provide transformations

from torchvision.transforms import v2 as T
from torchvision.transforms.functional import InterpolationMode

def get_transform(train):
    """Compose the preprocessing pipeline for training or evaluation.

    Args:
        train: When truthy, a random horizontal flip (p=0.5) is applied first.

    Returns:
        A ``T.Compose`` that optionally flips, converts to scaled float and
        finally unwraps to pure tensors.
    """
    steps = [T.RandomHorizontalFlip(0.5)] if train else []
    steps += [
        T.ToDtype(torch.float, scale=True),
        T.ToPureTensor(),
    ]
    return T.Compose(steps)

In [None]:
# Test functionality before actual training and evaluating

import utils


# Stock detector used only to check that the dataset and loader produce usable batches.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
dataset = KalAndJinseDataset(
    'Data/KalAndJinseIdentifying/Images',
    'Data/KalAndJinseIdentifying/SegmentationClass',
    get_transform(train=True),
)
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=4, collate_fn=utils.collate_fn
)

# Training-style call: one batch of images plus their targets.
images, targets = next(iter(data_loader))
images = list(images)
targets = [dict(t) for t in targets]
output = model(images, targets)
print(output)

# Inference-style call: eval mode, images only.
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)
print(predictions[0])

{'loss_classifier': tensor(0.3232, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.0798, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0037, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0014, grad_fn=<DivBackward0>)}
{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>)}


In [None]:
# Train the Kal & Jinse model and evaluate it on a held-out test split.
from engine import train_one_epoch, evaluate

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has 3 classes only - background, kal, jinse
num_classes = 3
# Two views of the same files: with augmentation for training, without for testing
dataset = KalAndJinseDataset('Data/KalAndJinseIdentifying/Images', 'Data/KalAndJinseIdentifying/SegmentationClass', get_transform(train=True))
dataset_test = KalAndJinseDataset('Data/KalAndJinseIdentifying/Images', 'Data/KalAndJinseIdentifying/SegmentationClass', get_transform(train=False))

# split the dataset in train and test set (the last 15 shuffled indices form the test set)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-15])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-15:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    num_workers=4,
    collate_fn=utils.collate_fn
)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=utils.collate_fn
)

# Build the Mask R-CNN with our 3-class heads. Without this the cell would
# train whatever `model` an earlier cell left behind (the stock Faster R-CNN
# from the smoke test), which has no mask output and whose weights do not
# match the model that is later re-created for loading.
model = get_model_instance_segmentation(num_classes)

# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005
)

# and a learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)

# train for 30 epochs
num_epochs = 30

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

print("Training Completed")

In [None]:
import matplotlib.pyplot as plt

from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks


# Run the trained model on one photo and draw its confident detections.
image = read_image("Data/KalAndJinseIdentifying/Images/PXL_20220924_193720820.jpg")
eval_transform = get_transform(train=False)

model.eval()
with torch.no_grad():
    x = eval_transform(image)
    # convert RGBA -> RGB and move to device
    x = x[:3, ...].to(device)
    predictions = model([x, ])
    # Bring the predictions back to the CPU, where `image` lives, so the
    # drawing utilities below never mix tensors from different devices.
    pred = {key: value.cpu() for key, value in predictions[0].items()}

good_score_mask = pred["scores"] > 0.5

# Rescale the picture to the full 0-255 range as uint8 for drawing.
image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8)
image = image[:3, ...]

# Class id -> drawing colour (1 = Kal, 2 = Jinse).
dog_colors = ((1, "yellow"), (2, "red"))

# First pass: boxes for both dogs.
output_image = image
for class_id, color in dog_colors:
    keep = good_score_mask & (pred["labels"] == class_id)
    # Skip a dog with no confident detection; there is nothing to draw.
    if not keep.any():
        continue
    captions = [f"{label}: {score:.3f}" for label, score in zip(pred["labels"][keep], pred["scores"][keep])]
    output_image = draw_bounding_boxes(output_image, pred["boxes"][keep].long(), captions, colors=color)

# Second pass: masks are blended on top of the boxes, as before.
for class_id, color in dog_colors:
    keep = good_score_mask & (pred["labels"] == class_id)
    if not keep.any():
        continue
    # Threshold the per-pixel scores at 0.5 and drop the channel axis.
    dog_masks = (pred["masks"][keep] > 0.5).squeeze(1)
    output_image = draw_segmentation_masks(output_image, dog_masks, alpha=0.5, colors=color)

plt.figure(figsize=(12, 12))
plt.imshow(output_image.permute(1, 2, 0))

In [None]:
import os
import torch

# Create the target folder first; saving fails if the directory is missing.
model_path = "SavedModels/KalAndJinseIdentifier/KalAndJinseIdentifierModel"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [None]:
# Stream Webcamera now to identify in real time. Before putting it on RC car.

import cv2
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# get the model using our helper function
model = get_model_instance_segmentation(3)

# move model to the right device
model.to(device)
# map_location lets weights that were saved on a GPU machine load on a
# CPU-only one (and vice versa).
model.load_state_dict(
    torch.load("SavedModels/KalAndJinseIdentifier/KalAndJinseIdentifierModel", map_location=device)
)

# These do not change between frames, so set them up once outside the loop.
model.eval()
eval_transform = get_transform(train=False)

# Class id -> drawing colour (1 = Kal, 2 = Jinse).
dog_colors = ((1, "yellow"), (2, "red"))

video_capture = cv2.VideoCapture(0)

if not video_capture.isOpened():
    raise IOError("Cannot open webcam")

try:
    while True:
        ret, frame = video_capture.read()
        if not ret:
            break

        ## Process image with model
        # Convert from BGR (OpenCV format) to RGB (PyTorch format)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Convert from [H, W, C] to [C, H, W] tensor
        image = torch.from_numpy(frame).permute(2, 0, 1)

        with torch.no_grad():
            x = eval_transform(image)
            # keep at most 3 channels and move to device
            x = x[:3, ...].to(device)
            predictions = model([x, ])
            # Bring predictions back to the CPU, where the frame lives, so
            # drawing never mixes tensors from different devices.
            pred = {key: value.cpu() for key, value in predictions[0].items()}

        good_score_mask = pred["scores"] > 0.8

        # Stretch the frame to the full 0-255 range. A uniform frame (e.g. a
        # covered lens) has max == min and would divide by zero, so it is
        # left untouched.
        low, high = image.min(), image.max()
        if high > low:
            image = (255.0 * (image - low) / (high - low)).to(torch.uint8)
        image = image[:3, ...]

        # First pass: boxes for both dogs.
        output_image = image
        for class_id, color in dog_colors:
            keep = good_score_mask & (pred["labels"] == class_id)
            # Skip a dog with no confident detection; there is nothing to draw.
            if not keep.any():
                continue
            captions = [f"{label}: {score:.3f}" for label, score in zip(pred["labels"][keep], pred["scores"][keep])]
            output_image = draw_bounding_boxes(output_image, pred["boxes"][keep].long(), captions, colors=color)

        # Second pass: masks are blended on top of the boxes.
        for class_id, color in dog_colors:
            keep = good_score_mask & (pred["labels"] == class_id)
            if not keep.any():
                continue
            dog_masks = (pred["masks"][keep] > 0.5).squeeze(1)
            output_image = draw_segmentation_masks(output_image, dog_masks, alpha=0.5, colors=color)

        # Convert output_image back to cv2 format
        output_frame = output_image.permute(1, 2, 0).numpy()
        output_frame = cv2.cvtColor(output_frame, cv2.COLOR_RGB2BGR)

        cv2.imshow('Detect Jinse & Kal', output_frame)
        # Mask to the low byte so the comparison with 'q' is reliable.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a frame fails.
    video_capture.release()
    cv2.destroyAllWindows()