In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import itertools
import json
import os
import pprint
import tarfile
import zipfile
from io import BytesIO

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import requests
import scipy.io
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from IPython.display import YouTubeVideo
from PIL import Image

## History

Object detection shares a similar history to image recognition.
They both currently use convolutional neural networks and both require large image datasets.
With convolutional neural networks, they share the common origins of the Neocognitron, LeNet, and AlexNet, but object detection was also inspired by different methods that were used before the current era of neural networks.

There were two popular methods in the 2000s:


*   Lowe's scale-invariant feature transform (SIFT) method
*   Dalal and Triggs' histogram of oriented gradients (HOG)

They both use some type of convolutional features: a Gaussian is used in SIFT and a Gaussian or a Sobel operator are used in HOG.

The current methods are primarily convolutional neural network based.
They can be roughly classified into two types: single stage and two stage.
A single stage detector predicts an object and a bounding box for that object.
Two stage object detection extracts a region proposal (potential bounding boxes), then classifies these regions, e.g. does it have a car, a dog, a person, etc.

We'll be focusing on single stage detectors, in particular, the YOLO (You Only Look Once) class of models.


# Lakota AI Code Camp Lesson 03: Introduction to Object Detection

Object detection is a subfield of computer vision.
It is focused on detecting if an object is in an image and if it is in the image, where it's located.
Let's look at an example.

The following is the label of the image:

In [None]:
# Hand-written detection result for the kitchen image. Each entry of "objects"
# carries a "rectangle" whose x/y/w/h are used by the plotting cell below as the
# box's anchor corner, width and height, plus a label and a confidence score.
object_dict = {
    "objects": [
        {
            "rectangle": {"x": 730, "y": 66, "w": 135, "h": 85},
            "object": "kitchen appliance",
            "confidence": 0.501,
        },
        {
            "rectangle": {"x": 523, "y": 377, "w": 185, "h": 46},
            "object": "computer keyboard",
            "confidence": 0.51,
        },
        {
            "rectangle": {"x": 471, "y": 218, "w": 289, "h": 226},
            "object": "Laptop",
            "confidence": 0.85,
            "parent": {"object": "computer", "confidence": 0.851},
        },
        {
            "rectangle": {"x": 654, "y": 0, "w": 584, "h": 473},
            "object": "person",
            "confidence": 0.855,
        },
    ],
    "requestId": "25018882-a494-4e64-8196-f627a35c1135",
    "metadata": {"height": 473, "width": 1260, "format": "Jpeg"},
    "modelVersion": "2021-05-01",
}

We download an image from a website, then convert it into an array of numbers.

In [None]:
img_src = "https://learn.microsoft.com/en-us/azure/cognitive-services/computer-vision/images/windows-kitchen.jpg"

# Bound the wait and fail loudly on an HTTP error; otherwise an error page's
# body would be handed to Image.open and surface as a confusing decode error.
resp = requests.get(img_src, timeout=30)
resp.raise_for_status()

# Decode the downloaded bytes in memory (no temporary file).
img = Image.open(BytesIO(resp.content))

# Array view of the image used by the per-channel plots and the Sobel filter.
img_array = np.asarray(img)

Images are typically stored in a 3 channel format.
In our case, the channels are red, green, and blue.
Each channel is a matrix of intensities, and a pixel's color is the combination of its red, green, and blue intensities.

In [None]:
# Bare last expression so the notebook renders the PIL image.
img

In [None]:
# figsize is (width, height) in inches. The original passed (473 / 30, 1260 / 30),
# i.e. (height, width) for this landscape image, so derive both from the array.
img_h, img_w = img_array.shape[:2]
plt.figure(figsize=(img_w / 30, img_h / 30))
plt.imshow(img_array[:, :, 0], cmap='Reds')  # channel 0 = red
plt.title('Red channel')
plt.show()

In [None]:
# figsize is (width, height) in inches. The original passed (473 / 30, 1260 / 30),
# i.e. (height, width) for this landscape image, so derive both from the array.
img_h, img_w = img_array.shape[:2]
plt.figure(figsize=(img_w / 30, img_h / 30))
plt.imshow(img_array[:, :, 1], cmap='Greens')  # channel 1 = green
plt.title('Green channel')
plt.show()

In [None]:
# figsize is (width, height) in inches. The original passed (473 / 30, 1260 / 30),
# i.e. (height, width) for this landscape image, so derive both from the array.
img_h, img_w = img_array.shape[:2]
plt.figure(figsize=(img_w / 30, img_h / 30))
plt.imshow(img_array[:, :, 2], cmap='Blues')  # channel 2 = blue
plt.title('Blue channel')
plt.show()

Now, we display the label of the image on the image:

In [None]:
# figsize is (width, height) in inches; PIL's Image.size is also (width, height).
# The original passed (473 / 30, 1260 / 30), which is (height, width) for this image.
img_w, img_h = img.size
fig, ax = plt.subplots(figsize=(img_w / 30, img_h / 30))

# Display the image
ax.imshow(img)

# Outline every labelled object and write its name at the box's (x, y) corner.
# Everything goes through `ax` so the drawing does not depend on pyplot's
# notion of the "current" axes.
for detection in object_dict['objects']:
    box = detection['rectangle']
    x, y, w, h = box['x'], box['y'], box['w'], box['h']

    ax.add_patch(
        patches.Rectangle((x, y), w, h, linewidth=1, edgecolor='r', facecolor='none')
    )
    ax.annotate(detection['object'], (x, y), color='green')

plt.show()

In [None]:
# 3x3 Sobel kernels: sobel_x has opposite-signed left/right columns,
# sobel_y has opposite-signed top/bottom rows.
sobel_x = torch.tensor(
    [
        [1.0, 0.0, -1.0],
        [2.0, 0.0, -2.0],
        [1.0, 0.0, -1.0],
    ]
)

sobel_y = torch.tensor(
    [
        [1.0, 2.0, 1.0],
        [0.0, 0.0, 0.0],
        [-1.0, -2.0, -1.0],
    ]
)

In [None]:
# Expand each 3x3 kernel into a conv2d weight of shape (3, 3, 3, 3), laid out as
# (out_channels, in_channels, kH, kW), with the same kernel in every slot.
#
# The original did this in two cells that each re-assigned sobel_x/sobel_y from
# themselves, so re-running either cell raised or changed the shape. Here the
# base kernel is always recovered first (`reshape(-1, 3, 3)[0]`), which makes the
# cell safe to re-run whether the tensor is still 2-D or already expanded.
sobel_x = sobel_x.reshape(-1, 3, 3)[0].repeat(3, 3, 1, 1)

sobel_y = sobel_y.reshape(-1, 3, 3)[0].repeat(3, 3, 1, 1)

In [None]:
# Float32 tensor built from a copy of the image array (the copy keeps the tensor
# independent of the array obtained from the PIL image).
img_tensor = torch.from_numpy(img_array.copy()).float()

In [None]:
# conv2d takes (batch, channels, height, width): move the channel axis first
# and add a leading batch axis, once, then filter with each kernel.
conv_input = img_tensor.permute(2, 0, 1).unsqueeze(0)
out_img_x = F.conv2d(conv_input, sobel_x)
out_img_y = F.conv2d(conv_input, sobel_y)

In [None]:
# Combine the two filter responses into a gradient magnitude.
grad_magnitude = torch.sqrt(torch.square(out_img_x) + torch.square(out_img_y))

# Scale by the maximum, drop the batch axis and move channels last for imshow.
# This used to be four cells that each updated `out_img` in place or from itself;
# re-running the permute cell alone would shuffle the axes again. Doing it in one
# expression from `grad_magnitude` makes the cell idempotent.
out_img = (grad_magnitude / torch.max(grad_magnitude)).squeeze(0).permute(1, 2, 0)

In [None]:
# figsize is (width, height) in inches; out_img is laid out (height, width, channels)
# after the permute above. The original passed (473 / 30, 1260 / 30), i.e. (height, width).
out_h, out_w = out_img.shape[:2]
plt.figure(figsize=(out_w / 30, out_h / 30))
plt.imshow(out_img.numpy())
plt.title('Sobel gradient magnitude')
plt.show()

## Datasets

We're going to quickly go over some of the datasets.
We're going to go over them in more detail later.

There are 4 datasets that arose out of the need for high quality annotated images to train neural networks:


1.   Caltech 101
1.   Pascal VOC
1.   MS COCO
1.   Objects 365.

Caltech 101 was created in 2003 by several researchers, Fei-Fei Li, Marco Andreetto, Marc'Aurelio Ranzato, and Pietro Perona, at the California Institute of Technology. Let's look at some examples.

In [None]:
# Caltech 101 rooted at ../data (downloaded on request). With this target_type
# each sample's label is a (category, annotation) pair, in that order.
caltech101_ds = torchvision.datasets.Caltech101(
    root='../data',
    target_type=['category', 'annotation'],
    download=True,
)

In [None]:
# Show the annotation category names; later cells index this list by category id.
caltech101_ds.annotation_categories

In [None]:
# Name stored at position 43 -- presumably the "garfield" category, going by the
# variable name; TODO confirm against the list displayed above.
garfield = caltech101_ds.annotation_categories[43]

In [None]:
# Category name of the first sample: caltech101_ds[0] is (image, (category, annotation)),
# so [0][1][0] is that sample's category index.
caltech101_ds.annotation_categories[caltech101_ds[0][1][0]]

In [None]:
def show_rand_image():
    """Display one randomly chosen Caltech 101 sample with its contour.

    Reads the module-level ``caltech101_ds``. The image is drawn with the
    annotation contour on top and the annotation category name as the title.
    Returns nothing.
    """
    # Draw the index from the dataset's actual length instead of a hard-coded
    # 8677, so the function keeps working if the dataset size differs.
    num = torch.randint(low=0, high=len(caltech101_ds), size=(1, )).item()

    image, (category, annotation) = caltech101_ds[num]

    fig, ax = plt.subplots()
    ax.imshow(image)
    # NOTE(review): the contour is plotted exactly as the dataset returns it. The
    # raw-annotation cell later in this notebook adds a box_coord offset before
    # plotting the same kind of contour -- confirm whether one is needed here too.
    ax.plot(annotation[0, :], annotation[1, :])
    ax.set_title(caltech101_ds.annotation_categories[category])

    plt.show()

In [None]:
# The index is drawn at random inside the function, so each run shows a different sample.
show_rand_image()

In [None]:
# Dataset positions 5112..5145 inclusive (34 consecutive samples), named for the
# garfield category.
garfield_indices = list(range(5112, 5146))

# Random position within that list (not a dataset index by itself).
num = np.random.randint(low=0, high=len(garfield_indices))

sample_idx = garfield_indices[num]
image, label = caltech101_ds[sample_idx]

# label is (category index, contour array)
category, annotation = label

contour_x = annotation[0, :]
contour_y = annotation[1, :]

plt.imshow(image)
plt.plot(contour_x, contour_y)
plt.title(caltech101_ds.annotation_categories[category])

plt.show()

In [None]:
# Base directory of the Caltech 101 files; the cells below join it with
# "101_ObjectCategories" (images) and "Annotations" (.mat files).
root = '../data/caltech101'

In [None]:
# Class names are the entries of the image directory, in sorted order.
categories_dir = os.path.join(root, "101_ObjectCategories")
categories = sorted(os.listdir(categories_dir))
categories.remove("BACKGROUND_Google")  # this is not a real class

In [None]:
# Image-folder name -> annotation-folder name, for the classes where the two differ.
name_map = {
    "Faces": "Faces_2",
    "Faces_easy": "Faces_3",
    "Motorbikes": "Motorbikes_16",
    "airplanes": "Airplanes_Side_2",
}

In [None]:
# Same order as `categories`, with the renamed folders substituted where name_map has an entry.
annotation_categories = [name_map.get(name, name) for name in categories]

In [None]:
# Flat per-image lists built class by class: index[k] is the 1-based image number
# inside its class folder and y[k] is that class's position in `categories`.
index, y = [], []
for class_id, class_name in enumerate(categories):
    class_dir = os.path.join(root, "101_ObjectCategories", class_name)
    image_count = len(os.listdir(class_dir))
    index += range(1, image_count + 1)
    y += [class_id] * image_count

In [None]:
# Which targets the next cell collects; it recognises "category" and "annotation".
target_type = ['annotation']

In [None]:
# Load one image and its raw annotation straight from disk.
# (scipy.io is imported in the notebook's top imports cell.)
#
# NOTE(review): `num` is whatever the previous sampling cell left behind. There it
# is a position inside garfield_indices; here it indexes the whole-dataset lists
# `index`/`y` -- confirm this is the intended sample.
category_idx = y[num]
image_no = index[num]

img = Image.open(
    os.path.join(
        root,
        "101_ObjectCategories",
        categories[category_idx],
        f"image_{image_no:04d}.jpg",
    )
)

# Always load the .mat file: the cells below read `data` (e.g. data['box_coord'])
# regardless of what `target_type` contains.
data = scipy.io.loadmat(
    os.path.join(
        root,
        "Annotations",
        annotation_categories[category_idx],
        f"annotation_{image_no:04d}.mat",
    )
)

target = []
for t in target_type:
    if t == "category":
        target.append(category_idx)
    elif t == "annotation":
        target.append(data["obj_contour"])
# A single requested target is returned bare, several as a tuple.
target = tuple(target) if len(target) > 1 else target[0]

In [None]:
# Shared pretty-printer (4-space indent) for inspecting nested annotation structures.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
# Inspect everything the annotation .mat file contains, not just obj_contour.
pp.pprint(data)

In [None]:
# First row of box_coord unpacked as (y1, y2, x1, x2); the next cell uses (x1, y1)
# as the rectangle anchor and as the offset added to the contour.
y1, y2, x1, x2 = data['box_coord'][0]

In [None]:
fig, ax = plt.subplots()

# Display the image
ax.imshow(img)

# Contour points are shifted by the box's (x1, y1) corner before being drawn
contour_x = target[0, :] + x1
contour_y = target[1, :] + y1
ax.plot(contour_x, contour_y)

# Bounding box spanning (x1, y1) to (x2, y2)
bounding_box = patches.Rectangle(
    (x1, y1),
    width=x2 - x1,
    height=y2 - y1,
    linewidth=1,
    edgecolor='r',
    facecolor='none',
)
ax.add_patch(bounding_box)

plt.show()

Let's look at Pascal VOC now.

In [None]:
# download=False: nothing is fetched here, so the Pascal VOC files are expected
# to be present under ../data/ already.
pascal_voc_ds = torchvision.datasets.VOCDetection(
    root='../data/',
    download=False,
)

In [None]:
# Pick a random sample position and echo it so the same example can be looked up again.
voc_sample_count = len(pascal_voc_ds)
num = torch.randint(low=0, high=voc_sample_count, size=(1, )).item()
print(num)

In [None]:
# Each sample is an (image, target) pair; `target` is the nested annotation dict inspected below.
image, target = pascal_voc_ds[num]

In [None]:
# Full target structure for the sampled image.
pp.pprint(target)

In [None]:
# Only the 'annotation' entry; its 'object' list is what the plotting cell iterates over.
pp.pprint(target['annotation'])

In [None]:
fig, ax = plt.subplots()

# Display the image
ax.imshow(image)

annotations = target['annotation']['object']

# Turn each corner-style box (xmin, ymin, xmax, ymax; stored as strings, hence
# the int() conversion) into (x, y, w, h, name).
rect = []
for voc_object in annotations:
    bndbox = voc_object['bndbox']
    xmin, ymin, xmax, ymax = (
        int(bndbox[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax')
    )
    rect.append((xmin, ymin, xmax - xmin, ymax - ymin, voc_object['name']))

# Draw every box with its class name at the top-left corner
for x, y, w, h, obj in rect:
    outline = patches.Rectangle(
        (x, y), w, h, linewidth=1, edgecolor='r', facecolor='none'
    )
    ax.add_patch(outline)
    ax.annotate(obj, (x, y), color='green')

plt.show()

Next, we'll examine the COCO dataset.

In [None]:
# NOTE(review): re-running this cell once the clone exists makes git report an
# error, and nothing below visibly imports from the cloned directory -- confirm
# whether this step is still needed.
!git clone https://github.com/cocodataset/cocoapi/

In [None]:
# COCO val2017 images paired with their instance annotations file.
coco_ds = torchvision.datasets.CocoDetection(
    root='../data/val2017/images',
    annFile='../data/val2017/annotations/instances_val2017.json',
)

In [None]:
# Read the raw annotations file; its 'categories' list provides the id -> name
# table used below. (json is imported in the notebook's top imports cell.)
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('../data/val2017/annotations/instances_val2017.json', 'r', encoding='utf-8') as f:
    coco_cats = json.load(f)

In [None]:
# Lookup table indexed directly by COCO category id. The ids are not assumed to
# be contiguous, so unused slots keep the placeholder 0. The table is sized from
# the largest id present instead of a hard-coded 91.
coco_category_entries = coco_cats['categories']
categories = [0] * (max(c['id'] for c in coco_category_entries) + 1)

for c in coco_category_entries:
    categories[c['id']] = c['name']

In [None]:
# Random sample position; `label` is the list of annotation dicts for that image.
coco_sample_count = len(coco_ds)
num = np.random.randint(low=0, high=coco_sample_count)

image, label = coco_ds[num]

In [None]:
fig, ax = plt.subplots(figsize=(10, 20))

# Display the image
ax.imshow(image)

# Each annotation's 'bbox' is unpacked as (x, y, w, h); keep the category id alongside it
rect = []
for ann in label:
    x, y, w, h = ann['bbox']
    rect.append((x, y, w, h, ann['category_id']))

# Draw every box and label it with the category name looked up by id
for x, y, w, h, category_id in rect:
    outline = patches.Rectangle(
        (x, y), w, h, linewidth=1, edgecolor='r', facecolor='none'
    )
    ax.add_patch(outline)
    ax.annotate(categories[category_id], (x, y), color='green')

plt.show()