In [1]:
from PIL import Image
import torchvision
import torchvision.transforms as transforms
import cv2
import numpy as np
import torch

COCO_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

COLORS = np.random.uniform(0, 255, size=(len(COCO_NAMES), 3))

TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
])

def get_model(device):
    model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(
        #weights='DEFAULT',
        weights_backbone='DEFAULT', 
        trainable_backbone_layers=0,
        num_classes=2
    )
    model = model.eval().to(device)
    return model

def predict(image, model, device, detection_threshold):
    """
    Predict the output of an image after forward pass through
    the model and return the bounding boxes, class names, and 
    class labels. 
    """
    # transform the image to tensor
    image = TRANSFORM(image).to(device)
    # add a batch dimension
    image = image.unsqueeze(0) 
    # get the predictions on the image
    with torch.no_grad():
        outputs = model(image) 
    # get score for all the predicted objects
    pred_scores = outputs[0]['scores'].detach().cpu().numpy()
    # get all the predicted bounding boxes
    pred_bboxes = outputs[0]['boxes'].detach().cpu().numpy()
    # get boxes above the threshold score
    boxes = pred_bboxes[pred_scores >= detection_threshold].astype(np.int32)

    labels = outputs[0]['labels'][:len(boxes)]
    # get all the predicited class names
    pred_classes = [COCO_NAMES[i] for i in labels.cpu().numpy()]
    return boxes, pred_classes, labels

def draw_boxes(boxes, classes, labels, image):
    image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
    for i, box in enumerate(boxes):
        color = COLORS[labels[i]]
        cv2.rectangle(
            image,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, 2
        )
        cv2.putText(image, classes[i], (int(box[0]), int(box[1]-5)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2, 
                    lineType=cv2.LINE_AA)
    return image

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = get_model(device)
print([p.size() for p in model.parameters()])
image = Image.open('i2l-dataset/ball/ball_01.jpg')

boxes, classes, labels = predict(image, model, device, detection_threshold=0.1)
print([COCO_NAMES[label] for label in labels])
image = draw_boxes(boxes, classes, labels, image)

save_name = f'ssd_out-test'
#cv2.imshow('Image', image)
cv2.imwrite(f'{save_name}.jpg', image)
#cv2.waitKey(0)

In [4]:
# YOLOS
from transformers import YolosImageProcessor, YolosForObjectDetection
from PIL import Image
import torch
import requests

#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#image = Image.open(requests.get(url, stream=True).raw)

model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")


  from .autonotebook import tqdm as notebook_tqdm
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [5]:
import torch.nn as nn

M = 3
last_layer = nn.Linear(in_features=model.class_labels_classifier.layers[2].in_features, out_features=M)
model.class_labels_classifier.layers[2] = last_layer


In [9]:
from glob import glob
import cv2
import numpy as np

COLORS = np.random.uniform(0, 255, size=(M, 3))

def draw_boxes(box, label, image):
    image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
    color = COLORS[0]
    cv2.rectangle(
        image,
        (int(box[0]), int(box[1])),
        (int(box[2]), int(box[3])),
        color, 2
    )
    cv2.putText(image, label, (int(box[0]), int(box[1]-5)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2, 
                lineType=cv2.LINE_AA)
    return image

labels_test = {
    0: 'ball',
    1: 'mug',
    2: 'pen'
}

for true_label in ('ball', 'mug', 'pen'):
    for fn in glob(f'i2l-dataset/{true_label}/*.jpg'):
        image = Image.open(fn)
        print(f'true label: {true_label}')

        inputs = image_processor(images=image, return_tensors="pt")
        outputs = model(**inputs)

        logits = outputs.logits
        bboxes = outputs.pred_boxes

        # print results
        target_sizes = torch.tensor([image.size[::-1]])
        
        results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]

        for score, label, box in zip(results["scores"][:10], results["labels"], results["boxes"]):
            box = [round(i, 2) for i in box.tolist()]
            print(
                f"Detected {labels_test[label.item()]} with confidence "
                f"{round(score.item(), 3)} at location {box}"
            )

            image = draw_boxes(box, labels_test[label.item()], image)

        #cv2.imshow('Image', image)
        #cv2.waitKey(0)
        cv2.imwrite(f'test-yolow-true_{fn.split("/")[-1]}', image)
        #input('next?')

true label: ball
Detected mug with confidence 0.734 at location [28.31, 13.28, 168.5, 87.82]
Detected mug with confidence 0.706 at location [10.33, 12.57, 173.29, 221.33]
Detected mug with confidence 0.677 at location [11.34, 16.24, 128.02, 200.25]
Detected mug with confidence 0.661 at location [12.07, 162.79, 73.07, 211.01]
Detected mug with confidence 0.678 at location [72.37, 90.88, 173.91, 210.64]
Detected mug with confidence 0.715 at location [14.49, 13.76, 216.98, 204.35]
Detected mug with confidence 0.693 at location [0.06, 10.12, 224.84, 174.39]
Detected mug with confidence 0.647 at location [129.42, 68.01, 209.8, 193.32]
Detected mug with confidence 0.547 at location [15.61, 155.53, 213.48, 221.07]
Detected mug with confidence 0.675 at location [33.88, 17.47, 165.23, 150.71]
true label: ball
Detected mug with confidence 0.703 at location [86.6, 89.59, 329.41, 269.46]
Detected mug with confidence 0.66 at location [8.34, 119.12, 250.41, 582.67]
Detected mug with confidence 0.545

ValueError: Unsupported number of image dimensions: 2

In [4]:
from torch.optim import AdamW
from time import time
from random import shuffle

X = []
Y_true = []

for ob in ('ball', 'mug', 'pen'):
    for i in (1, 3, 5):
        image = Image.open(f'i2l-dataset/{ob}/{ob}_0{i}.jpg')
        x = image_processor(images=image, return_tensors="pt")['pixel_values']
        X.append(x)
        y = torch.tensor([1., 0., 0.]) if ob == 'ball' else torch.tensor([0., 1., 0.]) if ob == 'mug' else torch.tensor([0., 0., 1.])
        Y_true.append(y)

optimizer = AdamW(last_layer.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

inds = list(range(len(X)))

for ep in range(10):
    shuffle(inds)
    for ind in inds:
        x = X[ind]
        y_true = Y_true[ind]

        optimizer.zero_grad()

        t0 = time()
        outputs = model(**{'pixel_values': x})
        print(f'inference time: {time()-t0}')

        y_pred = outputs.logits[0][0]

        loss = criterion(y_pred, y_true)

        loss.backward()

        optimizer.step()

        print(f'[ep {ep}] loss: {loss.data}')

inference time: 0.566413164138794
[ep 0] loss: 1.248409390449524
inference time: 0.7442398071289062
[ep 0] loss: 0.01679835096001625
inference time: 0.3843560218811035
[ep 0] loss: 7.556079387664795
inference time: 0.35074400901794434
[ep 0] loss: 8.062366485595703
inference time: 0.5266430377960205
[ep 0] loss: 5.394925117492676
inference time: 0.3230311870574951
[ep 0] loss: 2.286015033721924
inference time: 0.3327350616455078
[ep 0] loss: 4.939010143280029
inference time: 0.32449793815612793
[ep 0] loss: 0.832539439201355
inference time: 0.3219747543334961
[ep 0] loss: 1.7695698738098145
inference time: 0.3338611125946045
[ep 1] loss: 2.3216917514801025
inference time: 0.5902340412139893
[ep 1] loss: 0.23371371626853943
inference time: 0.35471510887145996
[ep 1] loss: 0.05608527734875679
inference time: 0.3724658489227295
[ep 1] loss: 0.04904178902506828
inference time: 0.43518614768981934
[ep 1] loss: 2.039592742919922
inference time: 0.3842461109161377
[ep 1] loss: 0.0246119890362