In [1]:
import os
import cv2
import torch
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.autograd import Variable
from torchvision.models import vgg16
from scipy.misc import imread, imresize

### GPU usage

In [2]:
# Restrict CUDA to the device with index 3. This has to be set before CUDA is
# first initialised in this process to take effect.
# NOTE(review): the index is specific to the original machine — adjust as needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# Flag consulted by later cells to decide whether the model and the input
# tensor are moved to the GPU.
use_gpu = torch.cuda.is_available()

In [3]:
# Text file with the class names, one per line (each line becomes one entry of CLASSES).
class_file = 'routine_generate_vot2017_train/vot2017/list_20classes.txt'

In [4]:
# Read the class names, one per line, stripping only the trailing newline.
# The context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open(class_file) as f:
    CLASSES = [line.rstrip('\n') for line in f]

In [5]:
# Number of class names loaded; the hyperparameter cell uses the same value as C.
len(CLASSES)

20

### set hyperparameters

In [6]:
# NOTE(review): img_size is not referenced in the visible cells; make_predict
# hard-codes 224 for the resize — presumably the same network input size.
img_size = 224
# Grid size: decoder views the prediction as an S x S grid of cells.
S = 14
# Boxes per grid cell; each box occupies 5 consecutive values in a cell.
B = 2
# Number of class scores per grid cell.
C = len(CLASSES)
# Minimum box confidence for a grid cell to be reported by decoder.
threshold = 0.5
# Width of the hidden Linear layer in the classifier head.
# (sic: "fatures" — the misspelt name is referenced where the model is built.)
n_fatures = 1000

### load model with trained parameters

In [7]:
# Rebuild the architecture the checkpoint expects: a VGG16 backbone whose
# classifier is replaced by a two-layer head emitting (B*5+C) values for each
# of the S*S grid cells, squashed to [0, 1] by the final Sigmoid.
model = vgg16(pretrained=False)
model.classifier = nn.Sequential(
    nn.Linear(512 * 7 * 7, n_fatures),
    nn.LeakyReLU(0.1, inplace=True),
    nn.Dropout(),
    nn.Linear(n_fatures, (B*5+C) * S * S),
    nn.Sigmoid(),
)
# map_location keeps every stored tensor on the CPU while loading, so a
# checkpoint saved from GPU tensors can still be read when CUDA is unavailable
# (the use_gpu == False path). The model is moved to the GPU afterwards.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load(
        './results/model_200iters_S14_1000feas_vot20classes_fixlossbug.pth',
        map_location=lambda storage, loc: storage,
    )
)
# Inference mode: disables Dropout in the classifier head.
model.eval()
if use_gpu:
    model.cuda()

### decode the prediction tensor

In [8]:
def decoder(pred, S, B, C, threshold):
    """
    Decode a raw network prediction into per-cell detections.

    The prediction is viewed as an S x S grid in which every cell holds
    B * 5 + C values: B boxes of five values each (four box values followed by
    a confidence at offset 4), then the class scores. For each cell only the
    most confident of its B boxes is considered, and the cell is reported when
    that confidence is >= threshold.

    :param pred: (tensor) CPU tensor with 1 * S * S * (B*5+C) elements
    :param S: (int) grid size
    :param B: (int) number of boxes per grid cell
    :param C: (int) number of class scores per grid cell
    :param threshold: (float) minimum box confidence for a cell to be kept
    :return: tuple (labels, probs, boxes) of numpy arrays for the N kept cells:
             labels (N,) class indices, probs (N,) box confidences and
             boxes (N, 4) box values [x, y, w, h]. boxes is always 2-D, with
             shape (0, 4) when nothing passes the threshold.
    """
    n_elements = B * 5 + C
    pred = pred.data
    pred = pred.squeeze(0)
    pred = pred.view(S, S, -1)  # (S, S, B*5+C)

    # The confidence of box b sits at channel 5*b+4; stack them to (S, S, B).
    contain = torch.cat(
        [pred[:, :, 5 * b + 4].unsqueeze(2) for b in range(B)], dim=2)

    # Best box per cell: its confidence and its index among the B boxes.
    max_conf, max_indices = contain.max(dim=2)  # both (S, S)
    mask = max_conf >= threshold  # cells whose best box is confident enough
    maxbbox_index = max_indices[mask]  # winning box index of each kept cell

    # Broadcast the cell mask over the channel axis to pull out the complete
    # prediction vector of every kept cell -> (N, B*5+C).
    max_tensor_indices = mask.unsqueeze(-1).repeat(1, 1, n_elements)
    max_tensor = pred[max_tensor_indices].view(-1, n_elements)
    max_tensor = max_tensor.numpy()

    # For every kept cell take the four box values of its winning box.
    boxes = []
    for i in range(len(max_tensor)):
        start = int(maxbbox_index[i]) * 5
        boxes.append(max_tensor[i, start:start + 4])
    # reshape keeps the result (N, 4) even for N == 0, so callers can slice
    # columns without special-casing "no detections".
    boxes = np.array(boxes, dtype=max_tensor.dtype).reshape(-1, 4)
    probs = max_conf[mask].numpy()  # confidences of the kept boxes

    # The class scores follow the B boxes; the arg-max is the predicted class.
    labels = max_tensor[:, B*5:].argmax(axis=1)
    print(labels)  # kept: the notebook's recorded output includes this line
    return labels, probs, boxes

### make prediction function

In [9]:
def make_predict(model, image_name, S, B, C, threshold, root_path, use_gpu):
    """
    Run the detector on one image and return its detections in pixel coordinates.

    The image is read from root_path/image_name, resized to 224 x 224, passed
    through the model, and the prediction is decoded with decoder(). The class
    name and confidence of every detection are printed.

    :param model: network called on a 1 x 3 x 224 x 224 input
    :param image_name: (str) file name of the image inside root_path
    :param S: (int) grid size, forwarded to decoder
    :param B: (int) boxes per grid cell, forwarded to decoder
    :param C: (int) number of classes, forwarded to decoder
    :param threshold: (float) confidence threshold, forwarded to decoder
    :param root_path: (str) directory containing the image
    :param use_gpu: (bool) move the input tensor to the GPU before the forward pass
    :return: list with one entry per detection,
             [(x1, y1), (x2, y2), class_name, image_name, prob];
             empty when nothing passes the threshold
    """
    result = []
    # os.path.join matches how the plotting cell builds the same path and
    # does not depend on root_path ending with a separator.
    image = imread(os.path.join(root_path, image_name))
    h, w, _ = image.shape
    img = imresize(image, (224, 224))
    transform = transforms.Compose([transforms.ToTensor(), ])
    img = transform(img)
    # Add the batch dimension.
    # NOTE(review): `volatile=True` is the pre-0.4 PyTorch way of disabling
    # autograd; newer releases ignore it with a warning. Confirm the target
    # PyTorch version before switching to `torch.no_grad()`.
    img = Variable(img[None, :, :, :], volatile=True)
    if use_gpu:
        img = img.cuda()
    pred = model(img)  # 1 x (S*S*(B*5+C))
    pred = pred.cpu()  # decoder converts to numpy, which needs CPU tensors
    labels, probs, boxes = decoder(pred, S, B, C, threshold)
    for index, item in enumerate(labels):
        print(CLASSES[item])
        print(probs[index])
        print('')

    # No detections: return early instead of slicing columns of an empty array.
    if len(boxes) == 0:
        return result

    # Boxes are (x, y, w, h); the size columns are squared before use —
    # presumably the network predicts square roots of the normalised width
    # and height (TODO confirm against the training code).
    half_size = 0.5 * boxes[:, 2:] ** 2
    top_left = boxes[:, :2] - half_size
    bottom_right = boxes[:, :2] + half_size
    for i in range(len(boxes)):
        # Scale the normalised corners back to the original image size.
        x1 = int(top_left[i, 0] * w)
        x2 = int(bottom_right[i, 0] * w)
        y1 = int(top_left[i, 1] * h)
        y2 = int(bottom_right[i, 1] * h)
        result.append([(x1, y1), (x2, y2), CLASSES[labels[i]], image_name, probs[i]])
    return result

### dataset and file folder

In [10]:
# Directory of the first class's images. The trailing '/' matters because
# make_predict builds the file path as root_path + image_name.
root_path = './routine_generate_vot2017_train/vot2017/' + CLASSES[0] + '/'
# File name of the image to run the detector on.
image_name = '00000080.jpg'

In [11]:
# Display the resolved image directory as a sanity check.
root_path

'./routine_generate_vot2017_train/vot2017/ants1/'

In [12]:
# Run the detector on the selected image; make_predict prints the decoded label
# indices, then the class name and confidence of each detection.
result = make_predict(model, image_name, S=S, B=B, C=C, threshold=threshold, root_path=root_path, use_gpu=use_gpu)

[0]
ants1
0.969752



### plot the prediction on image

In [13]:
image_path = os.path.join(root_path, image_name)
image = cv2.imread(image_path)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising; fail here with a clear message rather than inside cvtColor.
if image is None:
    raise IOError('could not read image: ' + image_path)
# OpenCV loads images in BGR channel order; convert to RGB.
image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)

In [14]:
# Draw every detection onto `image`: a 2-px box between the two corner points
# and the class name at the top-left corner. `image` is in RGB order at this
# point, so (0,255,0) is green and (0,0,255) is blue.
for left_up,right_bottom,class_name,_,prob in result:
    cv2.rectangle(image,left_up,right_bottom,(0,255,0),2)
    cv2.putText(image,class_name,left_up,cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),1,cv2.LINE_AA)
    print(prob)

0.969752


In [15]:
# Inspect the detections: [(x1, y1), (x2, y2), class_name, image_name, prob] per entry.
result

[[(550, 736), (596, 800), 'ants1', '00000080.jpg', 0.96975213]]

### save prediction image

In [16]:
# `image` is held in RGB order (it was converted after loading), but
# cv2.imwrite interprets the array as BGR. Convert back before saving so the
# written file does not have its red and blue channels swapped.
cv2.imwrite('result_vot_ant1.jpg', cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

True