In [1]:
# Load Model & Dataset
'''Flickr 30k Grounding with PyTorch.'''
from __future__ import print_function
from FlickrDataset2 import FlickrDataset2
from tensorboardX import SummaryWriter
import matplotlib.patches as patches
from torch.autograd import Variable
from torchvision import transforms
import matplotlib.pyplot as plt
from skimage.morphology import *

from Model2 import Model2
from net_util import *
from parser import *
import statistics
import math
import cv2
%matplotlib inline  
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

def retrieve_bboxes(att_map, bboxes):
    # compute the mean value of attention map
    mean = np.mean(att_map)
    # transform to 0-255 scale image
    test = (att_map * 255).astype('uint8')
    # threshold set to 3 times of mean value
    ret,thresh = cv2.threshold(test,round(mean*3*255), 255, 0)
    # contour detection
    im2, cts, hierarchy = cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
    contours = []
    # storing all countors and exlude area less then 4 pixels
    for i in range(0,len(cts)): 
        x, y, w, h = cv2.boundingRect(cts[i]) 
        # expanding the detected region to 120% for sub-window search
        x -= 0.5*w
        y -= 0.5*h
        w += 1.*w
        h += 1.*h
        if w>2 and h>2:
            contours.append(np.clip([round(x), round(y), round(x+w), round(y+h)], 0, att_map.shape[0]))
    # selecting all bboxes inside the contours
    # instead of picking bboxes out, we store the desired bboxes index
    target = np.zeros(500,)
    feat_bboxes = []
    count = 0
    num_boxes = 0
    # selected boxes index
    target = np.zeros(bboxes.shape[0],)
    for box in bboxes:
#         resized_box = box/(256/att_map.shape[0])
        x_min, y_min, x_max, y_max = box.cpu().data.numpy()
        for contour in contours:
            contour2 = contour * (1024/att_map.shape[0])
            # check if box is inside the contour
#             print(x_min, y_min, w, h)
            if x_min >= contour2[0] and y_min >= contour2[1] and x_max <= contour2[2] and y_max <= contour2[3]:
                target[count] = 1
                num_boxes += 1
                break
        count += 1
    return target, num_boxes



In [12]:
# Load Pretrained Model.
class opts():
    backbone_model = './models/mrcnn.pth'
    class_num = 81
    batch_size = 64
    resume = './checkpoint/Model2_flickr_P3-P4-P5_att.pth'

opts = opts()
model = Model2(opts)
# Load Back bone Module
state_dict = torch.load(opts.resume)['state_dict']
new_params = model.state_dict()
new_params.update(state_dict)
model.load_state_dict(new_params)
model.cuda()
model.eval()
print('Model loaded')

# Load dataset and images
print("Preparing Flickr data set...")
size = (1024, 1024)
feat_size = (128, 128)
transform = transforms.Compose([transforms.Resize(size), transforms.ToTensor()])
data_set = FlickrDataset2('/media/drive1/Data/flickr30k/flickr30k_images/', feat_size, transform)

count = 500
check_num = 0
check_ratio = 0.5

# Evaluate on first 1000 images
for index in range(count):
    # attention map
    (img, category, (one_hot, label), textual_emb, phrase, mask, line, filename, size, all_one_hot, att_emb, att_label) = data_set[index]
    model.visual_net.config.IMAGES_PER_GPU = 1
    images = Variable(img.view(1, 3, 1024, 1024)).cuda()
    # One-hot input
    one_hot = Variable(torch.from_numpy(one_hot.reshape(1,-1))).cuda().float()
    category_p3, att_map3, category_p4, att_map4, category_p5, att_map5, rpn_rois, visual_cls = model(images, one_hot, label)
    
    #Evaluate on last attention maps
    att = (torch.nn.functional.avg_pool2d(att_map3,2)[0,0].data+att_map4[0,0].data+att_map5[0,0].data).cpu().numpy()
#     att = att_map5[0,0].data.cpu().numpy()
    target, num_boxes = retrieve_bboxes(att/3>0.35, rpn_rois[0])
    
    # Retrieve largest bbox
    temp = 0
    max_area = 0
    mx1, my1, mx2, my2 = 0, 0, 0, 0
    for box in rpn_rois[0]:
        if target[temp] == 1.:
            x1, y1, x2, y2 = box.cpu().data.numpy()
            # pick max window
            area = (x2-x1) * (y2-y1)
            if area > max_area:
                mx1, my1, mx2, my2 = x1, y1, x2, y2
                max_area = area
        temp += 1
    
    # Check if hit
    mx1, my1, mx2, my2 = mx1/8, my1/8, mx2/8, my2/8
    pixel_num = 0
    for i in range(mask.shape[0]):
        for j in range(mask.shape[1]):
            if mx1<i<mx2 and my1<j<my2 and mask[i,j] == 1:
                pixel_num += 1
    area = (my2-my1)*(mx2-mx1)
    if area != 0:
        ratio = pixel_num/area
    else:
        ratio = 0
    if ratio > check_ratio:
        check_num += 1
    
    # print instant ratio
    print(index, ratio, check_num/(index+1))
print('mAP:', check_num/count)

==> Building backbone model...
<config.Config object at 0x7f2c2b2fae80>
Model loaded
Preparing Flickr data set...
Loading dictionary...
Dictionary loaded.


  if ran not in attributes:
  visual_cls = self.softmax(self.fc(visual_feat))


0 0.8247603227815066 1.0
1 0.9885558462299218 1.0
2 0.0 0.6666666666666666
3 0 0.5
4 0 0.4
5 0.9069525430136354 0.5
6 0 0.42857142857142855
7 0 0.375
8 1.0240747936521688 0.4444444444444444
9 0 0.4
10 0.0 0.36363636363636365
11 0 0.3333333333333333
12 0.0 0.3076923076923077
13 0.7714807489343385 0.35714285714285715
14 0 0.3333333333333333
15 0.0 0.3125
16 0 0.29411764705882354
17 0.0 0.2777777777777778
18 0 0.2631578947368421
19 1.045986729691761 0.3
20 0.21196725912471895 0.2857142857142857
21 0 0.2727272727272727
22 1.0083163277895155 0.30434782608695654
23 1.009653939952486 0.3333333333333333
24 0.9023322241024557 0.36
25 0.0 0.34615384615384615
26 0 0.3333333333333333
27 0.0 0.32142857142857145
28 0 0.3103448275862069
29 0 0.3
30 0.6499673663177651 0.3225806451612903
31 0.881666281579521 0.34375
32 0 0.3333333333333333
33 0 0.3235294117647059
34 0.0 0.3142857142857143
35 1.0256423889786934 0.3333333333333333
36 0.0 0.32432432432432434
37 0.9814277955275063 0.34210526315789475
38 0.

In [None]:
# Attention pixel evaluation
