In [1]:
import os
import io
import tqdm
import detectron2

# import some common detectron2 utilities
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# import some common libraries
import numpy as np
import cv2
import torch

# Show the image in ipynb
from IPython.display import clear_output, Image, display
import PIL.Image
def showarray(a, fmt='jpeg'):
    a = np.uint8(np.clip(a, 0, 255))
    f = io.BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))

In [2]:
# Load VG Classes
data_path = '/scratch/gobi1/johnchen/new_git_stuff/py-bottom-up-attention/data'

vg_classes = []
with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
    for object in f.readlines():
        vg_classes.append(object.split(',')[0].lower().strip())
        
vg_attrs = []
with open(os.path.join(data_path, 'attributes_vocab.txt')) as f:
    for object in f.readlines():
        vg_attrs.append(object.split(',')[0].lower().strip())


MetadataCatalog.get("vg").thing_classes = vg_classes
MetadataCatalog.get("vg").attr_classes = vg_attrs

In [3]:
cfg = get_cfg()
cfg.merge_from_file("../configs/VG-Detection/faster_rcnn_R_101_C4_attr_caffemaxpool.yaml")
cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2
# VG Weight
cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe_attr.pkl"
predictor = DefaultPredictor(cfg)

Config '../configs/VG-Detection/faster_rcnn_R_101_C4_attr_caffemaxpool.yaml' has no VERSION. Assuming it to be compatible with latest v2.


Modifications for VG in ResNet Backbone (modeling/backbone/resnet.py):
	Using pad 0 in stem max_pool instead of pad 1.

Modifications for VG in RPN (modeling/proposal_generator/rpn.py):
	Use hidden dim 512 instead fo the same dim as Res4 (1024).

Modifications for VG in RoI heads (modeling/roi_heads/roi_heads.py):
	1. Change the stride of conv1 and shortcut in Res5.Block1 from 2 to 1.
	2. Modifying all conv2 with (padding: 1 --> 2) and (dilation: 1 --> 2).
	For more details, please check 'https://github.com/peteanderson80/bottom-up-attention/blob/master/models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt'.

Modifications for VG in RoI heads (modeling/roi_heads/fast_rcnn.py))
	Embedding: 1601 --> 256	Linear: 2304 --> 512	Linear: 512 --> 401



In [4]:
import os
from tqdm.auto import tqdm
images_path = "/scratch/gobi1/johnchen/new_git_stuff/lxmert/data/medvqa/VQA-Med-2020-Task1-VQAnswering-TrainVal-Sets/VQAMed2020-VQAnswering-TrainingSet/VQAnswering_2020_Train_images"
all_imgs = []
def process_images():
    
    for root,dirs,files in os.walk(images_path):
        for file in tqdm(files):
            im = cv2.imread(os.path.join(root,file))
            im_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            all_imgs.append(im_rgb)
#             showarray(im_rgb)

process_images()
# im = cv2.imread("/scratch/gobi1/johnchen/new_git_stuff/py-bottom-up-attention/data/synpic593.jpg")
# im_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
showarray(all_imgs[-1])

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




<IPython.core.display.Image object>

In [6]:
import pandas as pd
from collections import defaultdict

In [9]:
NUM_OBJECTS = 36

from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs, fast_rcnn_inference_single_image

def doit(raw_image):
    with torch.no_grad():
        raw_height, raw_width = raw_image.shape[:2]
        print("Original image size: ", (raw_height, raw_width))
        
        # Preprocessing
        image = predictor.transform_gen.get_transform(raw_image).apply_image(raw_image)
        print("Transformed image size: ", image.shape[:2])
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs = [{"image": image, "height": raw_height, "width": raw_width}]
        images = predictor.model.preprocess_image(inputs)
        
        # Run Backbone Res1-Res4
        features = predictor.model.backbone(images.tensor)
        
        # Generate proposals with RPN
        proposals, _ = predictor.model.proposal_generator(images, features, None)
        proposal = proposals[0]
        print('Proposal Boxes size:', proposal.proposal_boxes.tensor.shape)
        
        # Run RoI head for each proposal (RoI Pooling + Res5)
        proposal_boxes = [x.proposal_boxes for x in proposals]
        features = [features[f] for f in predictor.model.roi_heads.in_features]
        box_features = predictor.model.roi_heads._shared_roi_transform(
            features, proposal_boxes
        )
        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
        print('Pooled features size:', feature_pooled.shape)
        
        # Predict classes and boxes for each proposal.
        pred_class_logits, pred_attr_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(feature_pooled)
        outputs = FastRCNNOutputs(
            predictor.model.roi_heads.box2box_transform,
            pred_class_logits,
            pred_proposal_deltas,
            proposals,
            predictor.model.roi_heads.smooth_l1_beta,
        )
        probs = outputs.predict_probs()[0]
        boxes = outputs.predict_boxes()[0]
        
        attr_prob = pred_attr_logits[..., :-1].softmax(-1)
        max_attr_prob, max_attr_label = attr_prob.max(-1)
        
        # Note: BUTD uses raw RoI predictions,
        #       we use the predicted boxes instead.
        # boxes = proposal_boxes[0].tensor    
        
        # NMS
        for nms_thresh in np.arange(0.5, 1.0, 0.1):
            instances, ids = fast_rcnn_inference_single_image(
                boxes, probs, image.shape[1:], 
                score_thresh=0.2, nms_thresh=nms_thresh, topk_per_image=NUM_OBJECTS
            )
            if len(ids) == NUM_OBJECTS:
                break
                
        instances = detector_postprocess(instances, raw_height, raw_width)
        roi_features = feature_pooled[ids].detach()
        max_attr_prob = max_attr_prob[ids].detach()
        max_attr_label = max_attr_label[ids].detach()
        instances.attr_scores = max_attr_prob
        instances.attr_classes = max_attr_label
        
        print(instances)
        
        return instances, roi_features
LIMIT_EXAMPLES = 10
df = defaultdict(list)
for i,cv2_img in enumerate(tqdm(all_imgs)):
    if i > LIMIT_EXAMPLES:
        break
    instances, features = doit(cv2_img)

    df["objects_id"].append(instances.pred_classes.cpu().numpy())
    df["objects_conf"].append( instances.scores.cpu().numpy())

    df["attrs_id"].append( instances.attr_classes.cpu().numpy())

    df["attrs_scores"].append( instances.attr_scores.cpu().numpy())

    df["boxes"].append( instances.pred_boxes.tensor.cpu().numpy())

    df["features"].append( features.cpu().numpy())
all_features = pd.DataFrame(df)
all_features.to_csv("my_dump", )
#     all_features.append((instances,features))
# instances, features = doit(all_imgs[-1])

# print(instances.pred_boxes)
# print(instances.scores)
# print(instances.pred_classes)
# print(instances.attr_classes)
# print(instances.attr_scores)



HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))

Original image size:  (1024, 848)
Transformed image size:  (966, 800)
Proposal Boxes size: torch.Size([93, 4])
Pooled features size: torch.Size([93, 2048])
Instances(num_instances=25, image_height=1024, image_width=848, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (428, 360)
Transformed image size:  (951, 800)
Proposal Boxes size: torch.Size([77, 4])
Pooled features size: torch.Size([77, 2048])
Instances(num_instances=21, image_height=428, image_width=360, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (832, 1024)
Transformed image size:  (800, 985)
Proposal Boxes size: torch.Size([93, 4])
Pooled features size: torch.Size([93, 2048])
Instances(num_instances=19, image_height=832, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (1280, 1024)
Transformed image size:  (1000, 800)
Proposal Boxes size: torch.Size([96, 4])
Pooled features siz

Instances(num_instances=21, image_height=655, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (768, 1024)
Transformed image size:  (800, 1067)
Proposal Boxes size: torch.Size([96, 4])
Pooled features size: torch.Size([96, 2048])
Instances(num_instances=1, image_height=768, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (454, 383)
Transformed image size:  (948, 800)
Proposal Boxes size: torch.Size([107, 4])
Pooled features size: torch.Size([107, 2048])
Instances(num_instances=23, image_height=454, image_width=383, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (512, 512)
Transformed image size:  (800, 800)
Proposal Boxes size: torch.Size([91, 4])
Pooled features size: torch.Size([91, 2048])
Instances(num_instances=36, image_height=512, image_width=512, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes]

Instances(num_instances=36, image_height=768, image_width=463, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (512, 512)
Transformed image size:  (800, 800)
Proposal Boxes size: torch.Size([95, 4])
Pooled features size: torch.Size([95, 2048])
Instances(num_instances=11, image_height=512, image_width=512, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (1307, 1024)
Transformed image size:  (1021, 800)
Proposal Boxes size: torch.Size([77, 4])
Pooled features size: torch.Size([77, 2048])
Instances(num_instances=24, image_height=1307, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (1127, 1024)
Transformed image size:  (880, 800)
Proposal Boxes size: torch.Size([76, 4])
Pooled features size: torch.Size([76, 2048])
Instances(num_instances=1, image_height=1127, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_class

Instances(num_instances=14, image_height=1170, image_width=1024, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (854, 766)
Transformed image size:  (892, 800)
Proposal Boxes size: torch.Size([79, 4])
Pooled features size: torch.Size([79, 2048])
Instances(num_instances=5, image_height=854, image_width=766, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (800, 600)
Transformed image size:  (1067, 800)
Proposal Boxes size: torch.Size([79, 4])
Pooled features size: torch.Size([79, 2048])
Instances(num_instances=1, image_height=800, image_width=600, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Original image size:  (410, 323)
Transformed image size:  (1015, 800)
Proposal Boxes size: torch.Size([81, 4])
Pooled features size: torch.Size([81, 2048])
Instances(num_instances=0, image_height=410, image_width=323, fields=[pred_boxes, scores, pred_classes, attr_scores, attr_classes])
Or

In [10]:
pd.read_csv("my_dump")

Unnamed: 0.1,Unnamed: 0,objects_id,objects_conf,attrs_id,attrs_scores,boxes,features
0,0,[ 72 72 72 72 72 956 956 72 956 956 941 9...,[0.69351244 0.49890116 0.48968267 0.37984845 0...,[163 163 163 163 163 11 11 163 11 11 115 ...,[0.4452995 0.4949437 0.4695939 0.51592565 0...,[[5.0339971e+00 0.0000000e+00 7.8052393e+02 2....,[[0.0000000e+00 0.0000000e+00 0.0000000e+00 .....
1,1,[191 53 274 274 53 274 53 274 53 274 53 ...,[0.50076324 0.40472943 0.37945783 0.36862713 0...,[11 0 0 0 0 0 0 0 7 0 7 7 0 11 0 ...,[0.3537783 0.19075967 0.2965493 0.27186775 0...,[[144.6067 248.95781 202.94795 306.4671...,[[4.9657074e-01 8.4801614e-02 1.4183599e+00 .....
2,2,[1251 1251 1251 395 395 395 395 395 758 ...,[0.3819849 0.36146998 0.35105303 0.33720773 0...,[210 210 7 163 163 163 163 163 7 210 163 2...,[0.2025735 0.23160517 0.20085882 0.39709234 0...,[[1.92684525e+02 1.74144272e+02 3.60033844e+02...,[[0.0000000e+00 0.0000000e+00 2.2475598e-03 .....
3,3,[ 956 623 242 623 976 956 623 956 956 ...,[0.42947546 0.422869 0.32063138 0.31490225 0...,[11 11 11 11 11 11 11 11 11 11 11 7 11 11 11 ...,[0.5459262 0.28252634 0.6158573 0.27720866 0...,[[5.80043755e+01 5.09970665e-01 1.01501337e+03...,[[3.5524480e-02 0.0000000e+00 0.0000000e+00 .....
4,4,[ 248 1069 1069 683 907 248 1069],[0.30651996 0.26655832 0.24919212 0.22590521 0...,[0 0 0 7 0 0 0],[0.46289015 0.23835789 0.29643387 0.38620916 0...,[[2.8595343e+02 0.0000000e+00 5.0729361e+02 4....,[[1.8191656e-02 1.6593523e-02 0.0000000e+00 .....
...,...,...,...,...,...,...,...
96,96,[ 60 60 715 60 72 72 72 60 72 72 60 ...,[0.5942373 0.539264 0.47682664 0.45885992 0...,[ 11 11 7 161 163 163 163 161 163 163 161 1...,[0.15446633 0.1649724 0.7775907 0.27600783 0...,[[3.85870270e+02 6.82138748e+01 8.47848267e+02...,[[1.4768969e+00 5.7344109e-02 4.2200065e-01 .....
97,97,[117 96 274 96 96 191 96 274 96 117 96 ...,[0.45193458 0.44480965 0.39654863 0.39499548 0...,[ 7 11 120 7 11 11 7 7 11 120 7 ...,[0.18166438 0.18142919 0.17640404 0.15566485 0...,[[175.58174 55.92364 413.1317 304.8...,[[1.31476998e-01 0.00000000e+00 2.15384558e-01...
98,98,[ 117 117 117 117 117 117 117 117 117 ...,[0.48544645 0.47749144 0.47132853 0.45126337 0...,[ 11 11 11 11 11 11 11 11 11 11 11 ...,[0.19828098 0.2468157 0.21104896 0.20126459 0...,[[178.07211 102.06256 570.98004 476.37582 ]...,[[0.04437328 0. 0. ... 0. ...
99,99,[1094 1094 453 1094 1094 453 1094 453 117 ...,[0.4763075 0.37306616 0.35470927 0.3113755 0...,[ 7 210 7 7 7 7 7 7 7 7 7 ...,[0.14432126 0.1323388 0.20676573 0.1458718 0...,[[2.91491730e+02 2.33148074e+00 8.01851257e+02...,[[5.56907542e-02 0.00000000e+00 0.00000000e+00...


In [19]:
instances.attr_classes

tensor([ 11,  11,   7, 163,   7,   7,   7, 163,  11, 163,   7,  11, 163,   7,
          7,   7,   7,   7,  11,   7,   7, 163,   7,   7,  11,   7],
       device='cuda:0')

In [27]:
(instances.pred_boxes)

Boxes(tensor([[2.2778e+02, 1.3005e+02, 5.3285e+02, 2.9880e+02],
        [2.2738e+02, 9.9733e+01, 5.6675e+02, 2.8024e+02],
        [4.3529e+02, 8.8691e+00, 9.5445e+02, 4.2413e+02],
        [1.2483e+00, 1.9482e+00, 7.3195e+02, 2.4749e+02],
        [3.8193e+02, 3.1409e+01, 8.6606e+02, 4.6317e+02],
        [1.7067e+02, 8.4857e+01, 5.4752e+02, 2.6166e+02],
        [4.1901e+02, 1.2294e+02, 9.0890e+02, 5.4064e+02],
        [4.8942e-01, 1.8325e+00, 5.7993e+02, 2.2042e+02],
        [0.0000e+00, 2.0419e+00, 4.0364e+02, 3.5917e+02],
        [0.0000e+00, 1.1431e+00, 4.7678e+02, 2.8606e+02],
        [2.5373e+02, 5.2835e+01, 9.7608e+02, 4.5294e+02],
        [2.7456e+02, 1.0568e+02, 5.2482e+02, 2.8500e+02],
        [1.0370e+02, 2.7992e+00, 8.9314e+02, 2.5076e+02],
        [4.9485e+02, 4.4994e+01, 1.0213e+03, 4.3988e+02],
        [3.6639e+02, 1.2751e+02, 8.2658e+02, 5.2072e+02],
        [3.8502e+02, 4.8876e+01, 1.0240e+03, 5.1301e+02],
        [3.2656e+02, 6.5756e+01, 8.1299e+02, 4.7967e+02],
        

In [26]:
instances.pred_boxes.tensor

tensor([[2.2778e+02, 1.3005e+02, 5.3285e+02, 2.9880e+02],
        [2.2738e+02, 9.9733e+01, 5.6675e+02, 2.8024e+02],
        [4.3529e+02, 8.8691e+00, 9.5445e+02, 4.2413e+02],
        [1.2483e+00, 1.9482e+00, 7.3195e+02, 2.4749e+02],
        [3.8193e+02, 3.1409e+01, 8.6606e+02, 4.6317e+02],
        [1.7067e+02, 8.4857e+01, 5.4752e+02, 2.6166e+02],
        [4.1901e+02, 1.2294e+02, 9.0890e+02, 5.4064e+02],
        [4.8942e-01, 1.8325e+00, 5.7993e+02, 2.2042e+02],
        [0.0000e+00, 2.0419e+00, 4.0364e+02, 3.5917e+02],
        [0.0000e+00, 1.1431e+00, 4.7678e+02, 2.8606e+02],
        [2.5373e+02, 5.2835e+01, 9.7608e+02, 4.5294e+02],
        [2.7456e+02, 1.0568e+02, 5.2482e+02, 2.8500e+02],
        [1.0370e+02, 2.7992e+00, 8.9314e+02, 2.5076e+02],
        [4.9485e+02, 4.4994e+01, 1.0213e+03, 4.3988e+02],
        [3.6639e+02, 1.2751e+02, 8.2658e+02, 5.2072e+02],
        [3.8502e+02, 4.8876e+01, 1.0240e+03, 5.1301e+02],
        [3.2656e+02, 6.5756e+01, 8.1299e+02, 4.7967e+02],
        [3.393

In [30]:
instances.pred_boxes.tensor.cpu().numpy().shape

(26, 4)

In [31]:
instances.attr_scores.cpu().numpy().shape

(26,)

In [39]:
import pandas as pd
from collections import defaultdict

In [42]:
pd.DataFrame(df)

Unnamed: 0,objects_id,objects_conf,attrs_id,attrs_scores,boxes,features
0,"[180, 180, 758, 72, 758, 180, 758, 72, 72, 72,...","[0.55572504, 0.44533208, 0.3769128, 0.37550977...","[11, 11, 7, 163, 7, 7, 7, 163, 11, 163, 7, 11,...","[0.20495716, 0.15364008, 0.33251885, 0.2434687...","[[227.7801, 130.04909, 532.8466, 298.8005], [2...","[[0.09992137, 0.0, 0.041029725, 0.62173855, 0...."


In [None]:
# Show the boxes, labels, and features
pred = instances.to('cpu')
v = Visualizer(im[:, :, :], MetadataCatalog.get("vg"), scale=1.2)
v = v.draw_instance_predictions(pred)
showarray(v.get_image()[:, :, ::-1])
print('instances:\n', instances)
print()
print('boxes:\n', instances.pred_boxes)
print()
print('Shape of features:\n', features.shape)

In [None]:
# Verify the correspondence of RoI features
pred_class_logits, pred_attr_logits, pred_proposal_deltas = predictor.model.roi_heads.box_predictor(features)
pred_class_probs = torch.nn.functional.softmax(pred_class_logits, -1)[:, :-1]
max_probs, max_classes = pred_class_probs.max(-1)
print("%d objects are different, it is because the classes-aware NMS process" % (NUM_OBJECTS - torch.eq(instances.pred_classes, max_classes).sum().item()))
print("The total difference of score is %0.4f" % (instances.scores - max_probs).abs().sum().item())