In [1]:
import sys
import os
import torch
import yaml

from easydict import EasyDict as edict
from pytorch_transformers.tokenization_bert import BertTokenizer
from vilbert.datasets import ConceptCapLoaderTrain, ConceptCapLoaderVal
from vilbert.vilbert import VILBertForVLTasks, BertConfig, BertForMultiModalPreTraining
from vilbert.task_utils import LoadDatasetEval

import numpy as np
import matplotlib.pyplot as plt
import PIL

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from PIL import Image
import cv2
import argparse
import glob
from types import SimpleNamespace
import pdb

%matplotlib inline  

In [2]:
class FeatureExtractor:
    MAX_SIZE = 1333
    MIN_SIZE = 800

    def __init__(self):
        self.args = self.get_parser()
        self.detection_model = self._build_detection_model()

    def get_parser(self):        
        parser = SimpleNamespace(model_file= 'data/detectron_model.pth',
                                config_file='data/detectron_config.yaml',
                                batch_size=1,
                                num_features=100,
                                feature_name="fc6",
                                confidence_threshold=0,
                                background=False,
                                partition=0)
        return parser
    
    def _build_detection_model(self):
        cfg.merge_from_file(self.args.config_file)
        cfg.freeze()

        model = build_detection_model(cfg)
        checkpoint = torch.load(self.args.model_file, map_location=torch.device("cpu"))

        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")
        model.eval()
        return model

    def _image_transform(self, path):
        img = Image.open(path)
        im = np.array(img).astype(np.float32)
        # IndexError: too many indices for array, grayscale images
        if len(im.shape) < 3:
            im = np.repeat(im[:, :, np.newaxis], 3, axis=2)
        im = im[:, :, ::-1]
        im -= np.array([102.9801, 115.9465, 122.7717])
        im_shape = im.shape
        im_height = im_shape[0]
        im_width = im_shape[1]
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])

        # Scale based on minimum size
        im_scale = self.MIN_SIZE / im_size_min

        # Prevent the biggest axis from being more than max_size
        # If bigger, scale it down
        if np.round(im_scale * im_size_max) > self.MAX_SIZE:
            im_scale = self.MAX_SIZE / im_size_max

        im = cv2.resize(
            im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)

        im_info = {"width": im_width, "height": im_height}

        return img, im_scale, im_info

    def _process_feature_extraction(
        self, output, im_scales, im_infos, feature_name="fc6", conf_thresh=0
    ):
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(boxes) for boxes in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feature_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []
        info_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]
            max_conf = torch.zeros((scores.shape[0])).to(cur_device)
            conf_thresh_tensor = torch.full_like(max_conf, conf_thresh)
            start_index = 1
            # Column 0 of the scores matrix is for the background class
            if self.args.background:
                start_index = 0
            for cls_ind in range(start_index, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(
                    # Better than max one till now and minimally greater than conf_thresh
                    (cls_scores[keep] > max_conf[keep])
                    & (cls_scores[keep] > conf_thresh_tensor[keep]),
                    cls_scores[keep],
                    max_conf[keep],
                )

            sorted_scores, sorted_indices = torch.sort(max_conf, descending=True)
            num_boxes = (sorted_scores[: self.args.num_features] != 0).sum()
            keep_boxes = sorted_indices[: self.args.num_features]
            feat_list.append(feats[i][keep_boxes])
            bbox = output[0]["proposals"][i][keep_boxes].bbox / im_scales[i]
            # Predict the class label using the scores
            objects = torch.argmax(scores[keep_boxes][start_index:], dim=1)
            cls_prob = torch.max(scores[keep_boxes][start_index:], dim=1)

            info_list.append(
                {
                    "bbox": bbox.cpu().numpy(),
                    "num_boxes": num_boxes.item(),
                    "objects": objects.cpu().numpy(),
                    "image_width": im_infos[i]["width"],
                    "image_height": im_infos[i]["height"],
                    "cls_prob": scores[keep_boxes].cpu().numpy(),
                }
            )

        return feat_list, info_list

    def get_detectron_features(self, image_paths):
        img_tensor, im_scales, im_infos = [], [], []

        for image_path in image_paths:
            im, im_scale, im_info = self._image_transform(image_path)
            img_tensor.append(im)
            im_scales.append(im_scale)
            im_infos.append(im_info)

        # Image dimensions should be divisible by 32, to allow convolutions
        # in detector to work
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to("cuda")

        with torch.no_grad():
            output = self.detection_model(current_img_list)

        feat_list = self._process_feature_extraction(
            output,
            im_scales,
            im_infos,
            self.args.feature_name,
            self.args.confidence_threshold,
        )

        return feat_list

    def _chunks(self, array, chunk_size):
        for i in range(0, len(array), chunk_size):
            yield array[i : i + chunk_size]

    def _save_feature(self, file_name, feature, info):
        file_base_name = os.path.basename(file_name)
        file_base_name = file_base_name.split(".")[0]
        info["image_id"] = file_base_name
        info["features"] = feature.cpu().numpy()
        file_base_name = file_base_name + ".npy"

        np.save(os.path.join(self.args.output_folder, file_base_name), info)

    def extract_features(self, image_paths):

        features, infos = self.get_detectron_features(image_paths)

        return features, infos


In [3]:
def tokenize_batch(batch):
    return [tokenizer.convert_tokens_to_ids(sent) for sent in batch]

def untokenize_batch(batch):
    return [tokenizer.convert_ids_to_tokens(sent) for sent in batch]

def detokenize(sent):
    """ Roughly detokenizes (mainly undoes wordpiece) """
    new_sent = []
    for i, tok in enumerate(sent):
        if tok.startswith("##"):
            new_sent[len(new_sent) - 1] = new_sent[len(new_sent) - 1] + tok[2:]
        else:
            new_sent.append(tok)
    return new_sent

def printer(sent, should_detokenize=True):
    if should_detokenize:
        sent = detokenize(sent)[1:-1]
    print(" ".join(sent))


# write arbitary string for given sentense. 
import _pickle as cPickle

In [4]:
# def prediction(question, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask, task_tokens, gt_spatials, return_boxes=False):

#     vil_prediction, vil_prediction_gqa, vil_logit, vil_binary_prediction, vil_tri_prediction, vision_prediction, vision_logit, linguisic_prediction, linguisic_logit, attn_data_list = model(
#         question, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask, task_tokens, output_all_attention_masks=True
#     )

#     height, width = img.shape[0], img.shape[1]

#     if return_boxes:
#         _, grounding_idx = torch.sort(vision_logit.view(-1), 0, True)
#         idx = grounding_idx[0]
#         box = spatials[0][idx][:4].tolist()
#         y1 = int(box[1] * height)
#         y2 = int(box[3] * height)
#         x1 = int(box[0] * width)
#         x2 = int(box[2] * width)
#         return [x1, y1, x2, y2]

#     logits = torch.max(vil_prediction, 1)[1].data  # argmax
#     # Load VQA label to answers:
#     label2ans_path = os.path.join('save', "VQA", "trainval_label2ans.pkl")
#     vqa_label2ans = cPickle.load(open(label2ans_path, "rb"))
#     answer = vqa_label2ans[logits[0].item()]
#     print("VQA: " + answer)

#     # Load GQA label to answers:
#     label2ans_path = os.path.join('save', "gqa", "trainval_label2ans.pkl")

#     logtis_gqa = torch.max(vil_prediction_gqa, 1)[1].data
#     gqa_label2ans = cPickle.load(open(label2ans_path, "rb"))
#     answer = gqa_label2ans[logtis_gqa[0].item()]
#     print("GQA: " + answer)

#     # vil_binary_prediction NLVR2, 0: False 1: True Task 12
#     logtis_binary = torch.max(vil_binary_prediction, 1)[1].data
#     print("NLVR: " + str(logtis_binary.item()))

#     # vil_entaliment:  
#     label_map = {0:"contradiction", 1:"neutral", 2:"entailment"}
#     logtis_tri = torch.max(vil_tri_prediction, 1)[1].data
#     print("Entaliment: " + str(label_map[logtis_tri.item()]))

#     # vil_logit: 
#     logits_vil = vil_logit[0].item()
#     print("ViL_logit: %f" %logits_vil)

#     # grounding: 
#     logits_vision = torch.max(vision_logit, 1)[1].data
#     grounding_val, grounding_idx = torch.sort(vision_logit.view(-1), 0, True)

#     examples_per_row = 6
#     ncols = examples_per_row 
#     nrows = 1
#     figsize = [12, ncols*20]     # figure size, inches
#     fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)

#     for i, axi in enumerate(ax.flat):
#         idx = grounding_idx[i]
#         val = grounding_val[i]
#         box = spatials[0][idx][:4].tolist()
#         y1 = int(box[1] * height)
#         y2 = int(box[3] * height)
#         x1 = int(box[0] * width)
#         x2 = int(box[2] * width)
#         patch = img[y1:y2,x1:x2]
#         axi.imshow(patch)
#         axi.axis('off')
#         axi.set_title(str(i) + ": " + str(val.item()))
        
#     gt_box = gt_spatials[:4].tolist()
#     y1 = int(gt_box[1])
#     y2 = int(gt_box[3])
#     x1 = int(gt_box[0])
#     x2 = int(gt_box[2])
#     patch = img[y1:y2,x1:x2]
#     axi.imshow(patch)
#     axi.axis('off')
#     axi.set_title('Ground Truth')

#     plt.axis('off')
#     plt.tight_layout(True)
#     plt.show()  

In [5]:
def process_feats(features, infos):
    num_image = len(infos)
    max_length = 37
    feature_list = []
    image_location_list = []
    image_mask_list = []
    for i in range(num_image):
        image_w = infos[i]['image_width']
        image_h = infos[i]['image_height']
        feature = features[i]
        num_boxes = feature.shape[0]

        g_feat = torch.sum(feature, dim=0) / num_boxes
        num_boxes = num_boxes + 1
        feature = torch.cat([g_feat.view(1,-1), feature], dim=0)
        boxes = infos[i]['bbox']
        image_location = np.zeros((boxes.shape[0], 5), dtype=np.float32)
        image_location[:,:4] = boxes
        image_location[:,4] = (image_location[:,3] - image_location[:,1]) * (image_location[:,2] - image_location[:,0]) / (float(image_w) * float(image_h))
        image_location[:,0] = image_location[:,0] / float(image_w)
        image_location[:,1] = image_location[:,1] / float(image_h)
        image_location[:,2] = image_location[:,2] / float(image_w)
        image_location[:,3] = image_location[:,3] / float(image_h)
        g_location = np.array([0,0,1,1,1])
        image_location = np.concatenate([np.expand_dims(g_location, axis=0), image_location], axis=0)
        image_mask = [1] * (int(num_boxes))

        feature_list.append(feature)
        image_location_list.append(torch.tensor(image_location))
        image_mask_list.append(torch.tensor(image_mask))

    features = torch.stack(feature_list, dim=0).float().cuda()
    spatials = torch.stack(image_location_list, dim=0).float().cuda()
    image_mask = torch.stack(image_mask_list, dim=0).byte().cuda()
    co_attention_mask = torch.zeros((num_image, num_boxes, max_length)).cuda()
    return features, spatials, image_mask, co_attention_mask

In [6]:
# def custom_prediction(query, task, gt_spatials, features, spatials, image_mask, co_attention_mask, return_boxes=False):

#     tokens = tokenizer.encode(query)
#     tokens = tokenizer.add_special_tokens_single_sentence(tokens)

#     segment_ids = [0] * len(tokens)
#     input_mask = [1] * len(tokens)

#     max_length = 37
#     if len(tokens) < max_length:
#         # Note here we pad in front of the sentence
#         padding = [0] * (max_length - len(tokens))
#         tokens = tokens + padding
#         input_mask += padding
#         segment_ids += padding

#     text = torch.from_numpy(np.array(tokens)).cuda().unsqueeze(0)
#     input_mask = torch.from_numpy(np.array(input_mask)).cuda().unsqueeze(0)
#     segment_ids = torch.from_numpy(np.array(segment_ids)).cuda().unsqueeze(0)
#     task = torch.from_numpy(np.array(task)).cuda().unsqueeze(0)
#     print(text.shape, segment_ids.shape, input_mask.shape)

#     predicted_boxes = prediction(text, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask, task, gt_spatials, return_boxes=return_boxes)
#     if return_boxes:
#         return predicted_boxes

In [7]:

# =============================
# ViLBERT part
# =============================
feature_extractor = FeatureExtractor()

args = SimpleNamespace(from_pretrained= "save/multitask_model/pytorch_model_9.bin",
                       bert_model="bert-base-uncased",
                       config_file="config/bert_base_6layer_6conect.json",
                       max_seq_length=101,
                       train_batch_size=1,
                       do_lower_case=True,
                       predict_feature=False,
                       seed=42,
                       num_workers=0,
                       baseline=False,
                       img_weight=1,
                       distributed=False,
                       objective=1,
                       visual_target=0,
                       dynamic_attention=False,
                       task_specific_tokens=True,
                       tasks='1',
                       save_name='',
                       in_memory=False,
                       batch_size=1,
                       local_rank=-1,
                       split='mteval',
                       clean_train_sets=True
                      )

config = BertConfig.from_json_file(args.config_file)
with open('./vilbert_tasks.yml', 'r') as f:
    task_cfg = edict(yaml.safe_load(f))

task_names = []
for i, task_id in enumerate(args.tasks.split('-')):
    task = 'TASK' + task_id
    name = task_cfg[task]['name']
    task_names.append(name)

timeStamp = args.from_pretrained.split('/')[-1] + '-' + args.save_name
config = BertConfig.from_json_file(args.config_file)
default_gpu=True

if args.predict_feature:
    config.v_target_size = 2048
    config.predict_feature = True
else:
    config.v_target_size = 1601
    config.predict_feature = False

if args.task_specific_tokens:
    config.task_specific_tokens = True    

if args.dynamic_attention:
    config.dynamic_attention = True

config.visualization = True
num_labels = 3129

# if args.baseline:
#     model = BaseBertForVLTasks.from_pretrained(
#         args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu
#         )
# else:
model = VILBertForVLTasks.from_pretrained(
    args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu
)
    
model.eval()
cuda = torch.cuda.is_available()
if cuda: model = model.cuda(0)
# tokenizer = BertTokenizer.from_pretrained(
#     args.bert_model, do_lower_case=args.do_lower_case
# )
# print(args.bert_model, args.do_lower_case)

04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.0.bn1.bias                  loaded from backbone.body.layer1.0.bn1.bias                  of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.0.bn1.running_mean          loaded from backbone.body.layer1.0.bn1.running_mean          of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.0.bn1.running_var           loaded from backbone.body.layer1.0.bn1.running_var           of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.0.bn1.weight                loaded from backbone.body.layer1.0.bn1.weight                of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.0.bn2.bias                  loaded from backbone.body.layer1.0.bn2.bias              

04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.2.bn2.running_mean          loaded from backbone.body.layer1.2.bn2.running_mean          of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.2.bn2.running_var           loaded from backbone.body.layer1.2.bn2.running_var           of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.2.bn2.weight                loaded from backbone.body.layer1.2.bn2.weight                of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.2.bn3.bias                  loaded from backbone.body.layer1.2.bn3.bias                  of shape (256,)
04/26/2021 14:59:33 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer1.2.bn3.running_mean          loaded from backbone.body.layer1.2.bn3.running_mean      

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.1.bn3.running_var           loaded from backbone.body.layer2.1.bn3.running_var           of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.1.bn3.weight                loaded from backbone.body.layer2.1.bn3.weight                of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.1.conv1.weight              loaded from backbone.body.layer2.1.conv1.weight              of shape (512, 512, 1, 1)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.1.conv2.weight              loaded from backbone.body.layer2.1.conv2.weight              of shape (512, 16, 3, 3)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.1.conv3.weight              loaded from backbone.body.layer2.1.con

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.4.bn2.running_mean          loaded from backbone.body.layer2.4.bn2.running_mean          of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.4.bn2.running_var           loaded from backbone.body.layer2.4.bn2.running_var           of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.4.bn2.weight                loaded from backbone.body.layer2.4.bn2.weight                of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.4.bn3.bias                  loaded from backbone.body.layer2.4.bn3.bias                  of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.4.bn3.running_mean          loaded from backbone.body.layer2.4.bn3.running_mean      

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.7.bn1.bias                  loaded from backbone.body.layer2.7.bn1.bias                  of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.7.bn1.running_mean          loaded from backbone.body.layer2.7.bn1.running_mean          of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.7.bn1.running_var           loaded from backbone.body.layer2.7.bn1.running_var           of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.7.bn1.weight                loaded from backbone.body.layer2.7.bn1.weight                of shape (512,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer2.7.bn2.bias                  loaded from backbone.body.layer2.7.bn2.bias              

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.1.bn2.running_mean          loaded from backbone.body.layer3.1.bn2.running_mean          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.1.bn2.running_var           loaded from backbone.body.layer3.1.bn2.running_var           of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.1.bn2.weight                loaded from backbone.body.layer3.1.bn2.weight                of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.1.bn3.bias                  loaded from backbone.body.layer3.1.bn3.bias                  of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.1.bn3.running_mean          loaded from backbone.body.layer3.1.bn3.running_mean  

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.12.bn1.bias                 loaded from backbone.body.layer3.12.bn1.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.12.bn1.running_mean         loaded from backbone.body.layer3.12.bn1.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.12.bn1.running_var          loaded from backbone.body.layer3.12.bn1.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.12.bn1.weight               loaded from backbone.body.layer3.12.bn1.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.12.bn2.bias                 loaded from backbone.body.layer3.12.bn2.bias         

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.14.bn3.running_var          loaded from backbone.body.layer3.14.bn3.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.14.bn3.weight               loaded from backbone.body.layer3.14.bn3.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.14.conv1.weight             loaded from backbone.body.layer3.14.conv1.weight             of shape (1024, 1024, 1, 1)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.14.conv2.weight             loaded from backbone.body.layer3.14.conv2.weight             of shape (1024, 32, 3, 3)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.14.conv3.weight             loaded from backbone.body.layer3.

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.17.bn2.running_mean         loaded from backbone.body.layer3.17.bn2.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.17.bn2.running_var          loaded from backbone.body.layer3.17.bn2.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.17.bn2.weight               loaded from backbone.body.layer3.17.bn2.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.17.bn3.bias                 loaded from backbone.body.layer3.17.bn3.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.17.bn3.running_mean         loaded from backbone.body.layer3.17.bn3.running_mean 

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.2.bn1.bias                  loaded from backbone.body.layer3.2.bn1.bias                  of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.2.bn1.running_mean          loaded from backbone.body.layer3.2.bn1.running_mean          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.2.bn1.running_var           loaded from backbone.body.layer3.2.bn1.running_var           of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.2.bn1.weight                loaded from backbone.body.layer3.2.bn1.weight                of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.2.bn2.bias                  loaded from backbone.body.layer3.2.bn2.bias          

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.21.bn3.running_var          loaded from backbone.body.layer3.21.bn3.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.21.bn3.weight               loaded from backbone.body.layer3.21.bn3.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.21.conv1.weight             loaded from backbone.body.layer3.21.conv1.weight             of shape (1024, 1024, 1, 1)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.21.conv2.weight             loaded from backbone.body.layer3.21.conv2.weight             of shape (1024, 32, 3, 3)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.21.conv3.weight             loaded from backbone.body.layer3.

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.24.bn2.running_mean         loaded from backbone.body.layer3.24.bn2.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.24.bn2.running_var          loaded from backbone.body.layer3.24.bn2.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.24.bn2.weight               loaded from backbone.body.layer3.24.bn2.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.24.bn3.bias                 loaded from backbone.body.layer3.24.bn3.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.24.bn3.running_mean         loaded from backbone.body.layer3.24.bn3.running_mean 

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.27.bn1.bias                 loaded from backbone.body.layer3.27.bn1.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.27.bn1.running_mean         loaded from backbone.body.layer3.27.bn1.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.27.bn1.running_var          loaded from backbone.body.layer3.27.bn1.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.27.bn1.weight               loaded from backbone.body.layer3.27.bn1.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.27.bn2.bias                 loaded from backbone.body.layer3.27.bn2.bias         

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.29.bn3.running_var          loaded from backbone.body.layer3.29.bn3.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.29.bn3.weight               loaded from backbone.body.layer3.29.bn3.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.29.conv1.weight             loaded from backbone.body.layer3.29.conv1.weight             of shape (1024, 1024, 1, 1)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.29.conv2.weight             loaded from backbone.body.layer3.29.conv2.weight             of shape (1024, 32, 3, 3)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.29.conv3.weight             loaded from backbone.body.layer3.

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.31.bn2.running_mean         loaded from backbone.body.layer3.31.bn2.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.31.bn2.running_var          loaded from backbone.body.layer3.31.bn2.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.31.bn2.weight               loaded from backbone.body.layer3.31.bn2.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.31.bn3.bias                 loaded from backbone.body.layer3.31.bn3.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.31.bn3.running_mean         loaded from backbone.body.layer3.31.bn3.running_mean 

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.34.bn1.bias                 loaded from backbone.body.layer3.34.bn1.bias                 of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.34.bn1.running_mean         loaded from backbone.body.layer3.34.bn1.running_mean         of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.34.bn1.running_var          loaded from backbone.body.layer3.34.bn1.running_var          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.34.bn1.weight               loaded from backbone.body.layer3.34.bn1.weight               of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.34.bn2.bias                 loaded from backbone.body.layer3.34.bn2.bias         

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.4.bn3.running_var           loaded from backbone.body.layer3.4.bn3.running_var           of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.4.bn3.weight                loaded from backbone.body.layer3.4.bn3.weight                of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.4.conv1.weight              loaded from backbone.body.layer3.4.conv1.weight              of shape (1024, 1024, 1, 1)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.4.conv2.weight              loaded from backbone.body.layer3.4.conv2.weight              of shape (1024, 32, 3, 3)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.4.conv3.weight              loaded from backbone.body.layer3.

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.7.bn2.running_mean          loaded from backbone.body.layer3.7.bn2.running_mean          of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.7.bn2.running_var           loaded from backbone.body.layer3.7.bn2.running_var           of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.7.bn2.weight                loaded from backbone.body.layer3.7.bn2.weight                of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.7.bn3.bias                  loaded from backbone.body.layer3.7.bn3.bias                  of shape (1024,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer3.7.bn3.running_mean          loaded from backbone.body.layer3.7.bn3.running_mean  

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.0.bn1.bias                  loaded from backbone.body.layer4.0.bn1.bias                  of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.0.bn1.running_mean          loaded from backbone.body.layer4.0.bn1.running_mean          of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.0.bn1.running_var           loaded from backbone.body.layer4.0.bn1.running_var           of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.0.bn1.weight                loaded from backbone.body.layer4.0.bn1.weight                of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.0.bn2.bias                  loaded from backbone.body.layer4.0.bn2.bias          

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.2.bn2.running_mean          loaded from backbone.body.layer4.2.bn2.running_mean          of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.2.bn2.running_var           loaded from backbone.body.layer4.2.bn2.running_var           of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.2.bn2.weight                loaded from backbone.body.layer4.2.bn2.weight                of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.2.bn3.bias                  loaded from backbone.body.layer4.2.bn3.bias                  of shape (2048,)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   backbone.body.layer4.2.bn3.running_mean          loaded from backbone.body.layer4.2.bn3.running_mean  

04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   rpn.anchor_generator.cell_anchors.1              loaded from rpn.anchor_generator.cell_anchors.1              of shape (3, 4)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   rpn.anchor_generator.cell_anchors.2              loaded from rpn.anchor_generator.cell_anchors.2              of shape (3, 4)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   rpn.anchor_generator.cell_anchors.3              loaded from rpn.anchor_generator.cell_anchors.3              of shape (3, 4)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   rpn.anchor_generator.cell_anchors.4              loaded from rpn.anchor_generator.cell_anchors.4              of shape (3, 4)
04/26/2021 14:59:34 - INFO - maskrcnn_benchmark.utils.model_serialization -   rpn.head.bbox_pred.bias                          loaded from rpn.head.bbox_pred.bias                      

In [8]:
# 1: VQA, 2: GenomeQA, 4: Visual7w, 7: Retrieval COCO, 8: Retrieval Flickr30k 
# 9: refcoco, 10: refcoco+ 11: refcocog, 12: NLVR2, 13: VisualEntailment, 15: GQA, 16: GuessWhat, 


# image_path = 'demo/2.jpg'
# query = "glass on red car"

# features, infos = feature_extractor.extract_features(image_path)

# img = PIL.Image.open(image_path).convert('RGB')
# img = torch.tensor(np.array(img))

# plt.axis('off')
# plt.imshow(img)
# plt.show()

# task = [9]
# custom_prediction(query, task, features, infos)

In [9]:
def _iou2d(box_a, box_b):
#     device = box_a.device
#     box_a = box_a * box_std[0].to(device) + box_mean[0].to(device)
#     box_b = box_b * box_std[0].to(device) + box_mean[0].to(device)
    intersection = _intersect(box_a, box_b)
    vol_a = _area(box_a)
    vol_b = _area(box_b)
    union = vol_a + vol_b - intersection
    return intersection / union


def _intersect(box_a, box_b):
    xA = max(box_a[0], box_b[0])
    yA = max(box_a[1], box_b[1])
    xB = min(box_a[2], box_b[2])
    yB = min(box_a[3], box_b[3])
    return max(0, xB - xA) * max(0, yB - yA)


def _area(box):
    return (box[2] - box[0]) * (box[3] - box[1])

In [10]:
# image_path = os.path.join(im_path, annos[0]['filename'])
# query = "tree near house"
# gt_spatials = get_object_rois(annos[0])[5]
# features, infos = feature_extractor.extract_features(image_path)

# img = PIL.Image.open(image_path).convert('RGB')
# img = torch.tensor(np.array(img))

# plt.axis('off')
# plt.imshow(img)
# plt.show()

# task = [11]
# pred_boxes = custom_prediction(query, task, gt_spatials, features, infos, return_boxes=True)

In [11]:
# _iou2d(gt_spatials, pred_boxes)

In [12]:
from copy import deepcopy
import json
import os
import random

import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms

In [13]:
class IterDataset2d(Dataset):
    """Dataset utilities for conditional grounding."""

    def __init__(self, split='test'):
        """Initialize dataset."""
        super().__init__()
        self._path = '/projects/katefgroup/language_grounding/'
        self.im_path = self._path + 'VG/images/'
        self.anno_path = self._path + 'VG200/'
        self.split = split
        self.annos = self.load_annos()
        print('Loaded %d samples' % len(self.annos))
        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )

    def load_annos(self):
        """Load annotations."""
        splits = {0, 1} if self.split == 'train' else {2}
        with open(self.anno_path + 'VG200_preddet.json') as fid:
            annos = json.load(fid)
        annos = [
            anno
            for anno in annos
            if anno['relations']['names'] and anno['split_id'] in splits
        ]
        return annos

    def __getitem__(self, index):
        """Get image's data (used by loader to later form a batch)."""
        anno = deepcopy(self.annos[index])
        pairs = self.get_pairs(anno)
        num_pairs = len(pairs)
        gt_boxes = self.get_object_rois(anno)
        obj_ids = self.get_object_ids(anno)
        obj_names = self.get_object_names(anno)
        preds = self.get_predicate_names(anno)
        queries = []
        gt_spatials = []
        input_masks = []
        segment_id_list = []
        for idx in range(num_pairs):
            query = obj_names[pairs[idx][0]] + " " + preds[idx] + " " + obj_names[pairs[idx][1]]
            gt_spatials.append(gt_boxes[pairs[idx][0]])
            
            tokens = self.tokenizer.encode(query)
            tokens = self.tokenizer.add_special_tokens_single_sentence(tokens)

            segment_ids = [0] * len(tokens)
            input_mask = [1] * len(tokens)

            max_length = 37
            if len(tokens) < max_length:
                # Note here we pad in front of the sentence
                padding = [0] * (max_length - len(tokens))
                tokens = tokens + padding
                input_mask += padding
                segment_ids += padding
                
            queries.append(tokens)
            input_masks.append(input_mask)
            segment_id_list.append(segment_ids)
        
        
        return {
            "filename": anno['filename'],
            "file_path": os.path.join(self.im_path, anno['filename']),
            "num_queries": len(queries),
            # "scene": self._load_image(anno['filename']),
            "object_boxes": torch.from_numpy(np.asarray(gt_spatials)).float(),
            "queries": torch.from_numpy(np.asarray(queries)),
            "input_masks": torch.from_numpy(np.asarray(input_masks)),
            "segment_ids": torch.from_numpy(np.asarray(segment_id_list))
        }

    def __len__(self):
        """Override __len__ method, return dataset's size."""
        return len(self.annos)

    def _load_image(self, img_name):
        """Load image and add augmentations."""
        img_name = os.path.join(self.im_path, img_name)
        _img = Image.open(img_name).convert('RGB')
        width, height = _img.size
        max_wh = max(width, height)
        preprocessing = transforms.Compose([
            transforms.Pad((0, 0, max_wh - width, max_wh - height)),
            transforms.ToTensor()
        ])
        return preprocessing(_img)

    @staticmethod
    def get_object_ids(anno):
        """Return object classes ids for given image."""
        return anno['objects']['ids']

    @staticmethod
    def get_object_names(anno):
        """Return object classes for given image."""
        return anno['objects']['names']

    @staticmethod
    def get_object_rois(anno):
        """Return rois for objects of given image."""
        boxes = np.array(anno['objects']['boxes'])
        return np.round(boxes[:, (2, 0, 3, 1)])

    @staticmethod
    def get_pairs(anno):
        """Return an array of related object ids for given image."""
        return np.stack((
            np.array(anno['relations']['subj_ids']),
            np.array(anno['relations']['obj_ids'])
        ), axis=1)

    @staticmethod
    def get_predicate_names(anno):
        """Return predicate classes for given image."""
        return anno['relations']['names']

In [14]:
def iter2d_collate_fn(batch):
    """Collate function for scene-aware ScanNet object classification."""
    total_objects = sum([ex["num_queries"] for ex in batch])
    return {
        "file_path": [ex["file_path"] for ex in batch],
        # "scene": torch.stack([ex["scene"] for ex in batch]),
        "filename": [ex["filename"] for ex in batch],
        "num_queries": [ex["num_queries"] for ex in batch],
        "task": torch.ones((total_objects,1)) * 9,
        "object_boxes": torch.cat([ex["object_boxes"] for ex in batch]),
        "queries": torch.cat([ex["queries"] for ex in batch]),
        "input_masks": torch.cat([ex["input_masks"] for ex in batch]),
        "segment_ids": torch.cat([ex["segment_ids"] for ex in batch]),
    }

In [15]:
# dset = IterDataset2d()

In [16]:
from torch.utils.data import DataLoader

In [17]:
BATCH_SIZE = 4

In [18]:
dataloader = DataLoader(
            IterDataset2d(),
            batch_size=BATCH_SIZE,
            shuffle=False,
            drop_last=False,
            num_workers=2,
            collate_fn = iter2d_collate_fn
        )

04/26/2021 14:59:52 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/imedirat/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


Loaded 26446 samples


In [19]:
# for batch in dataloader:
#     print(batch["scene"].shape)
#     print(batch["filename"])
#     print(batch["num_queries"])
#     print(batch["task"].shape)
#     print(batch["object_boxes"].shape)
#     print(batch["queries"].shape)
#     print(batch["input_masks"].shape)
#     print(batch["segment_ids"].shape)
#     break

In [20]:
import csv
list1=['filename','query','pred_bbox_subject','iou']
with open("top1_results.csv", "a") as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerow(list1)

num_correct = 0
num_examples = 0

In [21]:
import time

In [None]:
with torch.no_grad():
    for step, batch in enumerate(dataloader):
        if step%10==0 and step>0:
            end_time = time.time()
            print(step, end_time-start_time)
        start_time = time.time()
        features, infos = feature_extractor.extract_features(batch["file_path"])
        features, spatials, image_mask, co_attention_mask = process_feats(features, infos)

        num_queries = torch.tensor(batch["num_queries"]).cuda()
        og_idx = np.repeat(np.arange(0, BATCH_SIZE), batch["num_queries"])
        features = torch.repeat_interleave(features, num_queries, dim=0)
        spatials = torch.repeat_interleave(spatials, num_queries, dim=0)
        image_mask = torch.repeat_interleave(image_mask, num_queries, dim=0)
        co_attention_mask = torch.repeat_interleave(co_attention_mask, num_queries, dim=0)
        
        try:
            _, _, _, _, _, _, vision_logit, _, _, _ = model(
                batch["queries"].cuda(), features, spatials, batch["segment_ids"].cuda(), batch["input_masks"].cuda(), image_mask, co_attention_mask, torch.Tensor(batch["task"]).long().cuda(), output_all_attention_masks=True
            )
        except:
            print(features.size()[0])
            
        stats = []
        for curr_idx in range(features.size()[0]):
            height, width = infos[og_idx[curr_idx]]["image_height"], infos[og_idx[curr_idx]]["image_width"]

            _, grounding_idx = torch.sort(vision_logit[curr_idx].view(-1), 0, True)
            box = spatials[curr_idx][grounding_idx[0]][:4].tolist()
            y1 = int(box[1] * height)
            y2 = int(box[3] * height)
            x1 = int(box[0] * width)
            x2 = int(box[2] * width)
            pred_box = [x1, y1, x2, y2]
            calc_iou = _iou2d(batch["object_boxes"][curr_idx], pred_box)
            if calc_iou > 0.3:
                num_correct += 1
            num_examples += 1
            stats.append([batch["file_path"][og_idx[curr_idx]], batch["queries"][curr_idx], pred_box, calc_iou])
        with open("top1_results.csv", "a") as f:
            writer = csv.writer(f)
            writer.writerows(stats)
#         with open("top1_results.csv", "a") as fp:
#             wr = csv.writer(fp, dialect='excel')
#             wr.writerow([file_names[curr_idx], batch["queries"][curr_idx], pred_boxe, calc_iou])
        #break

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  keep = ((ws >= min_size) & (hs >= min_size)).nonzero().squeeze(1)


10 6.58076286315918
20 6.954320192337036
30 6.220170974731445
40 6.419802665710449
50 7.524116516113281
60 6.14569616317749
70 6.733557462692261
80 6.6103386878967285
90 7.046209096908569
100 6.116795778274536
110 6.007827997207642
120 6.935357332229614
130 6.345007419586182
140 7.62774395942688
150 7.444244384765625
160 6.719703912734985
170 7.037835121154785
180 7.377300024032593
190 6.9635045528411865
200 8.31601357460022
210 7.764005899429321
220 6.61109185218811
230 7.093136787414551
240 8.240348100662231
250 6.964065313339233
260 6.557512044906616
270 6.454111576080322
280 7.937801122665405
290 7.143168687820435
300 6.9461588859558105
310 7.067336320877075
320 8.046345472335815
330 7.832234144210815
340 6.938360691070557
350 6.991305351257324
360 6.610357284545898
370 7.006381511688232
380 6.90432596206665
390 6.809741020202637
400 6.309485197067261
410 6.9454381465911865
420 7.0104079246521
430 8.158775329589844
440 7.314605236053467
450 6.400257587432861
460 8.21868109703064
47

In [23]:
print(len(stats))

37


In [None]:
import csv

# with open("output_stats.csv", "wb") as f:
#     writer = csv.writer(f)
#     writer.writerows(stats)

In [None]:
# from tqdm import tqdm

# with torch.no_grad():
#     for anno in tqdm(annos):
#         pairs = get_pairs(anno)
#         num_pairs = len(pairs)
#         gt_boxes = get_object_rois(anno)
#         obj_ids = get_object_ids(anno)
#         obj_names = get_object_names(anno)
#         preds = get_predicate_names(anno)
#         image_path = os.path.join(im_path, anno['filename'])
#         features, infos = feature_extractor.extract_features(image_path)
#         features, spatials, image_mask, co_attention_mask = process_feats(features, infos)
#         print(features.shape, spatials.shape, image_mask.shape, co_attention_mask.shape)
#         img = PIL.Image.open(image_path).convert('RGB')
#         img = torch.tensor(np.array(img))
#         for idx in range(num_pairs):
#             query = obj_names[pairs[idx][0]] + " " + preds[idx] + " " + obj_names[pairs[idx][1]]
#             gt_spatials = get_object_rois(anno)[pairs[idx][0]]
            
#             tokens = tokenizer.encode(query)
#             tokens = tokenizer.add_special_tokens_single_sentence(tokens)

#             segment_ids = [0] * len(tokens)
#             input_mask = [1] * len(tokens)

#             max_length = 37
#             if len(tokens) < max_length:
#                 # Note here we pad in front of the sentence
#                 padding = [0] * (max_length - len(tokens))
#                 tokens = tokens + padding
#                 input_mask += padding
#                 segment_ids += padding

#             text = torch.from_numpy(np.array(tokens)).cuda().unsqueeze(0)
#             input_mask = torch.from_numpy(np.array(input_mask)).cuda().unsqueeze(0)
#             segment_ids = torch.from_numpy(np.array(segment_ids)).cuda().unsqueeze(0)
#             task = torch.from_numpy(np.array(task)).cuda().unsqueeze(0)
#             print(text.shape, segment_ids.shape, input_mask.shape)

#             predicted_boxes = prediction(text, features, spatials, segment_ids, input_mask, image_mask, co_attention_mask, task, gt_spatials, return_boxes=return_boxes)
#             # pred_boxes = custom_prediction(query, [11], gt_spatials, features, spatials, image_mask, co_attention_mask, return_boxes=True)
#             calc_iou = _iou2d(gt_spatials, pred_boxes)
#             if calc_iou > 0.3:
#                 num_correct += 1
#             num_examples += 1
#             with open("top1_results.csv", "a") as fp:
#                 wr = csv.writer(fp, dialect='excel')
#                 wr.writerow([anno['filename'], query, pred_boxes, calc_iou])
#         break

In [24]:
print("Top 1 Accuracy: {}".format(num_correct/num_examples))

Top 1 Accuracy: 0.4562623550746342


In [25]:
num_examples

183629

In [26]:
num_correct

83783

In [27]:
len(dataloader)

6612

In [28]:
step

6611