* Input: a vqa.json file that contains all the data the model has to make predictions on
* output: a vqa predictions file that we can submit to the leaderboard 


* Just make sure you can do normal validation first --- DONE
* create Jupyter notebook that loads in model, and runs evaluation on a given image/question pair --- DONE
* generate image features from faster-rcnn for visual genome and nyu
* Given the vqa.json file from the grit benchmark, feed that into the model and get predictions
* Calculate the score with VQA’s metric: each sample in the vqa.json has 10 predictions, so the score has to be computed manually
* Generate a predictions file 
* Do this for both the ablation and the test splits


In [1]:
# coding=utf-8
# Copyleft 2019 project LXRT.
import neptune.new as neptune
import sys
sys.path.append('../')
import os
import collections

import torch
import torch.nn as nn
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
import numpy as np
# from param import args
from pretrain.qa_answer_table import load_lxmert_qa
# from tasks.vqa_model import VQAModel
# from tasks.vqa_data import VQADataset, VQATorchDataset, VQAEvaluator
DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator')
import json 
import os
# from tasks.gqa_data import GQADataset, GQATorchDataset, GQAEvaluator
import random



In [2]:
# Sanity check before building anything: the VQA trainer below moves the model
# and every batch to the GPU with unconditional .cuda() calls.
torch.cuda.is_available()

True

In [3]:
import argparse

def get_optimizer(optim):
    """Resolve an optimizer name to the matching ``torch.optim`` class.

    :param optim: Optimizer name: 'rms', 'adam', 'adamax', 'sgd', or any
        string that contains 'bert'.
    :return: The optimizer class, or the placeholder string 'bert' (the
        BERT optimizer is bound later by the caller).
    :raises AssertionError: if the name matches none of the above.
    """
    # (name, banner printed when selected, optimizer class)
    known_optimizers = (
        ('rms', "Optimizer: Using RMSProp", torch.optim.RMSprop),
        ('adam', "Optimizer: Using Adam", torch.optim.Adam),
        ('adamax', "Optimizer: Using Adamax", torch.optim.Adamax),
        ('sgd', "Optimizer: sgd", torch.optim.SGD),
    )
    for name, banner, optimizer_cls in known_optimizers:
        if optim == name:
            print(banner)
            return optimizer_cls

    # Any other name must mention 'bert'; everything else is rejected.
    assert 'bert' in optim, "Please add your optimizer %s in the list." % optim
    return 'bert'


def parse_args():
    """Build the notebook-wide configuration namespace.

    The parser mirrors the command-line options of the training scripts, but
    it is parsed against an empty argument list, so every option takes its
    default value. Edit the defaults below to change the configuration.

    Side effects: seeds ``torch``, ``random`` and ``numpy`` with ``--seed``
    and prints the seed.

    :return: ``argparse.Namespace`` with all options plus ``optimizer`` (the
        value returned by ``get_optimizer(args.optim)``).
    """
    parser = argparse.ArgumentParser()

    # Data Splits
    parser.add_argument("--train", default='train')
    parser.add_argument("--valid", default='')
    parser.add_argument("--test", default='minival')
    parser.add_argument("--subset", type=str, default=None, help='vqa-animals, myo-sports, myo-animals')
    parser.add_argument("--multiclass", action='store_const', default=False, const=True)
    parser.add_argument("--multilabel", action='store_const', default=False, const=True)
    parser.add_argument("--sampling_ids", type=str, default=None) # TODO when using sampling ids with vqa_from_scratch.bash, change sampling args for appropriate neptune logging

    # Training Hyper-parameters
    parser.add_argument('--batchSize', dest='batch_size', type=int, default=256)
    parser.add_argument('--optim', default='bert') 
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=965, help='random seed')

    # Debugging
    parser.add_argument('--output', type=str, default='snap/test')
    parser.add_argument("--fast", action='store_const', default=False, const=True)
    parser.add_argument("--tiny", action='store_const', default=False, const=True)
    parser.add_argument("--tqdm", action='store_const', default=False, const=True)

    # Model Loading
    # NOTE(review): the default below is an absolute path on one machine;
    # change it (or pass a relative checkpoint path) when running elsewhere.
    parser.add_argument('--load', type=str, default='/home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_full_run_3/BEST',
                        help='Load the model (usually the fine-tuned model).')
    parser.add_argument('--loadLXMERT', dest='load_lxmert', type=str, default=None,
                        help='Load the pre-trained LXMERT model.')
    parser.add_argument('--loadLXMERTQA', dest='load_lxmert_qa', type=str, default=None,
                        help='Load the pre-trained LXMERT model with QA answer head.')
    parser.add_argument("--fromScratch", dest='from_scratch', action='store_const', default=False, const=True,
                        help='If none of the --load, --loadLXMERT, --loadLXMERTQA is set, '
                             'the model would be trained from scratch. If --fromScratch is'
                             ' not specified, the model would load BERT-pre-trained weights by'
                             ' default. ')

    # Optimization
    parser.add_argument("--mceLoss", dest='mce_loss', action='store_const', default=False, const=True)

    # LXRT Model Config
    # Note: LXRT = L, X, R (three encoders), Transformer
    parser.add_argument("--llayers", default=1, type=int, help='Number of Language layers')
    parser.add_argument("--xlayers", default=1, type=int, help='Number of CROSS-modality layers.')
    parser.add_argument("--rlayers", default=1, type=int, help='Number of object Relationship layers.')

    # LXMERT Pre-training Config
    parser.add_argument("--taskMatched", dest='task_matched', action='store_const', default=False, const=True)
    parser.add_argument("--taskMaskLM", dest='task_mask_lm', action='store_const', default=False, const=True)
    parser.add_argument("--taskObjPredict", dest='task_obj_predict', action='store_const', default=False, const=True)
    parser.add_argument("--taskQA", dest='task_qa', action='store_const', default=False, const=True)
    parser.add_argument("--visualLosses", dest='visual_losses', default='obj,attr,feat', type=str)
    parser.add_argument("--qaSets", dest='qa_sets', default=None, type=str)
    parser.add_argument("--wordMaskRate", dest='word_mask_rate', default=0.15, type=float)
    parser.add_argument("--objMaskRate", dest='obj_mask_rate', default=0.15, type=float)

    # Training configuration
    parser.add_argument("--multiGPU", action='store_const', default=False, const=True)
    # type=int: the value is handed to DataLoader(num_workers=...), which needs
    # an integer; without a type a value given as text would stay a string.
    parser.add_argument("--numWorkers", dest='num_workers', type=int, default=0)

    # Datamaps
    parser.add_argument("--base_path", default='snap/vqa/lxr111_multilabel_full_run_3/', type=str, help='Path to trained model')
    parser.add_argument("--datamap_title", default='Trained from Scratch on VQA-Multiclass', type=str, help='Title of datamap plot')
    # Note the inverted polarity: this flag defaults to True and passing it
    # switches the value to False.
    parser.add_argument("--multilabel_datamaps", action='store_const', default=True, const=False)

    # Sampling
    parser.add_argument("--sampling_method", default='min_variability', type=str, help='Sampling algorithm - beta, random, max_variability, min_variability')
    parser.add_argument("--sampling_model", default='LXR111', type=str, help='Name of model you are sampling variability values from')
    parser.add_argument("--training_budget", default=30, type=int, help='Percentage of data sampled')
    parser.add_argument("--sampling_dataset", default='multilabel_full', type=str, help='animals, sports, myo-food, myo-sports, multilabel_full')
    parser.add_argument("--include_all_classes", action='store_const', default=False, const=True )
    
    # Beta sampling
    parser.add_argument("--alpha", default=2, type=int, help='alpha parameter for beta distribution')
    parser.add_argument("--beta", default=2, type=int, help='beta parameter for beta distribution')
    parser.add_argument("--norm", default='pvals', type=str, help='pvals, gaussian_kde, gaussian, tophat, epanechnikov, exponential, linear, cosine')
    parser.add_argument("--bandwidth", default=0.01, type=float, help='bandwidth for beta kernels')

    # Optuna
    parser.add_argument("--optuna_sweep", default='other', type=str, help='beta, other')
    parser.add_argument("--neptune_study_name", default='global_min_variability', type=str, help='name of optuna study')

    # Parse an explicitly empty argument list: only the defaults above apply,
    # and the process's own sys.argv is never consulted.
    args = parser.parse_args(args=[])

    # Bind optimizer class.
    args.optimizer = get_optimizer(args.optim)

    # Set seeds
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    print(args.seed)

    return args


# Notebook-wide configuration: the model, dataset and trainer cells below all
# read this module-level `args` instead of importing it from `param`.
args = parse_args()

965


In [4]:
# vqa_model.py
# coding=utf-8
# Copyleft 2019 project LXRT.

import torch.nn as nn

# from param import args
from lxrt.entry import LXRTEncoder
from lxrt.modeling import BertLayerNorm, GeLU

# Max length including <bos> and <eos>; handed to LXRTEncoder as max_seq_length.
MAX_VQA_LENGTH = 20


class VQAModel(nn.Module):
    """LXRT encoder followed by a small answer-classification head."""

    def __init__(self, num_answers):
        """
        :param num_answers: Number of answer classes, i.e. the width of the
            logit vector produced by ``forward``.
        """
        super().__init__()

        # Encoder is configured from the notebook-level ``args``.
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
        encoder_dim = self.lxrt_encoder.dim
        head_dim = encoder_dim * 2

        # Answer head: Linear -> GeLU -> LayerNorm -> Linear.
        head_layers = [
            nn.Linear(encoder_dim, head_dim),
            GeLU(),
            BertLayerNorm(head_dim, eps=1e-12),
            nn.Linear(head_dim, num_answers),
        ]
        self.logit_fc = nn.Sequential(*head_layers)
        # Initialise the head with the encoder's BERT weight initialiser.
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    def forward(self, feat, pos, sent):
        """
        Score every answer for a batch of (image features, question) pairs.

        b -- batch_size, o -- object_number, f -- visual_feature_size

        :param feat: (b, o, f)
        :param pos:  (b, o, 4)
        :param sent: (b,) Type -- list of string
        :return: (b, num_answer) The logit of each answers.
        """
        encoded = self.lxrt_encoder(sent, (feat, pos))
        return self.logit_fc(encoded)




In [9]:
# vqa.py
def get_data_tuple(splits: str, subset: str, bs:int, shuffle=False, drop_last=False, sampling_ids=None) -> DataTuple:
    """Build the (dataset, loader, evaluator) triple for VQA split(s).

    For every split string except 'minival', the raw dataset and then the
    torch dataset are each cut down to a whole number of batches of size
    ``bs`` by dropping trailing examples.

    :param splits: Split name(s) handed to ``VQADataset``.
    :param subset: Subset name handed to ``VQADataset``.
    :param bs: Batch size of the loader (also the trimming granularity).
    :param shuffle: Forwarded to ``DataLoader``.
    :param drop_last: Forwarded to ``DataLoader``.
    :param sampling_ids: Forwarded to ``VQADataset``.
    :return: ``DataTuple(dataset, loader, evaluator)``.
    """
    def drop_partial_batch(holder):
        # Remove the trailing examples that would not fill a batch.
        leftover = len(holder.data) % bs
        if leftover != 0:
            holder.data = holder.data[:-leftover]

    trim_to_batches = splits != 'minival'

    dset = VQADataset(splits, subset, sampling_ids)
    if trim_to_batches:
        drop_partial_batch(dset)

    tset = VQATorchDataset(dset)
    if trim_to_batches:
        drop_partial_batch(tset)

    evaluator = VQAEvaluator(dset)
    loader_options = dict(
        batch_size=bs,
        shuffle=shuffle,
        num_workers=args.num_workers,
        drop_last=drop_last,
        pin_memory=True,
    )
    data_loader = DataLoader(tset, **loader_options)
    return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator)

def get_gqa_tuple(splits: str, subset: str, bs:int, shuffle=False, drop_last=False) -> DataTuple:
    """Build the (dataset, loader, evaluator) triple for GQA split(s).

    NOTE(review): GQADataset, GQATorchDataset and GQAEvaluator must already be
    in scope; the `tasks.gqa_data` import at the top of the notebook appears
    to be commented out -- confirm before calling this.

    :param splits: Split name(s) handed to ``GQADataset``.
    :param subset: Subset name handed to ``GQADataset``.
    :param bs: Batch size of the loader.
    :param shuffle: Forwarded to ``DataLoader``.
    :param drop_last: Forwarded to ``DataLoader``.
    :return: ``DataTuple(dataset, loader, evaluator)``.
    """
    raw_dataset = GQADataset(splits, subset)
    torch_dataset = GQATorchDataset(raw_dataset)
    gqa_evaluator = GQAEvaluator(raw_dataset)

    return DataTuple(
        dataset=raw_dataset,
        loader=DataLoader(
            torch_dataset,
            batch_size=bs,
            shuffle=shuffle,
            num_workers=args.num_workers,
            drop_last=drop_last,
            pin_memory=True,
        ),
        evaluator=gqa_evaluator,
    )

class VQA:
    """Training / evaluation driver for ``VQAModel``.

    Construction loads the training split (and the validation split when
    ``args.valid`` is non-empty), builds the model on the GPU, and sets up the
    loss and optimizer from the notebook-level ``args``.
    """

    def __init__(self, sampling_ids=None):
        """
        :param sampling_ids: Forwarded to ``get_data_tuple`` for the training
            split only.
        """
        # Datasets
        self.train_tuple = get_data_tuple(
            args.train, args.subset, bs=args.batch_size, shuffle=True, drop_last=False, sampling_ids=sampling_ids
        )

        if args.valid != "":
            self.valid_tuple = get_data_tuple(
                args.valid, args.subset, bs=1024,
                shuffle=False, drop_last=False
            )
        else:
            self.valid_tuple = None

        # Model
        self.model = VQAModel(self.train_tuple.dataset.num_answers)

        # Load pre-trained weights
        if args.load_lxmert is not None:
            self.model.lxrt_encoder.load(args.load_lxmert)
        if args.load_lxmert_qa is not None:
            load_lxmert_qa(args.load_lxmert_qa, self.model,
                           label2ans=self.train_tuple.dataset.label2ans)

        # GPU options (a CUDA device is required)
        self.model = self.model.cuda()
        if args.multiGPU:
            self.model.lxrt_encoder.multi_gpu()

        # Loss and Optimizer
        if args.multiclass:
            self.loss_fxn = nn.CrossEntropyLoss()
        else:
            self.loss_fxn = nn.BCEWithLogitsLoss()
        if 'bert' in args.optim:
            batch_per_epoch = len(self.train_tuple.loader)
            t_total = int(batch_per_epoch * args.epochs)
            print("BertAdam Total Iters: %d" % t_total)
            from lxrt.optimization import BertAdam
            self.optim = BertAdam(list(self.model.parameters()),
                                  lr=args.lr,
                                  warmup=0.1,
                                  t_total=t_total)
        else:
            self.optim = args.optimizer(self.model.parameters(), args.lr)

        # Output Directory
        self.output = args.output
        os.makedirs(self.output, exist_ok=True)

    @staticmethod
    def _qid_key(qid):
        """Turn a collated question id into a plain dictionary key.

        Objects exposing ``.item()`` (e.g. a one-element tensor) are unwrapped
        to a Python scalar; anything else (e.g. a string id) is returned
        unchanged. Tensors must not be used as keys directly, because two
        tensors holding the same id are distinct keys.
        """
        return qid.item() if hasattr(qid, "item") else qid

    def _multiclass_stats(self, dset, epoch, batch_idx, ques_id, img_id, sent,
                          logit, target, label_np):
        """Per-example datamap records for one multiclass batch.

        Also appends a human-readable dump of the batch to ``log_preds.log``
        every 1000th batch.
        """
        target_np = target.cpu().numpy()
        # Probability assigned to the gold class. squeeze(1) (not a bare
        # squeeze) keeps the batch axis even when the batch holds one example.
        gt_probs = (
            torch.softmax(logit.detach(), dim=1)
            .gather(1, target.unsqueeze(1))
            .squeeze(1)
            .cpu()
            .numpy()
        )

        records = []
        for idx, question in enumerate(sent):
            records.append({
                "Epoch": int(epoch),
                "Question ID": int(ques_id[idx]),
                "Image ID": str(img_id[idx]),
                "Question": str(question),
                "Target": str(dset.label2ans[int(target_np[idx])]),
                "Prediction": str(dset.label2ans[int(label_np[idx])]),
                "GT Probability": float(gt_probs[idx])
            })

        if batch_idx % 1000 == 0:
            with open(self.output + "/log_preds.log", 'a') as preds_file:
                for idx, question in enumerate(sent):
                    ans_gt = dset.label2ans[target_np[idx]]
                    preds = dset.label2ans[label_np[idx]]
                    preds_file.write(
                        "Image ID: " + img_id[idx] + "\n Question: " + question
                        + "\n ans_gt: " + ans_gt + "\n preds: " + preds + "\n"
                    )
        return records

    @staticmethod
    def _multilabel_stats(dset, epoch, ques_id, img_id, sent, logit, target, label_np):
        """Per-example datamap records for one multilabel batch."""
        target_mask = (target > 0).cpu().numpy()
        probs_np = torch.sigmoid(logit.detach()).cpu().numpy()

        records = []
        for idx, question in enumerate(sent):
            # Indices of every answer with a positive ground-truth score.
            gt_indices = np.nonzero(target_mask[idx])[0]
            # Index the probabilities with the same indices so "Target" and
            # "GT Probability" always line up one-to-one.
            gt_probs = probs_np[idx][gt_indices]
            records.append({
                "Epoch": int(epoch),
                "Question ID": int(ques_id[idx]),
                "Image ID": str(img_id[idx]),
                "Question": str(question),
                "Target": ', '.join(dset.label2ans[t] for t in gt_indices),
                "Prediction": str(dset.label2ans[int(label_np[idx])]),
                "GT Probability": ', '.join(str(p) for p in gt_probs)
            })
        return records

    def train(self, train_tuple, eval_tuple):
        """Run ``args.epochs`` epochs of training.

        Side effects: appends to ``log.log`` (and ``log_preds.log`` in
        multiclass mode) under ``self.output``, writes
        ``datamaps_stats.json`` with one record per example per epoch, and
        saves ``BEST.pth`` / ``LAST.pth``.

        :param train_tuple: DataTuple to train on.
        :param eval_tuple: DataTuple scored after each epoch; only used when
            ``self.valid_tuple`` is not None.
        """
        dset, loader, evaluator = train_tuple
        iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x)

        best_valid = 0.
        training_stats = []
        for epoch in range(args.epochs):
            quesid2ans = {}

            for i, (ques_id, feats, boxes, sent, target, img_id) in iter_wrapper(enumerate(loader)):

                # Re-enter train mode every batch: evaluate() switches the
                # model to eval mode at the end of each epoch.
                self.model.train()
                self.optim.zero_grad()

                feats, boxes, target = feats.cuda(), boxes.cuda(), target.cuda()
                logit = self.model(feats, boxes, sent)

                loss = self.loss_fxn(logit, target)
                loss = loss * logit.size(1)

                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 5.)
                self.optim.step()

                # Highest-scoring answer per example.
                _, label = logit.max(1)
                label_np = label.cpu().numpy()
                for qid, l in zip(ques_id, label_np):
                    quesid2ans[self._qid_key(qid)] = dset.label2ans[l]

                if args.multiclass:
                    training_stats.extend(self._multiclass_stats(
                        dset, epoch, i, ques_id, img_id, sent, logit, target, label_np))
                else:
                    training_stats.extend(self._multilabel_stats(
                        dset, epoch, ques_id, img_id, sent, logit, target, label_np))

            # Score the epoch's training predictions once.
            train_score = evaluator.evaluate(quesid2ans) * 100.
            log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, train_score)

            if self.valid_tuple is not None:  # Do Validation
                valid_score = self.evaluate(eval_tuple)
                if valid_score > best_valid:
                    best_valid = valid_score
                    self.save("BEST")

                log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \
                           "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.)

            print(log_str, end='')

            with open(self.output + "/log.log", 'a') as f:
                f.write(log_str)
                f.flush()

        with open(self.output+'/datamaps_stats.json', 'w') as json_file:
            json.dump(training_stats, json_file,
                      indent=4,
                      separators=(',', ': '))
        self.save("LAST")

    def predict(self, eval_tuple: "DataTuple", train_label2ans=None, dump=None):
        """
        Predict the answers to questions in a data split.

        :param eval_tuple: The data tuple to be evaluated.
        :param train_label2ans: Optional label-index -> answer table used in
            place of ``eval_tuple.dataset.label2ans``.
        :param dump: The path of saved file to dump results.
        :return: A dict of question_id to answer.
        """
        self.model.eval()
        dset, loader, evaluator = eval_tuple
        label2ans = dset.label2ans if train_label2ans is None else train_label2ans
        quesid2ans = {}
        for datum_tuple in loader:
            ques_id, feats, boxes, sent = datum_tuple[:4]   # Avoid seeing ground truth
            with torch.no_grad():
                feats, boxes = feats.cuda(), boxes.cuda()
                logit = self.model(feats, boxes, sent)
                _, label = logit.max(1)
            for qid, l in zip(ques_id, label.cpu().numpy()):
                # Always store plain ids so that evaluator lookups work with
                # or without train_label2ans.
                quesid2ans[self._qid_key(qid)] = label2ans[l]
        if dump is not None:
            evaluator.dump_result(quesid2ans, dump)
        return quesid2ans

    def evaluate(self, eval_tuple: "DataTuple", train_label2ans= None, dump=None):
        """Predict on ``eval_tuple`` and return its evaluator's score."""

        quesid2ans = self.predict(eval_tuple, train_label2ans=train_label2ans, dump=dump)
        return eval_tuple.evaluator.evaluate(quesid2ans)

    @staticmethod
    def oracle_score(data_tuple):
        """Score obtained by answering every question with its top gold label."""
        dset, loader, evaluator = data_tuple
        quesid2ans = {}
        for i, (ques_id, feats, boxes, sent, target, img_id) in enumerate(loader):
            if args.multiclass:
                label = target
            else:
                _, label = target.max(1)
            for qid, l in zip(ques_id, label.cpu().numpy()):
                quesid2ans[VQA._qid_key(qid)] = dset.label2ans[l]
        return evaluator.evaluate(quesid2ans)

    def save(self, name):
        """Write the model's state dict to ``<self.output>/<name>.pth``."""
        torch.save(self.model.state_dict(),
                   os.path.join(self.output, "%s.pth" % name))

    def load(self, path):
        """Load a state dict from ``<path>.pth`` into the model."""
        print("Load model from %s" % path)
        state_dict = torch.load("%s.pth" % path)
        self.model.load_state_dict(state_dict)


In [10]:
# coding=utf-8
# Copyleft 2019 project LXRT.

import json
import os
import pickle

import numpy as np
import torch
from torch.utils.data import Dataset

# from param import args
from utils import load_obj_tsv

# Load part of the dataset for fast checking.
# Note that these are numbers of images, not numbers of questions: every
# question attached to a loaded image is used.
# TINY_IMG_NUM applies when args.tiny is set, FAST_IMG_NUM when args.fast is set.
TINY_IMG_NUM = 512
FAST_IMG_NUM = 5000

# The path to data and image features.
VQA_DATA_ROOT = '../../data/vqa/'
MSCOCO_IMGFEAT_ROOT = '../../data/mscoco_imgfeat/'
# Split name -> prefix of the '<prefix>_obj36.tsv' image-feature file.
SPLIT2NAME = {
    'train': 'train2014',
    'valid': 'val2014',
    'minival': 'val2014',
    'nominival': 'val2014',
    'test': 'test2015',
}


class VQADataset:
    """
    In-memory list of VQA examples for one or more comma-separated splits.

    A VQA data example in json file:
        {
            "answer_type": "other",
            "img_id": "COCO_train2014_000000458752",
            "label": {
                "net": 1
            },
            "question_id": 458752000,
            "question_type": "what is this",
            "sent": "What is this photo taken looking through?"
        }
    """
    def __init__(self, splits: str, subset: str, sampling_ids: str):
        """
        :param splits: Comma-separated split names; each is read from
            ``../../data/vqa/<split>.json``.
        :param subset: Stored on the instance; not used for filtering here.
        :param sampling_ids: Optional path to a pickle holding the question
            ids to keep. Ignored for filtering when 'minival' is among the
            splits.
        """
        self.name = splits
        self.splits = splits.split(',')
        self.subset = subset # training on subsets: multiclass
        self.sampling_ids = sampling_ids

        print("Loading full multilabel classification dataset")
        if self.sampling_ids is not None:
            # NOTE: pickle can execute arbitrary code while loading; only
            # point sampling_ids at files you created yourself.
            with open(self.sampling_ids, 'rb') as f:
                self.sampled_ids = pickle.load(f)
            print("ids length: ", len(self.sampled_ids))

        # Read every split exactly once.
        loaded_data = []
        for split in self.splits:
            loaded_data.extend(self._read_json("../../data/vqa/%s.json" % split))

        if 'minival' in self.splits or self.sampling_ids is None:
            # No sampling requested (or minival, which is never sub-sampled).
            self.data = list(loaded_data)
        else:
            wanted = self.sampled_ids
            if isinstance(wanted, (list, tuple)):
                # Constant-time membership instead of a linear scan per datum.
                try:
                    wanted = set(wanted)
                except TypeError:
                    pass  # unhashable ids: keep the original container
            self.data = [
                datum for datum in loaded_data
                if datum['question_id'] in wanted
            ]

        print("Load %d data from split(s) %s." % (len(self.data), self.name))

        # Convert list to dict (for evaluation)
        self.id2datum = {
            datum['question_id']: datum
            for datum in self.data
        }

        # Answers
        self.ans2label = self._read_json("../../data/vqa/trainval_ans2label.json")
        self.label2ans = self._read_json("../../data/vqa/trainval_label2ans.json")
        assert len(self.ans2label) == len(self.label2ans)

    @staticmethod
    def _read_json(path):
        """Parse one JSON file, closing the file handle afterwards."""
        with open(path) as json_file:
            return json.load(json_file)

    @property
    def num_answers(self):
        """Number of entries in the answer -> label table."""
        return len(self.ans2label)

    def __len__(self):
        return len(self.data)


"""
An example in obj36 tsv:
FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf",
              "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"]
FIELDNAMES would be keys in the dict returned by load_obj_tsv.
"""
class VQATorchDataset(Dataset):
    """Torch dataset pairing each VQA question with its image's object features."""

    def __init__(self, dataset: VQADataset):
        """
        :param dataset: Raw VQA dataset; only examples whose image has loaded
            features are kept.
        """
        super().__init__()
        self.raw_dataset = dataset

        # --tiny / --fast cap the number of images read from each feature file.
        topk = TINY_IMG_NUM if args.tiny else (FAST_IMG_NUM if args.fast else None)

        # img_id -> feature record, filled split by split.
        self.imgid2img = {}
        for split in dataset.splits:
            # Minival is 5K images in MS COCO, which is used in evaluating VQA/LXMERT-pre-training.
            # It is saved as the top 5K features in val2014_***.tsv
            if split == 'minival' and topk is None:
                load_topk = 5000
            else:
                load_topk = topk
            tsv_path = os.path.join(MSCOCO_IMGFEAT_ROOT, '%s_obj36.tsv' % (SPLIT2NAME[split]))
            for img_datum in load_obj_tsv(tsv_path, topk=load_topk):
                self.imgid2img[img_datum['img_id']] = img_datum

        # Only kept the data with loaded image features
        self.data = [
            datum for datum in self.raw_dataset.data
            if datum['img_id'] in self.imgid2img
        ]
        print("Use %d data in torch dataset" % (len(self.data)))
        print()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item: int):
        """
        :return: ``(ques_id, feats, boxes, ques, target, img_id)`` when the
            example has a 'label', otherwise ``(ques_id, feats, boxes, ques)``.
        """
        datum = self.data[item]
        img_id, ques_id, ques = datum['img_id'], datum['question_id'], datum['sent']

        # Get image info (copies, so the cached arrays are never modified)
        img_info = self.imgid2img[img_id]
        obj_num = img_info['num_boxes']
        feats = img_info['features'].copy()
        boxes = img_info['boxes'].copy()
        assert obj_num == len(boxes) == len(feats)

        # Normalize the boxes (to 0 ~ 1)
        img_h, img_w = img_info['img_h'], img_info['img_w']
        boxes[:, (0, 2)] /= img_w
        boxes[:, (1, 3)] /= img_h
        np.testing.assert_array_less(boxes, 1+1e-5)
        np.testing.assert_array_less(-boxes, 0+1e-5)

        # Unlabelled example: nothing more to provide.
        if 'label' not in datum:
            return ques_id, feats, boxes, ques

        # Provide label (target)
        label = datum['label']
        ans2label = self.raw_dataset.ans2label
        target = torch.zeros(self.raw_dataset.num_answers)
        if args.multiclass == True:
            assert len(label) == 1 # ensure there is only one gold label
            for ans in label:
                target[ans2label[ans]] = 1.0
            # Collapse the one-hot vector to the index of the gold class.
            target = torch.squeeze(target.nonzero()).long()
        else:
            for ans, score in label.items():
                target[ans2label[ans]] = score
        return ques_id, feats, boxes, ques, target, img_id


class VQAEvaluator:
    """Scores predicted answers against the soft labels stored in a dataset."""

    def __init__(self, dataset: "VQADataset"):
        """
        :param dataset: Object exposing ``id2datum`` (question id -> example);
            each example's 'label' maps an answer string to its score.
        """
        self.dataset = dataset

    def _score(self, ques_id, ans):
        """Score of ``ans`` for one question: its label value, or 0. if absent."""
        label = self.dataset.id2datum[ques_id]['label']
        return 0. + label[ans] if ans in label else 0.

    def evaluate(self, quesid2ans: dict):
        """
        Mean score over all predictions.

        :param quesid2ans: dict of quesid --> ans
        :return: Average score; 0. when ``quesid2ans`` is empty (instead of
            dividing by zero).
        :raises KeyError: if a question id is not in the dataset.
        """
        if not quesid2ans:
            return 0.
        total = 0.
        for quesid, ans in quesid2ans.items():
            total += self._score(quesid, ans)
        return total / len(quesid2ans)

    def dump_result(self, quesid2ans: dict, path):
        """
        Dump results to a json file, which could be submitted to the VQA online evaluation.
        VQA json file submission requirement:
            results = [result]
            result = {
                "question_id": int,
                "answer": str
            }
        Each result additionally carries the example's answer_type, img_id,
        label, question_type, question text and the achieved score.

        :param quesid2ans: dict of quesid --> ans
        :param path: The desired path of saved file.
        """
        # Build everything first so that a lookup error cannot leave an
        # empty or truncated file behind.
        result = []
        for ques_id, ans in quesid2ans.items():
            datum = self.dataset.id2datum[ques_id]
            result.append({
                'question_id': ques_id,
                'answer': ans,
                'answer_type': datum['answer_type'],
                'img_id': datum['img_id'],
                'label': datum['label'],
                'question_type': datum['question_type'],
                'question': datum['sent'],
                'score': self._score(ques_id, ans)
            })

        with open(path, 'w') as f:
            json.dump(result, f, indent=4, sort_keys=True)




In [11]:
def get_score(occurences):
    """Map an occurrence count to a score.

    0 -> 0, 1 -> 0.3, 2 -> 0.6, 3 -> 0.9, anything else -> 1.

    :param occurences: The occurrence count to score.
    :return: The score for that count.
    """
    # Position in the tuple is the occurrence count it scores.
    partial_scores = (0, 0.3, 0.6, 0.9)
    for count, score in enumerate(partial_scores):
        if occurences == count:
            return score
    return 1

In [12]:
# Build the trainer: loads the training split and its image features, creates
# the model on the GPU and sets up the optimizer. Loading the train features
# took about 328 seconds in the recorded run below.
vqa = VQA()


Loading full multilabel classification dataset
Load 443757 data from split(s) train.
Start to load Faster-RCNN detected objects from ../../data/mscoco_imgfeat/train2014_obj36.tsv
Loaded 82783 images in file ../../data/mscoco_imgfeat/train2014_obj36.tsv in 328 seconds.
Use 443648 data in torch dataset

LXRT encoder with 1 l_layers, 1 x_layers, and 1 r_layers.
BertAdam Total Iters: 17330


In [13]:
# Restore the fine-tuned weights; args.load is the checkpoint path without the
# '.pth' suffix (VQA.load appends it).
if args.load is not None:
    vqa.load(args.load)

Load model from /home/jaspreet/vl-pretraining/snap/vqa/lxr111_multilabel_full_run_3/BEST


In [14]:
# Sanity-check the loaded checkpoint on the minival split.
result = vqa.evaluate(
    get_data_tuple('minival', args.subset, bs=950,
                shuffle=False, drop_last=False)
)
# To also write per-question predictions, pass
# dump=os.path.join(args.output, 'minival_predict.json') to vqa.evaluate.
print(result)

Loading full multilabel classification dataset
Load 25994 data from split(s) minival.
Start to load Faster-RCNN detected objects from ../../data/mscoco_imgfeat/val2014_obj36.tsv
Loaded 5000 images in file ../../data/mscoco_imgfeat/val2014_obj36.tsv in 19 seconds.
Use 25994 data in torch dataset

0.6317380934061486
