In [1]:
%load_ext autoreload
%autoreload 2

import json

# Which dataset split to evaluate; used to build every file name below.
split = 'val'

# Model predictions for the split. Later cells read the 'image' and 'answer'
# keys of every entry.
with open('saved_model/%s.json'%split) as results_file:
    results = json.load(results_file)

# Ground-truth annotations for the same split.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
with open('/home/liqing/Desktop/VizWiz_new/data/Annotations/%s.json'%split) as annotations_file:
    dataset = json.load(annotations_file)

# image name -> list of ground-truth answer records for that image.
img2gt = {x['image']:x['answers'] for x in dataset}

In [2]:
from collections import Counter
# Tag every image with a coarse answer type derived from its most frequent
# ground-truth answer, then print the share of each type.
img2ans_type = {}
for sample in dataset:
    votes = Counter(entry['answer'] for entry in sample['answers'])
    majority = votes.most_common(1)[0][0]
    if majority in ('yes', 'no'):
        kind = 'yes/no'
    elif majority in ('unanswerable', 'unsuitable'):
        kind = 'unanswerable'
    elif majority.isdigit():
        kind = 'number'
    else:
        kind = 'other'
    img2ans_type[sample['image']] = kind

# One answer type per annotated image; reused by the accuracy breakdown.
all_ans = img2ans_type.values()
total = len(all_ans)
print(total)
for ans_type in set(all_ans):
    # Fraction of images whose majority answer falls in this type.
    print('%s : %s' % (ans_type, all_ans.count(ans_type)*1.0/total))

3173
other : 0.594075007879
number : 0.0151276394579
unanswerable : 0.350772139931
yes/no : 0.0400252127324


In [3]:
import numpy as np
# Per-image accuracy: a prediction earns min(1, matches / 3), where matches is
# the number of lower-cased ground-truth answers equal to the predicted answer.
img2acc = {}
for prediction in results:
    image_name = prediction['image']
    references = [entry['answer'].lower() for entry in img2gt[image_name]]
    n_matches = references.count(prediction['answer'])
    img2acc[image_name] = np.minimum(1.0, n_matches/3.0)

# Overall mean, followed by the mean restricted to each answer type.
print('Accuracy : %s' % np.mean(img2acc.values()))
for ans_type in set(all_ans):
    accs_of_type = [acc for img, acc in img2acc.items() if img2ans_type[img] == ans_type]
    print('%s : %s' % (ans_type, np.mean(accs_of_type)))

Accuracy : 0.48681584200021005
other : 0.29885057471264365
number : 0.29166666666666663
unanswerable : 0.796346211440551
yes/no : 0.6377952755905512


In [4]:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

class COCOEvalCap:
    """Score predicted captions against references with BLEU-1..4, METEOR,
    ROUGE-L and CIDEr.

    Attributes:
        params:    {'image_id': images} exactly as passed to the constructor.
        gts:       image id -> list of reference caption records.
        res:       image id -> list of predicted caption records.
        eval:      metric name -> corpus-level score (filled by evaluate()).
        imgToEval: image id -> {'image_id': id, metric name: score, ...}.
        evalImgs:  list of the imgToEval values (filled by evaluate()).
    """

    def __init__(self,images,gts,res):
        """Store the inputs; nothing is computed until evaluate() is called.

        Args:
            images: collection of image ids, kept in self.params['image_id'].
            gts:    dict of reference caption records keyed by image id.
            res:    dict of predicted caption records keyed by image id.
        """
        self.evalImgs = []
        self.eval = {}
        self.imgToEval = {}
        self.params = {'image_id': images}
        self.gts = gts
        self.res = res

    def evaluate(self):
        """Tokenize both caption sets, run every scorer and record the results.

        Fills self.eval, self.imgToEval and self.evalImgs, and prints progress
        plus each corpus-level score.

        Raises:
            AssertionError: if the tokenized references and predictions do not
                cover the same set of image ids.
        """
        # =================================================
        # Tokenize references and predictions
        # =================================================
        print('tokenization...')
        tokenizer = PTBTokenizer()
        gts = tokenizer.tokenize(self.gts)
        res = tokenizer.tokenize(self.res)

        # Checked once up front: the condition does not change between scorers.
        assert set(gts.keys()) == set(res.keys())

        # Per-image scores are paired with the key order of the *tokenized*
        # references rather than with self.params['image_id']: the scorers walk
        # the dict they are handed, and its key order need not match the id
        # sequence supplied by the caller.
        # NOTE(review): this follows the upstream coco-caption evaluator, which
        # pairs per-image scores with gts.keys() -- confirm for the installed
        # pycocoevalcap version.
        scored_ids = list(gts.keys())

        # =================================================
        # Set up scorers
        # =================================================
        print('setting up scorers...')
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        for scorer, method in scorers:
            print('computing %s score...'%(scorer.method()))
            score, scores = scorer.compute_score(gts, res)
            if isinstance(method, list):
                # One scorer reporting several metrics (the Bleu_1..Bleu_4
                # entry): score/scores hold one item per metric name.
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, scored_ids, m)
                    print("%s: %0.3f"%(m, sc))
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, scored_ids, method)
                print("%s: %0.3f"%(method, score))
        self.setEvalImgs()

    def setEval(self, score, method):
        """Record the corpus-level `score` under the metric name `method`."""
        self.eval[method] = score

    def setImgToEvalImgs(self, scores, imgIds, method):
        """Record per-image scores for one metric.

        `scores` and `imgIds` are paired positionally; an entry is created in
        self.imgToEval the first time an image id is seen.
        """
        for imgId, score in zip(imgIds, scores):
            if imgId not in self.imgToEval:
                self.imgToEval[imgId] = {}
                self.imgToEval[imgId]["image_id"] = imgId
            self.imgToEval[imgId][method] = score

    def setEvalImgs(self):
        """Flatten self.imgToEval into the list self.evalImgs."""
        self.evalImgs = [img_eval for imgId, img_eval in self.imgToEval.items()]

In [5]:
# Predictions in the caption-record layout consumed by COCOEvalCap:
# image -> [{'image_id': ..., 'caption': predicted answer}].
res = {x['image']:[{'image_id':x['image'], 'caption':x['answer']}] for x in results}

# References in the same layout, one record per ground-truth answer.
gts = {}
for img, ans_list in img2gt.items():
    captions = []
    for ans in ans_list:
        try:
            captions.append(str(ans['answer']))
        except UnicodeError:
            # Best-effort: an answer that cannot be converted with str() is
            # skipped. Only this failure is tolerated; anything else
            # (missing key, interrupt, ...) still surfaces.
            continue
    gts[img] = [{'image_id': img, 'caption': caption} for caption in captions]

# Images without a prediction get an empty caption so that both dicts cover
# the same keys, which COCOEvalCap.evaluate() asserts.
for img in gts:
    if img not in res:
        res[img] = [{'image_id':img, 'caption':''}]

evalObj = COCOEvalCap(gts.keys(),gts,res)
evalObj.evaluate()
print(evalObj.eval)

tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 3533, 'guess': [3394, 221, 44, 8], 'testlen': 3394, 'correct': [2096, 114, 20, 7]}
ratio: 0.960656665723
Bleu_1: 0.593
Bleu_2: 0.542
Bleu_3: 0.504
Bleu_4: 0.573
computing METEOR score...
METEOR: 0.323
computing Rouge score...
ROUGE_L: 0.612
computing CIDEr score...
CIDEr: 0.726
{'CIDEr': 0.7261421572875759, 'Bleu_4': 0.5726746769784576, 'Bleu_3': 0.5040458898496727, 'Bleu_2': 0.5417633738134077, 'Bleu_1': 0.5927793583734714, 'ROUGE_L': 0.6121488343688126, 'METEOR': 0.3227668804784314}


In [6]:
import cPickle as pkl
# Per-image model outputs, indexed as prob[image][answer_id] by the next cell.
# Pickles are binary data, so the file is opened in 'rb' mode and closed
# deterministically.
# SECURITY NOTE: unpickling can execute arbitrary code; only load files that
# were produced by a trusted run of this project.
with open('saved_model/%s_prob.pkl'%split, 'rb') as prob_file:
    prob = pkl.load(prob_file)

# Answer string -> answer id lookup.
with open('data/create_vocab/answer2answer_id.json') as vocab_file:
    answer2answer_id = json.load(vocab_file)

# Ids of the two answers treated as "cannot be answered".
unanswerable_labels = [answer2answer_id['unanswerable'], answer2answer_id['unsuitable']]

# image name -> 'answerable' field of its annotation.
img2answerable = {x['image']:x['answerable'] for x in dataset}

In [7]:
from sklearn.metrics import recall_score, average_precision_score, precision_recall_curve

def _report_average_precision(labels, scores, tag, out_path):
    """Compute a precision-recall curve and its average precision, print the
    AP and dump the curve to a text file.

    Args:
        labels:   boolean array, True for the class treated as positive.
        scores:   array of scores for the positive class, aligned with labels.
        tag:      suffix of the printed metric name ("AP_<tag>").
        out_path: destination file. First line is the AP, followed by one
                  "recall<TAB>precision<TAB>threshold" row per threshold, in
                  the reverse of the order returned by precision_recall_curve.

    Returns:
        (average_precision, precision, recall, thresholds)
    """
    precision, recall, thresholds = precision_recall_curve(labels, scores)
    average_precision = average_precision_score(labels, scores)
    print("AP_%s: %.4f" % (tag, average_precision))
    with open(out_path,'w') as fid:
        fid.write(str(average_precision))
        fid.write('\n')
        # zip() stops at the shortest input, so only points that have a
        # threshold are written.
        fid.write('\n'.join(['%.4f\t%.4f\t%.4f'%x for x in list(zip(recall,precision,thresholds))[::-1]]))
    return average_precision, precision, recall, thresholds


y_test = []
pred = []

# The loop variable is deliberately not called `res`, so the `res` dict built
# for the caption metrics is not overwritten.
for one_result in results:
    img = one_result['image']
    gt_ans = img2answerable[img]
    y_test.append(gt_ans)
    one_prob = prob[img]
    # Answerability score: one minus the sum of the entries indexed by the
    # "unanswerable"/"unsuitable" answer ids.
    one_pred = 1 - sum([one_prob[x] for x in unanswerable_labels])
    pred.append(one_pred)
y_test = np.array(y_test)
pred = np.array(pred)

# Answerable treated as the positive class.
gt_labels = y_test > 0.5
average_precision, precision, recall, thresholds = _report_average_precision(
    gt_labels, pred, 'rel', 'saved_model/results_rel.txt')

# Unanswerable treated as the positive class, scored with the complement.
gt_labels_n = y_test < 0.5
pred_n = 1.0 - pred
average_precision, precision, recall, thresholds = _report_average_precision(
    gt_labels_n, pred_n, 'irrel', 'saved_model/results_irrel.txt')

AP_rel: 0.8777
AP_irrel: 0.5791
