In [1]:
import json

from collections import Counter

In [2]:
base_path = 'path to base dir'
train_anno_path = base_path + "data/annotations/mscoco_train2014_annotations.json"
val_anno_path = base_path + "data/annotations/mscoco_val2014_annotations.json"

train_ques_path = base_path + "data/annotations/OpenEnded_mscoco_train2014_questions.json"
val_ques_path = base_path + "data/annotations/OpenEnded_mscoco_val2014_questions.json"
testdev_ques_path = base_path + "data/annotations/OpenEnded_mscoco_test-dev2015_questions.json"
test_ques_path = base_path + "data/annotations/OpenEnded_mscoco_test2015_questions.json"

trainmc_ques_path = base_path + "data/annotations/MultipleChoice_mscoco_train2014_questions.json"
valmc_ques_path = base_path + "data/annotations/MultipleChoice_mscoco_val2014_questions.json"
testdevmc_ques_path = base_path + "data/annotations/MultipleChoice_mscoco_test-dev2015_questions.json"
testmc_ques_path = base_path + "data/annotations/MultipleChoice_mscoco_test2015_questions.json"

In [3]:
def num_to_score(num):
    if num > 3:
        return 1
    elif num == 3:
        return 0.9
    elif num == 2:
        return 0.6
    elif num == 1:
        return 0.3
    elif num == 0:
        return 0
    else:
        raise TypeError("Wrong type of number!")

In [4]:
def save_dataset(subtype, ques_path, mc_path, anno_path=None):
    dataset = []
    imdir = base_path + "data/images/%s/COCO_%s_%012d.jpg"
    ques_data = json.load(open(ques_path, "r"))
    mc_data = json.load(open(mc_path, "r"))
    anno_data = json.load(open(anno_path, "r")) if anno_path is not None else None
    
    for i in range(len(ques_data["questions"])):
        ques = ques_data["questions"][i]["question"]
        ques_id = ques_data["questions"][i]["question_id"]
        mc = mc_data["questions"][i]["multiple_choices"]
        img_id = ques_data["questions"][i]["image_id"]
        image_path = imdir%(subtype, subtype, img_id)
        
        item = {"ques_id": [ques_id], "img_path": image_path, "ques": [ques], "id": img_id, "mc": [mc]}
        
        if anno_path is not None:
            mc_ans = anno_data["annotations"][i]["multiple_choice_answer"]
            answers = Counter()
            for ans in anno_data["annotations"][i]["answers"]:
                answers.update([ans["answer"]])
            answers = [(ans, num_to_score(num)) for ans, num in answers.items()]
            
            assert img_id == anno_data["annotations"][i]["image_id"], "Image index doesn't match!"
            assert ques_id == anno_data["annotations"][i]["question_id"], "Question index doesn't match!"
            
            item["mc_ans"] = [mc_ans]
            item["ans"] = [answers]
        dataset.append(item)
        if (i+1) % 1000 == 0:
            print("processing %i/%i" % (i, len(ques_data["questions"])))
        
    return dataset

In [5]:
trainset = save_dataset("train2014", train_ques_path, trainmc_ques_path, train_anno_path)

processing 999/248349
processing 1999/248349
processing 2999/248349
processing 3999/248349
processing 4999/248349
processing 5999/248349
processing 6999/248349
processing 7999/248349
processing 8999/248349
processing 9999/248349
processing 10999/248349
processing 11999/248349
processing 12999/248349
processing 13999/248349
processing 14999/248349
processing 15999/248349
processing 16999/248349
processing 17999/248349
processing 18999/248349
processing 19999/248349
processing 20999/248349
processing 21999/248349
processing 22999/248349
processing 23999/248349
processing 24999/248349
processing 25999/248349
processing 26999/248349
processing 27999/248349
processing 28999/248349
processing 29999/248349
processing 30999/248349
processing 31999/248349
processing 32999/248349
processing 33999/248349
processing 34999/248349
processing 35999/248349
processing 36999/248349
processing 37999/248349
processing 38999/248349
processing 39999/248349
processing 40999/248349
processing 41999/248349
pro

In [6]:
print(len(trainset))

num_samples = 5
for i in range(num_samples):
    print(trainset[i])

248349
{'ques_id': [4870250], 'img_path': '/dip_project/data/images/train2014/COCO_train2014_000000487025.jpg', 'ques': ['What shape is the bench seat?'], 'id': 487025, 'mc': [['square', '1', 'w 26th st', 'white', 'rectangular', '2', 'red', '3', 'blue', '4', 'yellow sign', 'curved', 'green', 'no', 'rectangle', 'croatia express', 'medal', 'yes']], 'mc_ans': ['curved'], 'ans': [[('oval', 0.3), ('semi circle', 0.3), ('curved', 1), ('double curve', 0.3), ('banana', 0.3), ('wavy', 0.3), ('twisting', 0.3)]]}
{'ques_id': [4870251], 'img_path': '/dip_project/data/images/train2014/COCO_train2014_000000487025.jpg', 'ques': ['Is there a shadow?'], 'id': 487025, 'mc': [['north carolina', 'against wall', '1', '3', 'cookbook', 'audience', 'ppk', '2', 'elgin county', 'blue', 'desk', 'green', 'red', '4', 'adjusting woman oxygen', 'yes', 'no', 'white']], 'mc_ans': ['yes'], 'ans': [[('yes', 1)]]}
{'ques_id': [4870252], 'img_path': '/dip_project/data/images/train2014/COCO_train2014_000000487025.jpg', 'qu

In [7]:
train_path = base_path + "data/v1_mscoco_train.json"

with open(train_path, "w") as f:
    json.dump(trainset, f)

In [8]:
valset = save_dataset("val2014", val_ques_path, valmc_ques_path, val_anno_path)

processing 999/121512
processing 1999/121512
processing 2999/121512
processing 3999/121512
processing 4999/121512
processing 5999/121512
processing 6999/121512
processing 7999/121512
processing 8999/121512
processing 9999/121512
processing 10999/121512
processing 11999/121512
processing 12999/121512
processing 13999/121512
processing 14999/121512
processing 15999/121512
processing 16999/121512
processing 17999/121512
processing 18999/121512
processing 19999/121512
processing 20999/121512
processing 21999/121512
processing 22999/121512
processing 23999/121512
processing 24999/121512
processing 25999/121512
processing 26999/121512
processing 27999/121512
processing 28999/121512
processing 29999/121512
processing 30999/121512
processing 31999/121512
processing 32999/121512
processing 33999/121512
processing 34999/121512
processing 35999/121512
processing 36999/121512
processing 37999/121512
processing 38999/121512
processing 39999/121512
processing 40999/121512
processing 41999/121512
pro

In [9]:
print(len(valset))

num_samples = 5
for i in range(num_samples):
    print(valset[i])

121512
{'ques_id': [3506232], 'img_path': '/dip_project/data/images/val2014/COCO_val2014_000000350623.jpg', 'ques': ['What is the table made of?'], 'id': 350623, 'mc': [['4', 'green', 'no', 'metal', '2', 'blue', 'plastic', 'marble', 'wood', 'white', 'red', 'concrete bricks', 'robe', '3', '1', 'yes', 'siam', 'white and black']], 'mc_ans': ['wood'], 'ans': [[('wood', 1)]]}
{'ques_id': [3506230], 'img_path': '/dip_project/data/images/val2014/COCO_val2014_000000350623.jpg', 'ques': ['Is the food napping on the table?'], 'id': 350623, 'mc': [['tv is on', 'red', 'blue', '3', 'green', 'no', 'white', 'yes', '1', 'disney', 'on porch', 'mother and child', '2', 'slug bug', 'cartoons', '4', 'playing with dog', 'burger']], 'mc_ans': ['no'], 'ans': [[('no', 1), ('yes', 0.6)]]}
{'ques_id': [3506231], 'img_path': '/dip_project/data/images/val2014/COCO_val2014_000000350623.jpg', 'ques': ['What has been upcycled to make lights?'], 'id': 350623, 'mc': [['3', 'build bear', 'no', 'blue', 'bulbs', 'green', 

In [10]:
val_path = base_path + "data/v1_mscoco_val.json"

with open(val_path, "w") as f:
    json.dump(valset, f)

In [11]:
trainval_path = base_path + "data/v1_mscoco_trainval.json"

trainvalset = trainset + valset
with open(trainval_path, "w") as f:
    json.dump(trainvalset, f)

In [12]:
testdevset = save_dataset("test2015", testdev_ques_path, testdevmc_ques_path)

processing 999/60864
processing 1999/60864
processing 2999/60864
processing 3999/60864
processing 4999/60864
processing 5999/60864
processing 6999/60864
processing 7999/60864
processing 8999/60864
processing 9999/60864
processing 10999/60864
processing 11999/60864
processing 12999/60864
processing 13999/60864
processing 14999/60864
processing 15999/60864
processing 16999/60864
processing 17999/60864
processing 18999/60864
processing 19999/60864
processing 20999/60864
processing 21999/60864
processing 22999/60864
processing 23999/60864
processing 24999/60864
processing 25999/60864
processing 26999/60864
processing 27999/60864
processing 28999/60864
processing 29999/60864
processing 30999/60864
processing 31999/60864
processing 32999/60864
processing 33999/60864
processing 34999/60864
processing 35999/60864
processing 36999/60864
processing 37999/60864
processing 38999/60864
processing 39999/60864
processing 40999/60864
processing 41999/60864
processing 42999/60864
processing 43999/60864

In [13]:
print(len(testdevset))

num_samples = 5
for i in range(num_samples):
    print(testdevset[i])

60864
{'ques_id': [4195880], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Are the dogs tied?'], 'id': 419588, 'mc': [['1', 'bare', 'bacon hot dog beans', 'stumbling', '4', 'no', '3', 'black', 'yes', '2', 'ringling bros and barnum & bailey', 'quilted northern', 'junk', 'white', 'blue', 'hopefully', 'red', 'grass']]}
{'ques_id': [4195881], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Is this a car show?'], 'id': 419588, 'mc': [['chickens', 'wedding', 'not', 'bowling alley', 'casa nos bairros', 'raincoat', '3', 'no', 'red', 'yes', 'blue', 'pc', 'black', '1', '2', 'on buffalo', 'white', '4']]}
{'ques_id': [4195882], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Is there a lady sitting inside the red truck?'], 'id': 419588, 'mc': [['black', '3', 'southern', 'blueberries', 'blonde and blue', 'blue', 'in field', 'train station', 'no', '2', '4', 'n', 'yes', '1',

In [14]:
testdev_path = base_path + "data/v1_mscoco_testdev.json"

with open(testdev_path, "w") as f:
    json.dump(testdevset, f)

In [15]:
testset = save_dataset("test2015", test_ques_path, testmc_ques_path)

processing 999/244302
processing 1999/244302
processing 2999/244302
processing 3999/244302
processing 4999/244302
processing 5999/244302
processing 6999/244302
processing 7999/244302
processing 8999/244302
processing 9999/244302
processing 10999/244302
processing 11999/244302
processing 12999/244302
processing 13999/244302
processing 14999/244302
processing 15999/244302
processing 16999/244302
processing 17999/244302
processing 18999/244302
processing 19999/244302
processing 20999/244302
processing 21999/244302
processing 22999/244302
processing 23999/244302
processing 24999/244302
processing 25999/244302
processing 26999/244302
processing 27999/244302
processing 28999/244302
processing 29999/244302
processing 30999/244302
processing 31999/244302
processing 32999/244302
processing 33999/244302
processing 34999/244302
processing 35999/244302
processing 36999/244302
processing 37999/244302
processing 38999/244302
processing 39999/244302
processing 40999/244302
processing 41999/244302
pro

In [16]:
print(len(testset))

num_samples = 5
for i in range(num_samples):
    print(testset[i])

244302
{'ques_id': [4195880], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Are the dogs tied?'], 'id': 419588, 'mc': [['1', 'bare', 'bacon hot dog beans', 'stumbling', '4', 'no', '3', 'black', 'yes', '2', 'ringling bros and barnum & bailey', 'quilted northern', 'junk', 'white', 'blue', 'hopefully', 'red', 'grass']]}
{'ques_id': [4195881], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Is this a car show?'], 'id': 419588, 'mc': [['chickens', 'wedding', 'not', 'bowling alley', 'casa nos bairros', 'raincoat', '3', 'no', 'red', 'yes', 'blue', 'pc', 'black', '1', '2', 'on buffalo', 'white', '4']]}
{'ques_id': [4195882], 'img_path': '/dip_project/data/images/test2015/COCO_test2015_000000419588.jpg', 'ques': ['Is there a lady sitting inside the red truck?'], 'id': 419588, 'mc': [['black', '3', 'southern', 'blueberries', 'blonde and blue', 'blue', 'in field', 'train station', 'no', '2', '4', 'n', 'yes', '1'

In [17]:
test_path = base_path + "data/v1_mscoco_test.json"

with open(test_path, "w") as f:
    json.dump(testset, f)