# VQA2.0 dataset visualisation

This notebook documents how to create the data for VQA2.0 data visualization. 

The graph is plotted using Excel, the graphs are documented in `dataset_visualization.xlsx`.

prerequisite: The below four files should exists
* VQA2/v2_OpenEnded_mscoco_train2014_questions.json
* VQA2/v2_OpenEnded_mscoco_val2014_questions.json
* VQA2/v2_mscoco_train2014_annotations.json
* VQA2/v2_mscoco_val2014_annotations.json

They can be downloaded from the below sites respectively
* https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip
* https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip
* https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip
* https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip

In [1]:
# import libraries
import json
from collections import Counter
import csv
import nltk

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# train dataset visualization data creation

In [2]:
# training question json
train_qn_file = "VQA2/v2_OpenEnded_mscoco_train2014_questions.json"
with open(train_qn_file) as fs:
    train_qns = json.load(fs)

# training answers json
train_ans_file = "VQA2/v2_mscoco_train2014_annotations.json"
with open(train_ans_file) as fs:
    train_ans = json.load(fs)
    print(train_ans["annotations"][:10])

[{'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}, {'question_type': 'what', 'multiple_choice_answer': 'pitcher', 'answers': [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'catcher', 'answer_confidence': 'no', 'answer_id': 2}, 

In [3]:
# get the number of yes/no questions and their respective questions
# according to answer json

train_yes_no_qn_ids = []
train_total_yes_no = 0
train_yes_num = 0
train_no_num = 0
for ans in train_ans["annotations"]:
    if ans["answer_type"] == "yes/no":   # yes/no questions
        train_total_yes_no += 1
        train_yes_no_qn_ids.append(ans["question_id"])   # save ids
        if ans["multiple_choice_answer"] == "yes":
            train_yes_num += 1
        else:
            train_no_num += 1
print("Training set, number of yes/no questions:", train_total_yes_no)
print("Num of yes answer questions: ", train_yes_num)
print("Num of no answer questions: ", train_no_num)

Training set, number of yes/no questions: 166882
Num of yes answer questions:  84615
Num of no answer questions:  82267


In [4]:
# get all the question strings corresponding to yes/no answers
train_yes_no_ids = set(train_yes_no_qn_ids)

train_yes_no_qn = 0
train_qn_word_list = []
for qn in train_qns["questions"]:
    if qn["question_id"] in train_yes_no_ids:
        train_yes_no_qn += 1
        train_qn_word_list.extend([i.lower() for i in qn["question"][:-1].split(" ")])

print(train_qn_word_list[:100])
print("number of yes no questions: ", train_yes_no_qn)

train_joined_string = " ".join(train_qn_word_list)
train_cleaned_list = nltk.word_tokenize(train_joined_string)
train_counts = Counter(train_cleaned_list)
train_sort_wc = sorted(train_counts.items(), key=lambda x: x[1], reverse=True)
print(train_sort_wc)


['is', 'this', 'man', 'a', 'professional', 'baseball', 'player', 'is', 'the', 'dog', 'waiting', 'is', 'the', 'sky', 'blue', 'is', 'there', 'snow', 'on', 'the', 'mountains', 'is', 'the', 'window', 'open', 'is', 'she', 'brushing', 'is', 'the', 'man', 'smiling', 'does', 'his', 'tie', 'pair', 'well', 'with', 'his', 'suit', 'does', 'the', 'man', 'look', 'happy', 'is', 'this', 'photo', 'in', 'color', 'is', 'the', 'man', 'wearing', 'a', 'plain', 'tie', 'judging', 'from', 'the', 'dress,', 'was', 'this', 'taken', 'in', 'a', 'latin', 'american', 'country', 'are', 'the', 'men', 'on', 'the', 'sidewalk', 'was', 'this', 'photo', 'taken', 'recently', 'does', 'the', 'guy', 'have', 'a', 'tattoo', 'is', 'the', "man's", 'visor', 'providing', 'his', 'face', 'enough', 'protection', 'is', 'the', 'man', 'riding', 'on']
number of yes no questions:  166882


In [5]:
# save word count to csv file
with open('train_word_count.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in train_sort_wc:
        spamwriter.writerow([word[0], word[1]])

In [6]:
# get part of speecg tag
train_pos = nltk.pos_tag(list(train_counts.keys()))
print(train_pos)



In [7]:
# find verb, noun, adjective, spatial prepersation
spatial_prep_word = [
                "above", "across", "around","behind", "beside", "down", 
                "from", "in", "inside", "into", "near", "next", "on", "out", 
                "outside", "over", "through", "under", "up", "with"
              ]
train_verb = {}
train_noun = {}
train_adjective = {}
train_spatial_prep = {}

for word in train_pos:
    if word[0] in spatial_prep_word:
        train_spatial_prep[word[0]] = train_counts[word[0]]
        continue
    if word[1].startswith("J"):
        train_adjective[word[0]] = train_counts[word[0]]
        continue
    if word[1].startswith("NN"):
        train_noun[word[0]] = train_counts[word[0]]
        continue
    if word[1].startswith("VB"):
        train_verb[word[0]] = train_counts[word[0]]
        continue
        

In [8]:
# sort part of speech wourd count dictionaries
train_verb_sort_wc = sorted(train_verb.items(), key=lambda x: x[1], reverse=True)
train_adjective_sort_wc = sorted(train_adjective.items(), key=lambda x: x[1], reverse=True)
train_noun_sort_wc = sorted(train_noun.items(), key=lambda x: x[1], reverse=True)
train_spatial_prep_sort_wc = sorted(train_spatial_prep.items(), key=lambda x: x[1], reverse=True)
train_spatial_prep_sort_wc

[('in', 26122),
 ('on', 17285),
 ('with', 1515),
 ('up', 1160),
 ('out', 1079),
 ('from', 1074),
 ('outside', 1046),
 ('down', 804),
 ('inside', 496),
 ('near', 478),
 ('behind', 467),
 ('over', 413),
 ('next', 362),
 ('under', 346),
 ('around', 315),
 ('into', 309),
 ('through', 288),
 ('above', 187),
 ('across', 62),
 ('beside', 54)]

In [9]:
# save sorted part of speech result to csv

# verb
with open('train_word_count_verb.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in train_verb_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# noun
with open('train_word_count_noun.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in train_noun_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# adjective
with open('train_word_count_adj.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in train_adjective_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# prepersation
with open('train_word_count_prep.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in train_spatial_prep_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# validation set dataset visualization data creation

In [10]:
# val question json
val_qn_file = "VQA2/v2_OpenEnded_mscoco_val2014_questions.json"
with open(val_qn_file) as fs:
    val_qns = json.load(fs)

# val answer json
val_ans_file = "VQA2/v2_mscoco_val2014_annotations.json"
with open(val_ans_file) as fs:
    val_ans = json.load(fs)
    print(train_ans["annotations"][:10])

[{'question_type': 'what is this', 'multiple_choice_answer': 'net', 'answers': [{'answer': 'net', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'netting', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'mesh', 'answer_confidence': 'maybe', 'answer_id': 7}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'net', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 458752, 'answer_type': 'other', 'question_id': 458752000}, {'question_type': 'what', 'multiple_choice_answer': 'pitcher', 'answers': [{'answer': 'pitcher', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'catcher', 'answer_confidence': 'no', 'answer_id': 2}, 

In [11]:
# get the number of yes/no questions and their respective questions
# according to answer json

val_yes_no_qn_ids = []
val_total_yes_no = 0
val_yes_num = 0
val_no_num = 0
for ans in val_ans["annotations"]:
    if ans["answer_type"] == "yes/no":
        val_total_yes_no += 1
        val_yes_no_qn_ids.append(ans["question_id"])
        if ans["multiple_choice_answer"] == "yes":
            val_yes_num += 1
        else:
            val_no_num += 1
print("Training set, number of yes/no questions:", val_total_yes_no)
print("Num of yes answer questions: ", val_yes_num)
print("Num of no answer questions: ", val_no_num)

Training set, number of yes/no questions: 80541
Num of yes answer questions:  40585
Num of no answer questions:  39956


In [12]:
# get all the question strings corresponding to yes/no answers
val_yes_no_ids = set(val_yes_no_qn_ids)
val_yes_no_qn = 0
val_qn_word_list = []
for qn in val_qns["questions"]:
    if qn["question_id"] in val_yes_no_ids:
        val_yes_no_qn += 1
        val_qn_word_list.extend([i.lower() for i in qn["question"][:-1].split(" ")])

print(val_qn_word_list[:100])
print(val_yes_no_qn)

val_joined_string = " ".join(val_qn_word_list)
val_cleaned_list = nltk.word_tokenize(val_joined_string)
val_counts = Counter(val_cleaned_list)
val_sort_wc = sorted(val_counts.items(), key=lambda x: x[1], reverse=True)

print(val_sort_wc)

['is', 'this', 'a', 'creamy', 'soup', 'is', 'this', 'rice', 'noodle', 'soup', 'is', 'it', 'daylight', 'in', 'this', 'picture', 'did', 'the', 'batter', 'hit', 'the', 'ball', 'is', 'there', 'a', 'chain', 'link', 'fence', 'in', 'the', 'image', 'is', 'the', 'boy', 'playing', 'baseball', 'is', 'that', 'a', 'folding', 'chair', 'are', 'these', 'twin', 'mattresses', 'is', 'this', 'room', 'decorated', 'for', 'the', '1970s', 'are', 'the', 'lights', 'on', 'in', 'this', 'room', 'are', 'the', 'windows', 'big', 'is', 'this', 'room', 'in', "someone's", 'home', 'is', 'the', 'bed', 'white', 'could', 'this', 'be', 'a', 'hotel', 'room', 'is', 'the', 'bed', 'made', 'are', 'there', 'bed', 'headboards', 'present', 'in', 'the', 'photo', 'is', 'there', 'a', 'mirror', 'in', 'the', 'room', 'is', 'the']
80541


In [13]:
# save word count to csv file
with open('val_word_count.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in val_sort_wc:
        spamwriter.writerow([word[0], word[1]])

In [14]:
# get part of speech tag
val_pos = nltk.pos_tag(list(val_counts.keys()))
print(val_pos)



In [15]:
# find verb, noun, adjective, spatial prepersation
spatial_prep_word = [
                "above", "across", "around","behind", "beside", "down", 
                "from", "in", "inside", "into", "near", "next", "on", "out", 
                "outside", "over", "through", "under", "up", "with"
              ]
val_verb = {}
val_noun = {}
val_adjective = {}
val_spatial_prep = {}

for word in val_pos:
    if word[0] in spatial_prep_word:
        val_spatial_prep[word[0]] = val_counts[word[0]]
        continue
    if word[1].startswith("J"):
        val_adjective[word[0]] = val_counts[word[0]]
        continue
    if word[1].startswith("NN"):
        val_noun[word[0]] = val_counts[word[0]]
        continue
    if word[1].startswith("VB"):
        val_verb[word[0]] = val_counts[word[0]]
        continue
        

In [16]:
# sort the part of speech word count dictionary
val_verb_sort_wc = sorted(val_verb.items(), key=lambda x: x[1], reverse=True)
val_adjective_sort_wc = sorted(val_adjective.items(), key=lambda x: x[1], reverse=True)
val_noun_sort_wc = sorted(val_noun.items(), key=lambda x: x[1], reverse=True)
val_spatial_prep_sort_wc = sorted(val_spatial_prep.items(), key=lambda x: x[1], reverse=True)
val_spatial_prep_sort_wc

[('in', 12061),
 ('on', 8357),
 ('with', 750),
 ('from', 605),
 ('up', 568),
 ('out', 555),
 ('outside', 478),
 ('down', 371),
 ('inside', 268),
 ('near', 247),
 ('over', 247),
 ('behind', 235),
 ('under', 184),
 ('into', 171),
 ('next', 156),
 ('through', 146),
 ('around', 144),
 ('above', 82),
 ('across', 30),
 ('beside', 19)]

In [17]:
# save sorted part of speech result to csv

# verb
with open('val_word_count_verb.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in val_verb_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# noun
with open('val_word_count_noun.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in val_noun_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# adjective
with open('val_word_count_adj.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in val_adjective_sort_wc:
        spamwriter.writerow([word[0], word[1]])

# prepersation
with open('val_word_count_prep.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for word in val_spatial_prep_sort_wc:
        spamwriter.writerow([word[0], word[1]])