In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
from collections import Counter

import json
import os
import re
import h5py
import numpy as np


from copy import deepcopy
from pprint import pprint
import bisect
from tqdm import *
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg


import programs
from modify_program import *
from utils import *

[Errno 2] No such file or directory: 'data/question_vocab.json'


In [2]:
# Load the balanced val/train question files and merge them into one flat list.
# `dataroot` and `update_program` are not defined in this notebook; presumably
# they come from the star-imports in the first cell -- TODO confirm.
print('Loading data')
questions = []
for split in ['val_balanced', 'train_balanced']:
    question_path = os.path.join(dataroot, "questions1.2/%s_questions.json" % split)
    # Context manager: the handle is closed as soon as the JSON is parsed,
    # instead of lingering until garbage collection.
    with open(question_path) as question_file:
        dataset = json.load(question_file)
    # The dict keys are dropped below, so copy each question id onto its record.
    for k, v in dataset.items():
        v['qid'] = k
    dataset = dataset.values()
    update_program(dataset)
    questions.extend(dataset)
print('Num of questions: ', len(questions))

Loading data
Num of questions:  1075062


In [3]:
# Collect every program, operation name and argument across all questions.
all_programs = [q['semantic'] for q in questions]
all_operations = [op['operation'] for program in all_programs for op in program]
all_arguments = [arg for program in all_programs for op in program for arg in op['argument']]


# Generators avoid building throw-away lists just to take a maximum.
print('max len of args: ', max(len(op['argument']) for program in all_programs for op in program))
print('max len of program: ', max(len(program) for program in all_programs))

operations_count = Counter(all_operations)
arguments_count = Counter(all_arguments)

# Special tokens come first; the remaining entries are the distinct values in
# sorted order (sorted() already returns a list).
operation_vocab = ['UNK', 'START', 'END'] + sorted(operations_count)
argument_vocab = ['UNK', 'START', 'END'] + sorted(arguments_count)

# Write through context managers so each file is flushed and closed before
# anything else tries to read it.
with open('data/operation_vocab.json', 'w') as vocab_file:
    json.dump(operation_vocab, vocab_file, indent=2)
with open('data/argument_vocab.json', 'w') as vocab_file:
    json.dump(argument_vocab, vocab_file, indent=2)
print('operation vocab: ', len(operation_vocab))
print('argument vocab: ', len(argument_vocab))

# I found some strange arguments in questions like 'What is she doing?',
# these arguments are related to the last operation 'query'.
# example qids: 00272272
# I just leave them alone (the list below is kept for reference only; nothing
# in this cell reads it).
strange_args = ["15", "16", "18", "24", "25", "27", "31", "None"]

max len of args:  2
max len of program:  17
operation vocab:  15
argument vocab:  2676


In [4]:
# Build the question-word vocabulary and record the longest tokenized question.
# `tokenize` is not defined in this notebook; presumably it comes from
# `from utils import *` -- TODO confirm.
all_questions = [q['question'] for q in questions]
words_count = Counter()
max_len = 0
for sentence in all_questions:
    words = tokenize(sentence)
    max_len = max(max_len, len(words))
    words_count.update(words)
print('max len of question:', max_len)
# Special tokens first, then every distinct word in sorted order.
question_vocab = ['UNK', 'START', 'END'] + sorted(words_count)
print('question vocab: ', len(question_vocab))
# Context manager guarantees the vocabulary is flushed and the handle closed.
with open('data/question_vocab.json', 'w') as vocab_file:
    json.dump(question_vocab, vocab_file, indent=2)

max len of question: 28
question vocab:  2857


In [5]:
def create_glove_embedding_init(idx2word, glove_file='data/glove.6B.300d.txt'):
    """Build an initial embedding matrix for a vocabulary from a GloVe text file.

    Row ``i`` of the result belongs to ``idx2word[i]``. Words that appear in
    ``glove_file`` get the vector stored there; every other word keeps a
    random row drawn from uniform(-1, 1) via NumPy's global RNG and is
    reported with an ``Unseen:`` line on stdout.

    Args:
        idx2word: sequence of vocabulary words (hashable, normally str).
        glove_file: path to a UTF-8 text file with one entry per line: the
            word followed by its space-separated float components.

    Returns:
        ``np.float32`` array of shape ``(len(idx2word), emb_dim)``, where
        ``emb_dim`` is inferred from the first line of ``glove_file``.

    Raises:
        ValueError: if ``glove_file`` is empty or starts with a blank line,
            so the embedding dimension cannot be inferred.
    """
    # Only vocabulary words are ever looked up, so vectors for all other
    # entries in the (large) GloVe file are never materialised.
    wanted = set(idx2word)
    word2emb = {}
    # Explicit encoding: GloVe contains non-ASCII tokens, and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(glove_file, 'r', encoding='utf-8') as f:
        first_entry = f.readline()
        if not first_entry.strip():
            raise ValueError('GloVe file %r is empty or starts with a blank line' % glove_file)
        emb_dim = len(first_entry.split(' ')) - 1
        print('embedding dim is %d' % emb_dim)
        # Rewind so the first entry is parsed by the same loop as the rest.
        f.seek(0)
        # Stream line by line instead of holding the whole file in memory.
        for entry in f:
            vals = entry.split(' ')
            word = vals[0]
            if word not in wanted:
                continue
            # float() tolerates the trailing newline on the last component.
            # A later duplicate entry overwrites an earlier one.
            word2emb[word] = np.array(list(map(float, vals[1:])))

    weights = np.random.uniform(-1, 1, (len(idx2word), emb_dim)).astype(np.float32)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            print('Unseen: ', word)
            continue
        weights[idx] = word2emb[word]
    return weights

In [6]:
# GloVe-based initial embeddings for the question vocabulary, rescaled so that
# every row has unit L2 norm, then written to disk with np.save.
init_weights = create_glove_embedding_init(question_vocab)
row_norms = np.sqrt((init_weights ** 2).sum(axis=1, keepdims=True))
# Divide in place so the array object and its dtype stay the same.
np.divide(init_weights, row_norms, out=init_weights)
np.save('data/question_word_embedding_glove_init', init_weights)

embedding dim is 300
Unseen:  UNK
Unseen:  START
Unseen:  END
Unseen:  asparaguss
Unseen:  avocadoes
Unseen:  bronwy
Unseen:  burritoes
Unseen:  cactuss
Unseen:  celeries
Unseen:  drainer
Unseen:  drainers
Unseen:  elmoes
Unseen:  hippoes
Unseen:  legoes
Unseen:  logoes
Unseen:  meatballss
Unseen:  mooses
Unseen:  mousess
Unseen:  octopodes
Unseen:  ottomen
Unseen:  pianoes
Unseen:  plier
Unseen:  puppys
Unseen:  snowpants
Unseen:  tacoes
Unseen:  tshirt


In [7]:
# Build the answer vocabulary: every distinct ground-truth answer in sorted
# order, preceded by a single 'UNK' entry.
all_answers = [q['answer'] for q in questions]
answers_count = Counter(all_answers)
answer_vocab = ['UNK'] + sorted(answers_count)  # 'UNK' -> 'I do not know the answer'
# Context manager guarantees the vocabulary is flushed and the handle closed.
with open('data/answer_vocab.json', 'w') as vocab_file:
    json.dump(answer_vocab, vocab_file, indent=2)
print('answer vocab: ', len(answer_vocab))

answer vocab:  1843


In [8]:
# Sanity check on 'choose' programs: ideally the ground-truth answer is one of
# the options offered by the final 'choose' operation, but the dataset has
# exceptions. This cell tallies every (answer, options) pair where the answer
# is not literally among the options and pretty-prints the counts.

choose_arguments = []
for q in questions:
    program = q['semantic']
    # Skip programs in which no operation name mentions 'choose'.
    if not any('choose' in x['operation'] for x in program):
        continue
    last_op = program[-1]
    assert 'choose' in last_op['operation'], program
    argument = last_op['argument']
    # e.g. "choose healthier": the final operation carries no arguments of its
    # own, so take the first argument of each operation it depends on.
    if len(argument) < 1:
        argument = [program[dep]['argument'][0] for dep in last_op['dependencies']]
    # With three or more arguments, only the second and third are compared.
    if len(argument) >= 3:
        argument = argument[1:3]
    if q['answer'] not in argument:
        choose_arguments.append((q['answer'], tuple(argument)))

pprint(Counter(choose_arguments))

Counter({('left', ('to the left of', 'to the right of')): 21024,
         ('right', ('to the right of', 'to the left of')): 20931,
         ('front', ('in front of', 'behind')): 489,
         ('wooden', ('wood', 'metal')): 220,
         ('metallic', ('metal', 'wood')): 134,
         ('metallic', ('metal', 'porcelain')): 11,
         ('ceramic', ('porcelain', 'metal')): 5,
         ('ceramic', ('porcelain', 'wood')): 2,
         ('front', ('standing in front of', 'standing behind')): 1})
