In [1]:
import os
import json
import re
import numpy as np

from PIL import Image
from collections import defaultdict

## Resize Images

In [25]:
def resize_image(image, size):
    """Resize ``image`` to ``size`` using LANCZOS resampling.

    Args:
        image: Object exposing a PIL-style ``resize(size, resample)`` method.
        size: Target size, forwarded unchanged to ``image.resize``.

    Returns:
        Whatever ``image.resize`` returns for the given size and filter.
    """
    resample_filter = Image.Resampling.LANCZOS
    resized = image.resize(size, resample_filter)
    return resized

def resize_images(input_dir, output_dir, size):
    """Resize every image found one level below ``input_dir``.

    Each sub-directory of ``input_dir`` is mirrored under ``output_dir`` and
    every file inside it is opened as an image, resized with ``resize_image``
    and written under the same file name. Entries of ``input_dir`` that are
    not directories are ignored.

    Files that cannot be read or decoded are skipped (best effort), but each
    skip is reported instead of being silently swallowed.

    Args:
        input_dir: Directory whose sub-directories contain the source images.
        output_dir: Root directory for the resized copies; created on demand.
        size: Target size forwarded to ``resize_image``.
    """
    for idir in os.scandir(input_dir):
        if not idir.is_dir():
            continue

        target_dir = os.path.join(output_dir, idir.name)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(target_dir, exist_ok=True)

        images = os.listdir(idir.path)
        n_images = len(images)
        n_skipped = 0
        for iimage, image in enumerate(images):
            source_path = os.path.join(idir.path, image)
            try:
                # Let PIL open the file read-only; the previous 'r+b' mode
                # needed write permission and made read-only images fail.
                with Image.open(source_path) as img:
                    # Capture the format before resizing: images produced by
                    # resize() carry format=None, so reading it afterwards
                    # would leave the output format to be guessed from the
                    # file extension (and fail for unknown extensions).
                    source_format = img.format
                    resized = resize_image(img, size)
                    resized.save(os.path.join(target_dir, image), source_format)
            except (IOError, SyntaxError) as e:
                n_skipped += 1
                print("Skipped '{}': {}".format(source_path, e))

            if (iimage + 1) % 1000 == 0:
                print("[{}/{}] Resized the images and saved into '{}'."
                      .format(iimage + 1, n_images, target_dir))

        if n_skipped:
            print("Skipped {} of {} files in '{}'."
                  .format(n_skipped, n_images, idir.path))

In [26]:
# Root of the raw images; resize_images processes each sub-directory of it
# (the log below shows 'train2014').
input_dir = r"C:\Users\hieunc15\Documents\VisualQA\data\raw\images"
# Root for the resized copies; sub-directory names are mirrored underneath.
# NOTE(review): absolute, machine-specific paths - consider a configurable base directory.
output_dir = r"C:\Users\hieunc15\Documents\VisualQA\data\processed\images"
# Target size handed through to Image.resize (PIL expects (width, height)).
image_size = [224, 224]

# Long-running: walks every image in every sub-directory of input_dir.
resize_images(input_dir, output_dir, image_size)

[1000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[2000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[3000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[4000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[5000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[6000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[7000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[8000/82783] Resized the images and saved into 'C:\Users\hieunc15\Documents\VisualQA\data\processed\images/train2014'.
[9000/82783] Resized the images and saved into '

## Make Vocab for QA

In [27]:
def make_vocab_questions(input_dir,
                         output_file=r'C:\Users\hieunc15\Documents\VisualQA\data\processed\vocab_questions.txt'):
    """Build the question vocabulary and save it as a text file.

    Every file in ``input_dir`` is loaded as JSON and its ``'questions'`` list
    is read. Each question string is lower-cased and split on runs of
    non-word characters; the separators themselves (e.g. ``'?'``) are kept as
    tokens, whitespace-only pieces are dropped. The vocabulary is written one
    word per line: ``'<pad>'``, ``'<unk>'``, then all tokens in sorted order.

    Args:
        input_dir: Directory containing the question JSON files.
        output_file: Destination of the vocabulary file. Defaults to the
            location this function historically wrote to.

    Prints the vocabulary size (without the two special tokens) and the
    longest question length in tokens (0 when there are no questions).
    """
    # NOTE(review): files are opened with the platform default encoding, as
    # elsewhere in this notebook; if an explicit encoding is introduced it
    # must be applied to the vocab writer and its reader together.
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
    vocab_set = set()
    question_length = []
    for dataset in os.listdir(input_dir):
        with open(os.path.join(input_dir, dataset)) as f:
            questions = json.load(f)['questions']
        for question in questions:
            words = SENTENCE_SPLIT_REGEX.split(question['question'].lower())
            words = [w.strip() for w in words if len(w.strip()) > 0]
            vocab_set.update(words)
            question_length.append(len(words))

    # Special tokens come first so that '<pad>' is index 0 and '<unk>' index 1.
    vocab_list = ['<pad>', '<unk>'] + sorted(vocab_set)

    with open(output_file, 'w') as f:
        f.writelines([w + '\n' for w in vocab_list])

    print('Make vocabulary for questions')
    print('The number of total words of questions: %d' % len(vocab_set))
    # max(..., default=0) keeps an empty dataset from raising.
    print('Maximum length of question: %d' % max(question_length, default=0))


def make_vocab_answers(input_dir, n_answers,
                       output_file=r'C:\Users\hieunc15\Documents\VisualQA\data\processed\vocab_answers.txt'):
    """Build the vocabulary of the most frequent answers and save it.

    Every file in ``input_dir`` is loaded as JSON and the ``'answers'`` of
    each entry in its ``'annotations'`` list are counted. Answers containing
    any character that is neither a word character nor whitespace are
    ignored. The file written has ``'<unk>'`` on the first line followed by
    the most frequent answers, most frequent first.

    Args:
        input_dir: Directory containing the annotation JSON files.
        n_answers: Vocabulary size including ``'<unk>'``. Values below 1 are
            treated as 1, i.e. only ``'<unk>'`` is written.
        output_file: Destination of the vocabulary file. Defaults to the
            location this function historically wrote to.
    """
    # NOTE(review): files are opened with the platform default encoding, as
    # elsewhere in this notebook; change writer and reader together if an
    # explicit encoding is introduced.
    answers = defaultdict(int)
    for dataset in os.listdir(input_dir):
        with open(os.path.join(input_dir, dataset)) as f:
            annotations = json.load(f)['annotations']
        for annotation in annotations:
            for answer in annotation['answers']:
                word = answer['answer']
                if re.search(r"[^\w\s]", word):
                    continue
                answers[word] += 1

    # Stable sort: answers with equal counts keep first-seen order.
    answers = sorted(answers, key=answers.get, reverse=True)
    # Cannot trigger in practice: '<' and '>' are rejected by the filter above.
    assert '<unk>' not in answers
    # One slot is reserved for '<unk>'. Clamp at 0 so that n_answers <= 0 does
    # not turn into a negative slice bound, which would keep nearly everything.
    top_answers = ['<unk>'] + answers[:max(n_answers - 1, 0)]

    with open(output_file, 'w') as f:
        f.writelines([w + '\n' for w in top_answers])

    print('Make vocabulary for answers')
    print('The number of total words of answers: %d' % len(answers))
    # Report what was actually written, which may be fewer than requested.
    print('Keep top %d answers into vocab' % len(top_answers))

In [28]:
# Folders with the raw question / annotation JSON files; every file found in
# each folder is read by the corresponding make_vocab_* function.
# NOTE(review): absolute, machine-specific paths - consider a configurable base directory.
question_path = r"C:\Users\hieunc15\Documents\VisualQA\data\raw\questions"
annotate_path = r"C:\Users\hieunc15\Documents\VisualQA\data\raw\annotations"

n_answers = 1000    # number of answers to be kept in vocab (this count includes '<unk>')
make_vocab_questions(question_path)
make_vocab_answers(annotate_path, n_answers)

Make vocabulary for questions
The number of total words of questions: 15334
Maximum length of question: 26
Make vocabulary for answers
The number of total words of answers: 181102
Keep top 1000 answers into vocab


## Build VQA input

In [6]:
# Splits on runs of non-word characters; the capture group keeps the
# separators themselves in the split result.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case ``sentence`` and break it into word and separator tokens.

    Each piece produced by the split is stripped of surrounding whitespace;
    pieces that end up empty (pure whitespace separators, empty edges) are
    discarded.
    """
    stripped_pieces = (
        piece.strip() for piece in SENTENCE_SPLIT_REGEX.split(sentence.lower())
    )
    return [piece for piece in stripped_pieces if piece]


def load_str_list(fname):
    """Return the lines of ``fname`` with surrounding whitespace removed.

    Blank lines are kept as empty strings so positions match line numbers.
    """
    with open(fname) as handle:
        return [raw_line.strip() for raw_line in handle]


class VocabDict:
    """Word <-> index lookup backed by a vocabulary text file.

    A word's index is its line position in the file. If the file contains
    ``'<unk>'``, unknown words map to its index; otherwise looking up an
    unknown word raises ``ValueError``.
    """

    def __init__(self, vocab_file):
        """Load ``vocab_file`` (one word per line) and build the lookups."""
        self.word_list = load_str_list(vocab_file)
        self.vocab_size = len(self.word_list)
        self.word2idx_dict = {
            word: position for position, word in enumerate(self.word_list)
        }
        # None when the vocabulary has no '<unk>' entry.
        self.unk2idx = self.word2idx_dict.get('<unk>')

    def idx2word(self, n_w):
        """Return the word stored at index ``n_w``."""
        return self.word_list[n_w]

    def word2idx(self, w):
        """Return the index of ``w``, falling back to the '<unk>' index.

        Raises:
            ValueError: if ``w`` is unknown and the vocabulary has no '<unk>'.
        """
        index = self.word2idx_dict.get(w, self.unk2idx)
        if index is None:
            raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w)
        return index

    def tokenize_and_index(self, sentence):
        """Tokenize ``sentence`` and map every token to its index."""
        return list(map(self.word2idx, tokenize(sentence)))

In [7]:
def extract_answers(q_answers, valid_answer_set):
    """Collect the answer strings of one question.

    Args:
        q_answers: Iterable of mappings, each with an ``"answer"`` entry.
        valid_answer_set: Container of answers considered valid.

    Returns:
        ``(all_answers, valid_answers)``: every answer string in input order,
        and the subset (duplicates kept) found in ``valid_answer_set``.
    """
    all_answers = []
    valid_answers = []
    for entry in q_answers:
        text = entry["answer"]
        all_answers.append(text)
        if text in valid_answer_set:
            valid_answers.append(text)
    return all_answers, valid_answers


def vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, image_set):
    """Assemble one record per question for the given image set.

    Args:
        image_dir: ``%s`` template for the image directory; filled with the
            image set name after removing ``'-dev'``.
        annotation_file: ``%s`` template for the annotation JSON path; only
            read for ``'train2014'`` and ``'val2014'``.
        question_file: ``%s`` template for the question JSON path.
        valid_answer_set: Answers accepted as valid targets.
        image_set: Name of the split, e.g. ``'train2014'``.

    Returns:
        A list of dicts with ``image_name``, ``image_path``, ``question_id``,
        ``question_str`` and ``question_tokens``. For the two annotated
        splits each dict also has ``all_answers`` and ``valid_answers``;
        ``valid_answers`` is ``['<unk>']`` when no answer is in
        ``valid_answer_set``.
    """
    print('building vqa %s dataset' % image_set)

    # Only these two splits are treated as having annotations.
    load_answer = image_set in ('train2014', 'val2014')
    if load_answer:
        with open(annotation_file % image_set) as ann_fp:
            loaded_annotations = json.load(ann_fp)['annotations']
            qid2ann_dict = {
                entry['question_id']: entry for entry in loaded_annotations
            }
    with open(question_file % image_set) as question_fp:
        questions = json.load(question_fp)['questions']

    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'
    n_questions = len(questions)

    dataset = []
    unk_ans_count = 0
    for position, q in enumerate(questions, start=1):
        if position % 10000 == 0:
            print('processing %d / %d' % (position, n_questions))

        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        question_str = q['question']

        iminfo = {
            'image_name': image_name,
            'image_path': os.path.join(abs_image_dir, image_name + '.jpg'),
            'question_id': question_id,
            'question_str': question_str,
            'question_tokens': tokenize(question_str),
        }

        if load_answer:
            all_answers, valid_answers = extract_answers(
                qid2ann_dict[question_id]['answers'], valid_answer_set)
            if not valid_answers:
                # No usable target for this question: fall back to '<unk>'.
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers

        dataset.append(iminfo)

    print('total %d out of %d answers are <unk>' % (unk_ans_count, n_questions))
    return dataset

In [13]:
# '%s' in each template is filled with the image set name by vqa_processing.
# NOTE(review): absolute, machine-specific paths - consider a configurable base directory.
image_dir = "C:/Users/hieunc15/Documents/VisualQA/data/processed/images/%s/"
annotate_file = "C:/Users/hieunc15/Documents/VisualQA/data/raw/annotations/v2_mscoco_%s_annotations.json"
question_file = "C:/Users/hieunc15/Documents/VisualQA/data/raw/questions/v2_OpenEnded_mscoco_%s_questions.json"

vocab_answer_file = r"C:\Users\hieunc15\Documents\VisualQA\data\processed\vocab_answers.txt"
answer_dict = VocabDict(vocab_answer_file)
valid_answer_set = set(answer_dict.word_list)

# NOTE(review): this rebinds the `output_dir` name used by the image-resizing cell.
output_dir = r"C:\Users\hieunc15\Documents\VisualQA\data\processed\vqa_input"
# Create the target folder before the long processing passes, so a missing
# or unwritable directory fails immediately rather than after all the work.
os.makedirs(output_dir, exist_ok=True)

train = vqa_processing(image_dir, annotate_file, question_file, valid_answer_set, 'train2014')
valid = vqa_processing(image_dir, annotate_file, question_file, valid_answer_set, 'val2014')

# The records are dicts, so these are object arrays that np.save pickles;
# reading them back requires np.load(..., allow_pickle=True).
np.save(os.path.join(output_dir, 'train.npy'), np.array(train))
np.save(os.path.join(output_dir, 'valid.npy'), np.array(valid))
np.save(os.path.join(output_dir, 'train_valid.npy'), np.array(train + valid))

building vqa train2014 dataset
processing 10000 / 443757
processing 20000 / 443757
processing 30000 / 443757
processing 40000 / 443757
processing 50000 / 443757
processing 60000 / 443757
processing 70000 / 443757
processing 80000 / 443757
processing 90000 / 443757
processing 100000 / 443757
processing 110000 / 443757
processing 120000 / 443757
processing 130000 / 443757
processing 140000 / 443757
processing 150000 / 443757
processing 160000 / 443757
processing 170000 / 443757
processing 180000 / 443757
processing 190000 / 443757
processing 200000 / 443757
processing 210000 / 443757
processing 220000 / 443757
processing 230000 / 443757
processing 240000 / 443757
processing 250000 / 443757
processing 260000 / 443757
processing 270000 / 443757
processing 280000 / 443757
processing 290000 / 443757
processing 300000 / 443757
processing 310000 / 443757
processing 320000 / 443757
processing 330000 / 443757
processing 340000 / 443757
processing 350000 / 443757
processing 360000 / 443757
proces