In [None]:
import os
import glob
import matplotlib.pyplot as plt
import json
import re
import numpy as np

from collections import defaultdict
from PIL import Image

# Kaggle input dataset that the training cells later `cd` into.
repo_dir = "/kaggle/input/vqa-baseline"
# Root folders of the train / validation inputs; resize_images looks for the
# image sub-folders ("train2014" / "val2014") underneath them.
train_dir = "/kaggle/input/vqav2-train"
valid_dir = "/kaggle/input/vqav2-val"

# Preprocessing Phase

## Resize images

In [None]:
def resize_image(image, size):
    """Return ``image`` resized to ``size`` using Lanczos resampling.

    ``image`` must expose a PIL-style ``resize(size, resample)`` method;
    ``size`` is forwarded to it unchanged.
    """
    lanczos = Image.Resampling.LANCZOS
    resized = image.resize(size, lanczos)
    return resized

def resize_images(input_dir, output_dir, suffix, size):
    """Resize every image of one split and write the copies under ``output_dir``.

    Only the entry of ``input_dir`` whose name equals ``suffix`` is processed;
    the names of all other entries are printed and skipped. Images are read
    from ``<input_dir>/<suffix>/<suffix>/`` and written, under the same file
    name, to ``<output_dir>/<suffix>/``.

    Args:
        input_dir: directory that contains the ``suffix`` folder.
        output_dir: root directory for the resized copies (created if needed).
        suffix: split folder name, e.g. ``"train2014"``.
        size: target size forwarded to ``resize_image``.
    """
    for idir in os.scandir(input_dir):
        if idir.name != suffix:
            print(idir.name)
            continue

        target_dir = os.path.join(output_dir, idir.name)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(target_dir, exist_ok=True)

        image_path = os.path.join(idir.path, suffix)
        images = os.listdir(image_path)
        n_images = len(images)
        for iimage, image in enumerate(images):
            # Close each source file as soon as it has been resized; leaving
            # the handles to the garbage collector leaks file descriptors
            # when a split holds tens of thousands of images.
            with Image.open(os.path.join(image_path, image)) as img:
                resized = resize_image(img, size)
            # The resized copy has no `format` of its own, so the output
            # format is inferred by Pillow from the file extension.
            resized.save(os.path.join(target_dir, image))

            if (iimage + 1) % 1000 == 0:
                print("[{}/{}] Resized the images and saved into '{}'."
                      .format(iimage+1, n_images, target_dir))

In [None]:
# Resize the train and validation images to 224x224 into a shared output root.
for split_dir, split_name in ((train_dir, "train2014"), (valid_dir, "val2014")):
    resize_images(split_dir, "/kaggle/working/Images", split_name, [224, 224])

## Build vocabularies for questions and answers

In [None]:
def make_vocab_questions(input_dirs, output_path='/kaggle/working/vocab_questions.txt'):
    """Build the question vocabulary and write it to a text file, one word per line.

    The first directory entry of every directory in ``input_dirs`` is read as
    a JSON file with a top-level ``'questions'`` list. Each question string is
    lower-cased and split on runs of non-word characters; the separators are
    kept as tokens once stripped of whitespace. The file starts with
    ``'<pad>'`` and ``'<unk>'`` followed by all distinct tokens in sorted order.

    Args:
        input_dirs: directories that each hold one questions JSON file.
        output_path: destination of the vocabulary file. Defaults to the
            location used by the rest of the notebook.
    """
    sentence_split_regex = re.compile(r'(\W+)')
    vocab_set = set()
    question_length = []
    datasets = [list(os.scandir(input_dir))[0] for input_dir in input_dirs]
    for dataset in datasets:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(dataset.path, encoding='utf-8') as f:
            questions = json.load(f)['questions']
        for question in questions:
            pieces = sentence_split_regex.split(question['question'].lower())
            words = [piece.strip() for piece in pieces if piece.strip()]
            vocab_set.update(words)
            question_length.append(len(words))

    # '<pad>' and '<unk>' take the first two lines of the file.
    vocab_list = ['<pad>', '<unk>'] + sorted(vocab_set)

    with open(output_path, 'w') as f:
        f.writelines(w + '\n' for w in vocab_list)

    print('Make vocabulary for questions')
    print('The number of total words of questions: %d' % len(vocab_set))
    # default=0 keeps the summary from crashing when no question was found.
    print('Maximum length of question: %d' % max(question_length, default=0))


def make_vocab_answers(input_dirs, n_answers, output_path='/kaggle/working/vocab_answers.txt'):
    """Write the most frequent answers to a text file, one answer per line.

    The first directory entry of every directory in ``input_dirs`` is read as
    a JSON file with a top-level ``'annotations'`` list; every item of each
    annotation's ``'answers'`` list contributes its ``'answer'`` string.
    Answers containing any character that is neither a word character nor
    whitespace are ignored. The file starts with ``'<unk>'`` followed by the
    ``n_answers - 1`` most frequent remaining answers (ties keep first-seen
    order).

    Args:
        input_dirs: directories that each hold one annotations JSON file.
        n_answers: size of the answer vocabulary including ``'<unk>'``.
        output_path: destination of the vocabulary file. Defaults to the
            location used by the rest of the notebook.
    """
    punctuation_regex = re.compile(r"[^\w\s]")
    answers = defaultdict(int)
    datasets = [list(os.scandir(input_dir))[0] for input_dir in input_dirs]
    for dataset in datasets:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(dataset.path, encoding='utf-8') as f:
            annotations = json.load(f)['annotations']
        for annotation in annotations:
            for answer in annotation['answers']:
                word = answer['answer']
                if punctuation_regex.search(word):
                    continue
                answers[word] += 1

    ranked_answers = sorted(answers, key=answers.get, reverse=True)
    # '<unk>' contains '<' and '>', so the punctuation filter already excludes it.
    assert '<unk>' not in ranked_answers
    # One slot is reserved for '<unk>'. Clamp at zero: a negative stop index
    # (n_answers == 0) would otherwise slice off only the last answer and
    # keep almost the whole list.
    top_answers = ['<unk>'] + ranked_answers[:max(n_answers - 1, 0)]

    with open(output_path, 'w') as f:
        f.writelines(w + '\n' for w in top_answers)

    print('Make vocabulary for answers')
    print('The number of total words of answers: %d' % len(ranked_answers))
    # Report what was actually written; it can be fewer than n_answers.
    print('Keep top %d answers into vocab' % len(top_answers))

In [None]:
# Question vocabulary: built from the train and validation question folders.
question_dirs = [
    '/kaggle/input/vqav2-train/v2_Questions_Train_mscoco',
    '/kaggle/input/vqav2-val/v2_Questions_Val_mscoco',
]
make_vocab_questions(question_dirs)

# Answer vocabulary: built from the train and validation annotation folders.
annotation_dirs = [
    '/kaggle/input/vqav2-train/v2_Annotations_Train_mscoco',
    '/kaggle/input/vqav2-val/v2_Annotations_Val_mscoco',
]
make_vocab_answers(annotation_dirs, n_answers=1000)

## Build VQA input

In [None]:
# Splits on runs of non-word characters; the capture group keeps the
# separators in the result so punctuation survives as tokens.
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def tokenize(sentence):
    """Lower-case ``sentence`` and split it into word and punctuation tokens.

    Pieces that are empty after stripping whitespace are dropped.
    """
    stripped = (piece.strip() for piece in SENTENCE_SPLIT_REGEX.split(sentence.lower()))
    return [token for token in stripped if token]


def load_str_list(fname):
    """Read ``fname`` and return its lines with surrounding whitespace stripped."""
    with open(fname) as handle:
        return [raw_line.strip() for raw_line in handle]


class VocabDict:
    """Word <-> index lookup built from a one-word-per-line vocabulary file."""

    def __init__(self, vocab_file):
        """Load ``vocab_file``; a word's position in the file is its index."""
        self.word_list = load_str_list(vocab_file)
        self.word2idx_dict = {word: position for position, word in enumerate(self.word_list)}
        self.vocab_size = len(self.word_list)
        # Fallback index for unknown words; None when the file has no '<unk>'.
        self.unk2idx = self.word2idx_dict.get('<unk>')

    def idx2word(self, n_w):
        """Return the word stored at index ``n_w``."""
        return self.word_list[n_w]

    def word2idx(self, w):
        """Return the index of ``w``, or the '<unk>' index for unknown words.

        Raises:
            ValueError: if ``w`` is unknown and the vocabulary has no '<unk>'.
        """
        index = self.word2idx_dict.get(w, self.unk2idx)
        if index is None:
            raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w)
        return index

    def tokenize_and_index(self, sentence):
        """Tokenize ``sentence`` and map every token to its index."""
        return list(map(self.word2idx, tokenize(sentence)))

In [None]:
def extract_answers(q_answers, valid_answer_set):
    """Collect the answer strings of one question.

    Returns a pair ``(all_answers, valid_answers)``: every ``"answer"`` value
    of ``q_answers`` in order, and the subset of those found in
    ``valid_answer_set`` (duplicates and order preserved).
    """
    all_answers = []
    valid_answers = []
    for entry in q_answers:
        text = entry["answer"]
        all_answers.append(text)
        if text in valid_answer_set:
            valid_answers.append(text)
    return all_answers, valid_answers


def vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, image_set):
    """Build one record per question of ``image_set``.

    Args:
        image_dir: ``%``-style template; the COCO set name (``image_set``
            with any ``'-dev'`` removed) is substituted into it.
        annotation_file: ``str.format`` template of the annotations JSON.
        question_file: ``str.format`` template of the questions JSON. Both
            templates receive, in order, ``'vqav2-<split>'``, the capitalised
            split and ``image_set``, where ``<split>`` is ``image_set``
            without its last four characters.
        valid_answer_set: answers accepted as labels.
        image_set: split name; annotations are loaded only for
            ``'train2014'`` and ``'val2014'``.

    Returns:
        A list of dicts with the keys ``image_name``, ``image_path``,
        ``question_id``, ``question_str`` and ``question_tokens``. When
        annotations were loaded, each dict also has ``all_answers`` and
        ``valid_answers``; the latter is ``['<unk>']`` if none of the
        question's answers is in ``valid_answer_set``.
    """
    print('building vqa %s dataset' % image_set)
    split = image_set[:-4]
    template_args = (f'vqav2-{split}', split.capitalize(), image_set)

    load_answer = image_set in ['train2014', 'val2014']
    if load_answer:
        with open(annotation_file.format(*template_args)) as annotation_handle:
            qid2ann_dict = {ann['question_id']: ann
                            for ann in json.load(annotation_handle)['annotations']}
    with open(question_file.format(*template_args)) as question_handle:
        questions = json.load(question_handle)['questions']

    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'

    n_questions = len(questions)
    dataset = []
    unk_ans_count = 0
    for seen, q in enumerate(questions, start=1):
        if seen % 10000 == 0:
            print('processing %d / %d' % (seen, n_questions))

        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        question_str = q['question']

        iminfo = {
            'image_name': image_name,
            'image_path': os.path.join(abs_image_dir, image_name + '.jpg'),
            'question_id': question_id,
            'question_str': question_str,
            'question_tokens': tokenize(question_str),
        }

        if load_answer:
            all_answers, valid_answers = extract_answers(
                qid2ann_dict[question_id]['answers'], valid_answer_set)
            if not valid_answers:
                # No usable label for this question: fall back to '<unk>'.
                unk_ans_count += 1
                valid_answers = ['<unk>']
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers

        dataset.append(iminfo)

    print('total %d out of %d answers are <unk>' % (unk_ans_count, n_questions))
    return dataset

In [None]:
# Template of the resized-image folder; vqa_processing substitutes the COCO
# set name (e.g. 'train2014') for '%s'.
image_dir = '/kaggle/working/Images/%s/'

# Path templates filled by vqa_processing through str.format with, in order:
# the dataset folder ('vqav2-train'), the capitalised split ('Train') and the
# split name ('train2014').
annotation_file = '/kaggle/input/{}/v2_Annotations_{}_mscoco/v2_mscoco_{}_annotations.json'
question_file = '/kaggle/input/{}/v2_Questions_{}_mscoco/v2_OpenEnded_mscoco_{}_questions.json'

# The answer vocabulary file decides which answers count as valid labels.
vocab_answer_file = '/kaggle/working/vocab_answers.txt'
answer_dict = VocabDict(vocab_answer_file)
valid_answer_set = set(answer_dict.word_list)

train = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'train2014')
valid = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'val2014')
# Test split is currently disabled.
# test = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'test2015')

# Each dataset is a list of dicts, saved as a NumPy array.
# NOTE(review): such an array has object dtype, so loading it presumably needs
# np.load(..., allow_pickle=True) - confirm against the training code.
np.save('/kaggle/working/train.npy', np.array(train))
np.save('/kaggle/working/valid.npy', np.array(valid))
# NOTE(review): `args` is not defined anywhere in the visible cells; adjust the
# path before re-enabling the line below.
# np.save(args.output_dir+'/test.npy', np.array(test))

# Training Phase

In [None]:
# Change the working directory to repo_dir (IPython expands {repo_dir}) and
# list its contents.
%cd {repo_dir}
!ls

In [None]:
# Run the training script with /kaggle/working/ as input, log and model directory.
# NOTE(review): train.py presumably lives in repo_dir and reads the .npy and
# vocabulary files written by the preprocessing cells - confirm.
!python train.py --input /kaggle/working/ --log_dir /kaggle/working/ --model_dir /kaggle/working/