# Assignment3-Part2

In this part we will train a VQA model.

In previous assignments you have learnt to write models.
**Please fill in everything in VQA/vqa_model.py and run the following scripts.**

# Setup Code

In [None]:
# Load IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. VQA/vqa_model.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/drive so the assignment files are reachable
# from this Colab runtime; force_remount=True is passed so a stale mount is
# replaced rather than reused.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import os

# TODO: Fill in the Google Drive path (relative to "My Drive") where you
# uploaded the assignment.
# Example: if you created a folder named 188 and put all the files under an
# Assignment3 folder inside it, use '188/Assignment3'.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = '188/Assignment3'
# NOTE(review): this is a relative path ('drive/My Drive/...'); it only
# resolves to the mount at /content/drive when the working directory is
# /content -- presumably Colab's default, confirm if you change directories.
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# Sanity check: list the folder contents to confirm the path is correct.
print(os.listdir(GOOGLE_DRIVE_PATH))

In [None]:
import sys
# Put the assignment folder on the import path so `VQA` can be imported as a
# package, and the VQA folder itself so its modules can also be imported by
# their bare names.
sys.path.append(GOOGLE_DRIVE_PATH)
sys.path.append(os.path.join(GOOGLE_DRIVE_PATH, "VQA"))

Now we are going to untar the images folder. Don't worry! This time the file is much smaller.

In [None]:
!tar -xvf "/content/drive/My Drive/188/Assignment3/VQA/images.tar.xz" -C "/content/drive/My Drive/188/Assignment3/VQA"

In [None]:
import sys, os, re, json, time

import pandas as pd
import pickle
import h5py

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from VQA import plotting
from PIL import Image
from tqdm import tqdm
from VQA.utils import imread, img_data_2_mini_batch, imgs2batch

from sklearn import metrics
from sklearn.metrics import accuracy_score

# from naive import EncDec
from VQA.vqa_model import EncDec
# from attention import EncDec as FuseAttEncDec
# from rnn_att import EncDec
from VQA.data_loader import VQADataSet

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
from torchvision import transforms


In [None]:
import nltk
nltk.download('punkt')

# Raw inputs used to build the dataset from scratch.
ann_path = os.path.join(GOOGLE_DRIVE_PATH, 'VQA/annotations.json')
q_path   = os.path.join(GOOGLE_DRIVE_PATH, 'VQA/questions.json')
i_path   = os.path.join(GOOGLE_DRIVE_PATH, 'VQA/images')

# Pickled copy of the built dataset, reused on later runs.
dataset_filename = os.path.join(GOOGLE_DRIVE_PATH, 'VQA', "data.pkl")
dataset = None

if not os.path.exists(dataset_filename):
    # No cache yet: build the dataset from the raw files, then cache it.
    dataset = VQADataSet(ann_path, q_path, i_path)
    with open(dataset_filename, 'wb') as cache_file:
        print("writing to " + dataset_filename)
        pickle.dump(dataset, cache_file)
else:
    # Cache present: load the previously pickled dataset.
    with open(dataset_filename, 'rb') as cache_file:
        print("reading from " + dataset_filename)
        dataset = pickle.load(cache_file)

In [None]:
# Hyperparameters and vocabulary sizes used to build and train the model.
embed_size        = 128   # first argument passed to EncDec
batch_size        = 50    # minibatch size for the train and test data loaders
ques_vocab_size   = len(dataset.vocab['question'])  # entries in the question vocabulary
ans_vocab_size    = len(dataset.vocab['answer'])    # entries in the answer vocabulary
n_epochs          = 10 # change to a smaller value if you run out of GPU
learning_rate     = 0.001  # learning rate handed to the Adam optimizer
# NOTE(review): momentum and attention_size are not referenced by any other
# visible cell of this notebook -- confirm whether they are still needed.
momentum          = 0.98
attention_size    = 512

# Quick sanity check of the two vocabulary sizes.
print(ques_vocab_size, ans_vocab_size)

In [None]:
def eval_model(data_loader, model, criterion, optimizer, batch_size, training=False,
              epoch = 0, total_loss_over_epochs=None, scores_over_epochs=None):
    """Run one pass of `model` over `data_loader`, optionally training it.

    Relies on the notebook-level `device`, `VQADataSet` and `metrics` names.

    Args:
        data_loader: iterable yielding `(idxs, v, q, a, q_len)` minibatches.
            If None, nothing is run and an empty result is returned.
        model: called as `model(v, q)`; its output is passed to `F.nll_loss`
            and arg-maxed over dim 1 to obtain predictions.
        criterion: accepted for interface compatibility but not used; the
            loss is always computed with `F.nll_loss`.
        optimizer: stepped once per batch when `training` is true; may be
            None, in which case no parameter update happens.
        batch_size: not used by this function.
        training: if true the model is put in train mode and gradients are
            enabled; otherwise eval mode with gradients disabled.
        epoch: only used in the progress message.
        total_loss_over_epochs: optional dict holding a list under the key
            'train_loss' or 'test_loss'; the per-batch loss (a Python float)
            is appended to it.
        scores_over_epochs: optional dict holding a list under the key
            'train_scores' or 'test_scores'; the per-batch accuracy is
            appended to it.

    Returns:
        Tuple `(running_loss, final_labels, final_preds)`: the summed batch
        losses, and the flat lists of ground-truth and predicted answer ids.
    """
    running_loss = 0.
    final_labels, final_preds = [], []
    scores = []
    if data_loader is None:
        # Return the same 3-tuple shape callers unpack, rather than None.
        return running_loss, final_labels, final_preds

    run_type = 'train' if training else 'test'
    if training:
        model.train()
    else:
        model.eval()
    loss_key = '{}_loss'.format(run_type)
    scores_key = '{}_scores'.format(run_type)

    for i, minibatch in enumerate(data_loader):
        # extract minibatch
        t0 = time.time()
        idxs, v, q, a, q_len = minibatch

        # Convert torch's DataLoader output to the proper format.
        # torch gives a List[Tensor_1, ...] where the tensor has been
        # transposed; batchify transposes it back.
        v = v.to(device)
        q = VQADataSet.batchify_questions(q).to(device)
        a = a.to(device)

        # Only build the autograd graph when we are actually training.
        with torch.set_grad_enabled(training):
            logits = model(v, q)
            # NOTE(review): nll_loss expects log-probabilities, so this
            # presumably assumes the model ends in a log-softmax -- confirm.
            loss = F.nll_loss(logits, a)
        preds = torch.argmax(logits, dim=1)

        # Keep plain floats in the bookkeeping: storing the loss tensor would
        # keep every batch's computation graph alive and leak memory.
        loss_value = loss.item()
        running_loss += loss_value

        batch_labels = a.tolist()
        batch_preds = preds.tolist()
        score = metrics.accuracy_score(batch_labels, batch_preds)
        scores.append(score)

        if total_loss_over_epochs is not None:
            total_loss_over_epochs[loss_key].append(loss_value)
        if scores_over_epochs is not None:
            scores_over_epochs[scores_key].append(score)

        if training and optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        final_labels += batch_labels
        final_preds  += batch_preds
        if i % 10 == 0:
            # Report the mean accuracy over the batches seen so far.
            mean_score = np.mean(scores)
            print("Epoch {}: {} Loss: {} Score: {} t: {}".format(epoch, run_type, loss_value, mean_score, time.time()-t0))

    return running_loss, final_labels, final_preds

In [None]:
# Build the model, optimizer and data loaders, then alternate one training
# pass and one test pass per epoch, printing the test accuracy each time.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = EncDec(embed_size,
               ques_vocab_size,
               ans_vocab_size,
               prefix_n=1).to(device)

# NOTE(review): eval_model computes its loss with F.nll_loss and does not
# use this criterion; it is only passed through.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_loader = dataset.build_data_loader(train=True, args={'batch_size': batch_size})
test_loader  = dataset.build_data_loader(test=True, args={'batch_size': batch_size})

# Best test accuracy seen so far across epochs.
best_score = 0

train_all_loss, train_all_labels, train_all_preds = [], [], []
print("model built, start training.")
# Separate per-batch histories for the training passes and the test passes.
total_loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries()
total_loss_over_epochs2, scores_over_epochs2 = plotting.get_empty_stat_over_n_epoch_dictionaries()
for epoch in tqdm(range(n_epochs)):
    t0 = time.time()
    tr_loss, tr_labels, tr_preds = eval_model(data_loader = train_loader,
                                     model       = model,
                                     criterion   = criterion,
                                     optimizer   = optimizer,
                                     batch_size  = batch_size,
                                     training    = True,
                                     epoch       = epoch,
                                     total_loss_over_epochs = total_loss_over_epochs,
                                     scores_over_epochs     = scores_over_epochs)

    # Evaluate without tracking gradients. The test loss gets its own name so
    # it no longer overwrites the training loss of this epoch.
    with torch.no_grad():
        ts_loss, ts_labels, ts_preds = eval_model(data_loader = test_loader,
                                         model       = model,
                                         criterion   = criterion,
                                         optimizer   = None,
                                         batch_size  = batch_size,
                                         training    = False,
                                         epoch       = epoch,
                                         total_loss_over_epochs = total_loss_over_epochs2,
                                         scores_over_epochs     = scores_over_epochs2)

    score = metrics.accuracy_score(ts_labels, ts_preds)
    best_score = max(best_score, score)

    print("\n"+"#==#"*7 + "epoch: {}".format(epoch) + "#==#"*7)
    print('TEST ACC: {}'.format(score))
    print("#==#"*7 + "time: {}".format(time.time()-t0) + "#==#"*7 + "\n")

    
    
    
    

In [None]:
### Error Analysis

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
count = 1
err_anal_data = []
for i, minibatch in enumerate(test_loader):
    # extract minibatch
    t0 = time.time()
    idxs, v, q, a, q_len = minibatch

    v = v.to(device)
    q = VQADataSet.batchify_questions(q).to(device)
    a = a.to(device)
    
    logits = model(v,q)
    preds = torch.argmax(logits, dim=1)

    for i in range(len(a)):
        idx = idxs[i]
        enc_ans = a[i].item()
        enc_ques = q[i].detach().cpu().numpy()
        img_v = v[i].detach().cpu().numpy()
        question = dataset.decode_question(enc_ques)
        answer_dec = dataset.decode_answer(preds[i])
        answer = dataset.decode_answer(enc_ans)
#         img_v = img_v.reshape(224, 224, 3)
        plt.figure()
        plt.imshow(img_v[0,:,:], interpolation='nearest')
        plt.show()
        question = question.replace("<pad>", "")
        question = question.replace("<start>", "")
        question = question.replace("<end>", "").strip()
        result = answer_dec==answer
        err_anal_data.append([question, answer_dec, answer])
        if not result:
            print("{}. [Q] {} [A] {} [PRED] {}".format(count, question, answer, answer_dec))
            count+=1
        print(err_anal_data[-1])
        print('question:',  question)
        print("[{}] - predicted: {} - ground-truth: {}".format(answer_dec==answer, answer_dec, answer))


## Open Question: Design an Interactive Demo System
Based on this model, design an interactive demo system like the one at http://visualqa.csail.mit.edu/
When an image and a question are given to the system, it should return the answer.

You can use any API you like. The easiest way is to build the interactive demo directly in the notebook. However, you can try fancier GUIs to show your results.