In [1]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
%matplotlib inline
import time
import os
import copy
import json

from tqdm import tqdm
from sklearn.metrics import average_precision_score, precision_recall_curve

from torch.utils.data import Dataset
import skimage
from PIL import Image
from copy import deepcopy
from sklearn.utils.fixes import signature
import time
import pickle
import nltk

from attention import NewAttention

plt.ion()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

np.set_printoptions(precision=2, suppress=True)

In [2]:
class AnswerDisDataset(Dataset):
    def __init__(self, dataset_name, split):
        super(AnswerDisDataset, self).__init__()
        
        self.image_dir = None
        if dataset_name == 'vqa_2.0':
            self.image_dir = '/home/qing/Desktop/Datasets/MSCOCO/images'
        elif dataset_name == 'vizwiz':
            self.image_dir = '/home/qing/Desktop/Datasets/VizWiz/v1/data/Images'
        self.image_ext = '.jpg'
        
        data_transforms = {
            'train': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'trainval': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'val': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'test': transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
        }
        
        self.transform = data_transforms[split]
        self.word2vocab_id = json.load(open('word2vocab_id.json'))
        self.ans2id = json.load(open('ans2id.json'))
        
        dataroot='../data'
        dataset = json.load(open(os.path.join(dataroot, '%s_%s.json'%(dataset_name, split))), encoding='cp1252')
        max_length = 20
        for sample in dataset:
            question = sample['question']
            question = question.lower()
            tokens = nltk.word_tokenize(question)
            tokens = [self.word2vocab_id[x] for x in tokens if x in self.word2vocab_id]
            tokens = tokens[:max_length]
            if len(tokens) < max_length:
                # Note here we pad in front of the sentence
                padding = [0] * (max_length - len(tokens))
                tokens = padding + tokens
            sample['q_token'] = tokens
            
            tokens = [x.lower() for x in sample['answers']]
            tokens = [self.ans2id[x] for x in tokens if x in self.ans2id]
            sample['a_token'] = tokens
        self.dataset = dataset
        
        
    def __getitem__(self, index):
        entry = self.dataset[index]
        image = entry['image']
        image_path = os.path.join(self.image_dir, image.replace('.jpg', self.image_ext))
        with open(image_path, 'rb') as f:
            image = Image.open(f).convert('RGB')
        label = [0 if x < 2 else 1 for x in entry['ans_dis_labels']]
        label = torch.tensor(label, dtype=torch.float32)
        
        question = torch.from_numpy(np.array(entry['q_token']))
        answer = np.zeros((len(self.ans2id),), dtype=np.float32)
        for ans in entry['a_token']:
            answer[ans] += 1.0
        answer /= 10.0
        answer = torch.from_numpy(answer)
        
        if self.transform:
            image = self.transform(image)

        return image, question, answer, label
    
    def __len__(self):
        return len(self.dataset)
    

splits = ['train', 'val', 'test']
datasets = {}
datasets.update({x: AnswerDisDataset('vizwiz', x) for x in splits})
dataset_sizes = {x: len(datasets[x]) for x in splits}
print(dataset_sizes)

{'test': 7501, 'train': 19502, 'val': 3001}


In [3]:
from language_model import WordEmbedding, QuestionEmbedding
from fc import FCNet
from torch.nn.utils.weight_norm import weight_norm

class Identity(nn.Module):
    def __init__(self, dim):
        super(Identity, self).__init__()
        self.dim = dim
        
    def forward(self, x):
        return x

def init_image_model(init_model_path=None):
    model = models.resnet50(pretrained=True)
    num_ftrs = model.fc.in_features
    model.avgpool = Identity(num_ftrs)
    model.fc = Identity(num_ftrs)
    
    
    if init_model_path:
        model.load_state_dict(torch.load(init_model_path))
    model = model.to(device)
    return model

class BaseModel(nn.Module):
    def __init__(self, w_emb, q_emb, v_att, q_net, v_net, a_net, classifier, model_type='Q+I'):
        super(BaseModel, self).__init__()
        self.w_emb = w_emb
        self.q_emb = q_emb
        self.v_att = v_att
        self.q_net = q_net
        self.v_net = v_net
        self.a_net = a_net
        self.classifier = classifier
        self.model_type = model_type

    def forward(self, v, q, a):
        """Forward

        return: logits, not probs
        """
        w_emb = self.w_emb(q)
        q_emb = self.q_emb(w_emb) # [batch, q_dim]

        att = self.v_att(v, q_emb)
        v_emb = (att * v).sum(1)
        
        q_repr = self.q_net(q_emb)
        v_repr = self.v_net(v_emb)
        a_repr = self.a_net(a)
        
        model_type = self.model_type
        if model_type == 'I':
            joint_repr = v_repr
        elif model_type == 'Q':
            joint_repr = q_repr
        elif model_type == 'A':
            joint_repr = a_repr
        elif model_type == 'Q+I':
            joint_repr = q_repr * v_repr
        elif model_type == 'Q+I+A':
            joint_repr = (q_repr + v_repr + a_repr) / 3
        elif model_type == 'Q+A':
            joint_repr = q_repr * a_repr
        logits = self.classifier(joint_repr)
        return logits
    
class FullModel(nn.Module):
    def __init__(self, image_model, base_model):
        super(FullModel, self).__init__()
        self.image_model = image_model
        self.base_model = base_model
        
    def forward(self, img, q, a):
        x = self.image_model(img)
        x = x.view(x.size(0), -1, self.image_model.fc.dim)
        x = self.base_model(x, q, a)
        return x

def init_model(model_type='Q+I'):
    num_hid = 300
    image_model = init_image_model()
    v_dim = image_model.fc.dim
    w_emb = WordEmbedding(len(datasets['train'].word2vocab_id), 300, 0.0)
    w_emb.init_embedding('glove6b_init_300d.npy')
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([image_model.fc.dim, num_hid])
    a_net = FCNet([len(datasets['train'].ans2id), num_hid])
    classifier = weight_norm(nn.Linear(num_hid, 10), dim=None)

    base_model = BaseModel(w_emb, q_emb, v_att, q_net, v_net, a_net, classifier, model_type)
    base_model = base_model.to(device)
    model = FullModel(image_model, base_model)
    return model

In [5]:
def evaluate_model(model, dataloader):
    model.eval() 
    score_all = []
    label_all = []

    # Iterate over data.
    for images, questions, answers, labels in dataloader:
        images = images.to(device)
        questions = questions.to(device)
        labels = labels.to(device)
        answers = answers.to(device)
        outputs = model(images, questions, answers)
        score_all.append(outputs.data.cpu().numpy())
        label_all.append(labels.data.cpu().numpy())

    score_all = np.concatenate(score_all, axis=0)
    label_all = np.concatenate(label_all, axis=0)
    #score_all[np.isnan(score_all)] = 0.
    ap = average_precision_score(label_all, score_all, average=None)
    
    return ap, label_all, score_all

def train_model(model, num_epochs=5, train_splits=['train'], 
                eval_splits=['test'], n_epochs_per_eval = 1):
    criterion = nn.BCEWithLogitsLoss()
    params = [{'params': model.base_model.parameters()}]
    optimizer = optim.Adam(params, lr=1e-3)
    # Decay LR by a factor of 0.1 every 100 epochs
    scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_ap = 0.0
    
    
    train_dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                             shuffle=True, num_workers=4) for x in train_splits}
    
    eval_dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                         shuffle=False, num_workers=4) for x in eval_splits}
    
    dataloaders = {}
    dataloaders.update(train_dataloaders)
    dataloaders.update(eval_dataloaders)
    
    ###########evaluate init model###########
    for eval_split in eval_splits:
        ap, label, score = evaluate_model(model, dataloaders[eval_split])
        print('(AP={1}) {0}'.format(eval_split, 100*ap))
        ap = np.mean(ap)
        print(100*ap)
    print()
    #########################################

    for epoch in range(num_epochs):
        since = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        scheduler.step()
        
        running_loss = 0.0
        running_corrects = 0

        # Iterate over data.
        for train_split in train_splits:
            for images, questions, answers, labels in dataloaders[train_split]:
                model.train()  # Set model to training mode
                images = images.to(device)
                questions = questions.to(device)
                labels = labels.to(device)
                answers = answers.to(device)
                outputs = model(images, questions, answers)
                loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # compute average precision
        if (epoch+1) % n_epochs_per_eval == 0:
            for eval_split in eval_splits:
                ap, label, score = evaluate_model(model, dataloaders[eval_split])
                print('(AP={1}) {0}'.format(eval_split, 100*ap))
                ap = np.mean(ap)
                print(100*ap)
            # deep copy the model
                
            if ap > best_ap:
                best_ap = ap
                best_model_wts = copy.deepcopy(model.state_dict())
                
        time_elapsed = time.time() - since
        print('Epoch time: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        #print(flush=True)
    
    ###########evaluate final model###########
    for eval_split in eval_splits:
        ap, label, score = evaluate_model(model, dataloaders[eval_split])
        print('(AP={1}) {0}'.format(eval_split, 100*ap))
        ap = np.mean(ap)
        print(100*ap)
    # deep copy the model
    if ap > best_ap:
        best_ap = ap
        best_model_wts = copy.deepcopy(model.state_dict())
    print()
    #########################################

    print('Best val AP: {:2f}'.format(100*best_ap))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return

In [7]:
train_splits = ['train', 'val']
model_type_list = ['Q+I', 'Q+I+A']
eval_splits=['test']

for model_type in model_type_list:
    print(model_type)

    model = init_model(model_type)
    save_model_path = 'saved_models/{0}_train-on-({1}).pt'.format(model_type, ','.join(train_splits))
    train_model(model, num_epochs=5, train_splits=train_splits, eval_splits=eval_splits, n_epochs_per_eval = 1)
    torch.save(model.state_dict(), save_model_path)
    print('\n')

Q+I
(AP=[20.83 33.79 21.89  5.62 76.7   6.19 62.62 69.1   1.64  0.48]) test
29.88550001007032

Epoch 0/4
----------
(AP=[42.4  54.97 39.41  7.74 85.79  8.55 81.8  87.33  1.81  0.47]) test
41.02492701963925
Epoch time: 3m 55s
Epoch 1/4
----------
(AP=[45.22 56.93 42.32 10.05 86.28 10.92 83.34 89.05  1.68  0.56]) test
42.63478506037281
Epoch time: 3m 55s
Epoch 2/4
----------
(AP=[45.98 57.1  41.57 11.02 86.02 10.66 83.07 89.    1.67  0.78]) test
42.68769475026315
Epoch time: 3m 56s
Epoch 3/4
----------
(AP=[46.29 56.53 40.52 11.28 85.72 10.47 82.85 88.75  1.76  0.72]) test
42.4886023627039
Epoch time: 3m 55s
Epoch 4/4
----------
(AP=[46.11 57.01 40.59 11.67 85.33 10.82 82.46 88.38  1.69  0.72]) test
42.47816894710813
Epoch time: 3m 57s
(AP=[46.11 57.01 40.59 11.67 85.33 10.82 82.46 88.38  1.69  0.72]) test
42.47816894710813

Best val AP: 42.687695


Q+I+A
(AP=[20.79 31.22 17.83  5.07 75.53  4.71 69.85 67.01  1.69  0.49]) test
29.418365904300646

Epoch 0/4
----------
(AP=[64.54 76.93 56.2

In [8]:
model_type_list = ['Q+I', 'Q+I+A']
train_splits = ['train', 'val']
eval_splits=['test']

dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                     shuffle=False, num_workers=4) for x in eval_splits}

score_all = 0
all_data= {}
for model_type in model_type_list:
    print(model_type)
    model = init_model(model_type)
    save_model_path = 'saved_models/{0}_train-on-({1}).pt'.format(model_type, ','.join(train_splits))
    model.load_state_dict(torch.load(save_model_path))
    
    eval_split = eval_splits[0]
    ap, label, score = evaluate_model(model, dataloaders[eval_split])
    
    key = '%s_train-on-%s_%s'%(model_type, ','.join(train_splits), eval_split)
    value = [ap, label, score]
    all_data[key] = value
    
    print('AP: {1}\nmAP: {2}'.format(eval_split, 100*ap, 100*np.mean(ap)))
    print('\n')
    
    score_all += score

Q+I
AP: [45.98 57.1  41.57 11.02 86.02 10.66 83.07 89.    1.67  0.78]
mAP: 42.6876947503


Q+I+A
AP: [64.93 77.4  56.78 10.1  89.48 13.16 90.52 95.5   1.84  1.28]
mAP: 50.099659358




In [None]:
 np.mean([44.82, 58.63,18.15,  5.7,  80.14,  5.14, 66.61, 71.94,  1.35,  0.62])

In [None]:
score = [all_data['%s_train-on-%s_%s'%(model_type, ','.join(train_splits), eval_split)][2]
         for model_type in ['A', 'Q']]
score = sum(score)
ap = average_precision_score(label, score, average=None)
print('(AP={1}) {0}'.format(eval_split, 100*ap))
ap = np.mean(ap)
print(100*ap)

In [None]:
LQI	IVE	INV	DFF	AMB	SBJ	SYN	GRN	SPM	OTH
7.89 56.53 38.25 25.78 96.37 24.97 87.92 83.27  5.39  0.4