In [1]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
%matplotlib inline
import time
import os
import copy
import json

from tqdm import tqdm
from sklearn.metrics import average_precision_score, precision_recall_curve

from torch.utils.data import Dataset
import skimage
from PIL import Image
from copy import deepcopy
from sklearn.utils.fixes import signature
import time
import pickle
import nltk

# Turn on matplotlib's interactive mode.
plt.ion()

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Print numpy arrays with two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

In [2]:
class AnswerDisDataset(Dataset):
    """Image/question/answer samples with multi-label "answer difference" targets.

    The constructor reads ``../data/<dataset_name>_<split>.json`` plus the
    ``word2vocab_id.json`` and ``ans2id.json`` vocabularies from the working
    directory, and pre-tokenizes every sample's question and answers.

    Each item is a tuple ``(image, question, answer, label)``:

    * ``image``    -- the transformed RGB image tensor,
    * ``question`` -- tensor of ``MAX_QUESTION_LENGTH`` vocabulary ids,
      zero-padded at the front,
    * ``answer``   -- float32 vector of length ``len(ans2id)`` holding the
      count of each known answer divided by 10,
    * ``label``    -- float32 vector, 1 where the raw ``ans_dis_labels``
      entry is >= 2 and 0 elsewhere, with the second-to-last entry (spam)
      always forced to 0.

    Args:
        dataset_name: ``'vqa_2.0'`` or ``'vizwiz'``; selects the image
            directory. Any other name leaves ``image_dir`` as ``None``, in
            which case ``__getitem__`` cannot build an image path.
        split: one of ``'train'``, ``'trainval'``, ``'val'``, ``'test'``.

    Raises:
        KeyError: if ``split`` is not one of the supported names.
    """

    # Questions are truncated / front-padded to exactly this many token ids.
    MAX_QUESTION_LENGTH = 20

    def __init__(self, dataset_name, split):
        super(AnswerDisDataset, self).__init__()

        self.image_dir = None
        if dataset_name == 'vqa_2.0':
            self.image_dir = '/home/qing/Desktop/Datasets/MSCOCO/images'
        elif dataset_name == 'vizwiz':
            self.image_dir = '/home/qing/Desktop/Datasets/VizWiz/v1/data/Images'
        self.image_ext = '.jpg'

        self.transform = self._build_transform(split)
        self.word2vocab_id = self._load_json('word2vocab_id.json')
        self.ans2id = self._load_json('ans2id.json')

        dataroot = '../data'
        dataset_path = os.path.join(dataroot, '%s_%s.json' % (dataset_name, split))
        # The annotation file is decoded as cp1252. The decoding happens when
        # the file is opened because json.load() no longer accepts an
        # ``encoding`` keyword (removed in Python 3.9).
        dataset = self._load_json(dataset_path, encoding='cp1252')

        for sample in dataset:
            words = nltk.word_tokenize(sample['question'].lower())
            # Out-of-vocabulary words are dropped rather than mapped to an id.
            word_ids = [self.word2vocab_id[w] for w in words if w in self.word2vocab_id]
            sample['q_token'] = self._pad_front(word_ids, self.MAX_QUESTION_LENGTH)

            # Answers missing from ans2id are dropped as well.
            answers = [ans.lower() for ans in sample['answers']]
            sample['a_token'] = [self.ans2id[ans] for ans in answers if ans in self.ans2id]
        self.dataset = dataset

    @staticmethod
    def _load_json(path, encoding=None):
        """Parse the JSON file at ``path``, always closing the file handle.

        With ``encoding=None`` the file is opened with the built-in ``open``
        (platform default decoding); otherwise it is decoded with the given
        codec.
        """
        if encoding is None:
            with open(path) as fh:
                return json.load(fh)
        import io  # io.open accepts ``encoding`` on both Python 2 and 3
        with io.open(path, encoding=encoding) as fh:
            return json.load(fh)

    @staticmethod
    def _pad_front(tokens, max_length):
        """Return ``tokens`` cut to ``max_length`` and left-padded with zeros."""
        tokens = tokens[:max_length]
        # Note: padding goes in front of the sentence, not after it.
        return [0] * (max_length - len(tokens)) + tokens

    @staticmethod
    def _build_transform(split):
        """Build the image pipeline for ``split``.

        All splits resize to 224x224, convert to a tensor and normalize;
        'train' and 'trainval' additionally apply a random horizontal flip.
        """
        augment = {'train': True, 'trainval': True, 'val': False, 'test': False}[split]
        steps = [transforms.Resize((224, 224))]
        if augment:
            steps.append(transforms.RandomHorizontalFlip())
        steps.append(transforms.ToTensor())
        # Channel mean/std commonly paired with torchvision's ImageNet-pretrained models.
        steps.append(transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
        return transforms.Compose(steps)

    def __getitem__(self, index):
        """Return ``(image, question, answer, label)`` for sample ``index``."""
        entry = self.dataset[index]
        image_name = entry['image'].replace('.jpg', self.image_ext)
        image_path = os.path.join(self.image_dir, image_name)
        with open(image_path, 'rb') as f:
            # convert() forces the pixel data to be read before the file closes.
            image = Image.open(f).convert('RGB')

        label = [0 if x < 2 else 1 for x in entry['ans_dis_labels']]
        label[-2] = 0 # -2 -> spam, don't consider it
        label = torch.tensor(label, dtype=torch.float32)

        question = torch.from_numpy(np.array(entry['q_token']))

        # Count how often each known answer occurs, then scale by 10
        # (presumably the number of answers collected per question -- TODO confirm).
        answer = np.zeros((len(self.ans2id),), dtype=np.float32)
        for ans in entry['a_token']:
            answer[ans] += 1.0
        answer /= 10.0
        answer = torch.from_numpy(answer)

        if self.transform:
            image = self.transform(image)

        return image, question, answer, label

    def __len__(self):
        """Number of samples in the loaded split."""
        return len(self.dataset)
    

# Build one VizWiz AnswerDisDataset per split and report how many samples each holds.
# NOTE: from here on `datasets` refers to this dict, not the torchvision
# `datasets` module imported in the first cell.
splits = ['train', 'val', 'test']
datasets = dict((name, AnswerDisDataset('vizwiz', name)) for name in splits)
dataset_sizes = dict((name, len(datasets[name])) for name in splits)
print(dataset_sizes)

{'test': 7501, 'train': 19502, 'val': 3001}


In [3]:
from language_model import WordEmbedding, QuestionEmbedding
from fc import FCNet
from torch.nn.utils.weight_norm import weight_norm

class Identity(nn.Module):
    """Pass-through layer that returns its input unchanged.

    ``dim`` is only stored on the instance so that other code can look up
    the feature size; it is not used in ``forward``.
    """

    def __init__(self, dim):
        nn.Module.__init__(self)
        self.dim = dim

    def forward(self, x):
        """Return ``x`` itself, with no copy and no computation."""
        return x

def init_image_model(init_model_path=None):
    """Create the ResNet-50 image feature extractor and move it to ``device``.

    The final fully-connected layer is replaced by ``Identity`` so the
    network outputs its pooled features; the ``Identity`` instance records
    the feature size in ``model.fc.dim``.

    Args:
        init_model_path: optional path to a state dict that overrides the
            pretrained weights.

    Returns:
        The model, placed on ``device``.
    """
    model = models.resnet50(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = Identity(num_ftrs)

    if init_model_path:
        # map_location lets a checkpoint saved on a GPU be restored when
        # `device` has fallen back to the CPU.
        state_dict = torch.load(init_model_path, map_location=device)
        model.load_state_dict(state_dict)
    model = model.to(device)
    return model

class BaseModel(nn.Module):
    """Combines question, image and answer representations into class logits.

    ``model_type`` selects which representations feed the classifier:

    * ``'I'``, ``'Q'``, ``'A'`` -- a single modality,
    * ``'Q+I'``, ``'Q+A'``      -- element-wise product of the two,
    * ``'Q+I+A'``               -- element-wise mean of all three.

    Args:
        w_emb: module mapping question token ids to word embeddings.
        q_emb: module mapping word embeddings to a question embedding.
        q_net, v_net, a_net: modules projecting the question embedding, the
            image features and the answer vector respectively.
        classifier: module mapping the joint representation to logits.
        model_type: one of ``MODEL_TYPES``.

    Raises:
        ValueError: if ``model_type`` is not one of ``MODEL_TYPES``.
    """

    MODEL_TYPES = ('I', 'Q', 'A', 'Q+I', 'Q+I+A', 'Q+A')

    def __init__(self, w_emb, q_emb, q_net, v_net, a_net, classifier, model_type='Q+I'):
        super(BaseModel, self).__init__()
        # Fail at construction time instead of in the first forward pass.
        if model_type not in self.MODEL_TYPES:
            raise ValueError('Unknown model_type %r; expected one of %s'
                             % (model_type, ', '.join(self.MODEL_TYPES)))
        self.w_emb = w_emb
        self.q_emb = q_emb
        self.q_net = q_net
        self.v_net = v_net
        self.a_net = a_net
        self.classifier = classifier
        self.model_type = model_type

    def forward(self, v, q, a):
        """Forward

        Args:
            v: image features, fed to ``v_net``.
            q: question token ids, fed to ``w_emb``.
            a: answer vector, fed to ``a_net``.

        return: logits, not probs
        """
        w_emb = self.w_emb(q)
        q_emb = self.q_emb(w_emb) # [batch, q_dim]

        # All three branches are always evaluated, even when model_type
        # uses only some of them.
        q_repr = self.q_net(q_emb)
        v_repr = self.v_net(v)
        a_repr = self.a_net(a)

        model_type = self.model_type
        if model_type == 'I':
            joint_repr = v_repr
        elif model_type == 'Q':
            joint_repr = q_repr
        elif model_type == 'A':
            joint_repr = a_repr
        elif model_type == 'Q+I':
            joint_repr = q_repr * v_repr
        elif model_type == 'Q+I+A':
            joint_repr = (q_repr + v_repr + a_repr) / 3
        elif model_type == 'Q+A':
            joint_repr = q_repr * a_repr
        else:
            # Reachable only if model_type was reassigned after construction.
            raise ValueError('Unknown model_type %r; expected one of %s'
                             % (model_type, ', '.join(self.MODEL_TYPES)))
        logits = self.classifier(joint_repr)
        return logits
    
class FullModel(nn.Module):
    """Chains an image feature extractor with a question/answer model.

    ``image_model`` is applied to the raw image batch, and its output is
    passed to ``base_model`` together with the question and answer inputs.
    """

    def __init__(self, image_model, base_model):
        super(FullModel, self).__init__()
        self.image_model, self.base_model = image_model, base_model

    def forward(self, img, q, a):
        """Return ``base_model(image_model(img), q, a)``."""
        features = self.image_model(img)
        return self.base_model(features, q, a)

def init_model(model_type='Q+I'):
    """Assemble a fresh ``FullModel`` for the given ``model_type``.

    Vocabulary and answer-space sizes are taken from ``datasets['train']``;
    word embeddings are initialised from ``glove6b_init_300d.npy``. The
    classifier has 10 outputs. Both sub-models are placed on ``device``.

    The positional arguments passed to ``WordEmbedding`` and
    ``QuestionEmbedding`` are defined by ``language_model.py`` -- NOTE(review):
    presumably embedding size / dropout and hidden size / layer count /
    bidirectional flag / dropout; confirm against that module.
    """
    num_hid = 300
    train_set = datasets['train']

    # Components are created in a fixed order: image model first, then the
    # language parts, then the projection nets and the classifier.
    image_model = init_image_model()

    w_emb = WordEmbedding(len(train_set.word2vocab_id), 300, 0.0)
    w_emb.init_embedding('glove6b_init_300d.npy')
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)

    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([image_model.fc.dim, num_hid])
    a_net = FCNet([len(train_set.ans2id), num_hid])

    classifier = weight_norm(nn.Linear(num_hid, 10), dim=None)

    base_model = BaseModel(w_emb, q_emb, q_net, v_net, a_net, classifier, model_type).to(device)
    return FullModel(image_model, base_model)

In [4]:
def evaluate_model(model, dataloader):
    """Score every batch of ``dataloader`` and compute per-class average precision.

    The model is switched to eval mode and left in that mode.

    Args:
        model: callable as ``model(images, questions, answers)`` returning logits.
        dataloader: yields ``(images, questions, answers, labels)`` batches.

    Returns:
        ``(ap, label_all, score_all)`` where ``ap`` is the per-class result of
        ``average_precision_score(..., average=None)`` and the other two are
        the labels and raw logits concatenated over all batches.
    """
    model.eval()
    score_batches = []
    label_batches = []

    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for images, questions, answers, labels in dataloader:
            images = images.to(device)
            questions = questions.to(device)
            answers = answers.to(device)
            outputs = model(images, questions, answers)
            score_batches.append(outputs.detach().cpu().numpy())
            # Labels are only needed as numpy arrays, so they never go to the device.
            label_batches.append(labels.cpu().numpy())

    score_all = np.concatenate(score_batches, axis=0)
    label_all = np.concatenate(label_batches, axis=0)
    #score_all[np.isnan(score_all)] = 0.
    ap = average_precision_score(label_all, score_all, average=None)

    return ap, label_all, score_all

def _evaluate_splits(model, eval_dataloaders, eval_splits):
    """Evaluate ``model`` on each split in order, printing per-class AP and mAP.

    Returns the mAP (NaN-ignoring mean of the per-class AP) of the LAST split
    in ``eval_splits``, or ``None`` when ``eval_splits`` is empty.
    """
    mean_ap = None
    for eval_split in eval_splits:
        ap, _, _ = evaluate_model(model, eval_dataloaders[eval_split])
        print('(AP={1}) {0}'.format(eval_split, 100*ap))
        mean_ap = np.nanmean(ap)
        print(100*mean_ap)
    return mean_ap


def train_model(model, num_epochs=5, train_splits=['train'],
                eval_splits=['test'], n_epochs_per_eval = 1):
    """Train ``model.base_model`` in place and restore the best-scoring weights.

    Only the parameters of ``model.base_model`` are given to the optimizer;
    the image model's parameters are not updated by it.
    NOTE(review): the whole model is still put in train mode, so any
    normalisation layers in the image model may keep updating their running
    statistics -- confirm whether that is intended.

    Args:
        model: a ``FullModel``-like module exposing ``base_model``.
        num_epochs: number of passes over the training splits.
        train_splits: keys of the global ``datasets`` dict to train on.
        eval_splits: keys of ``datasets`` to evaluate on. The mAP of the last
            one drives best-weight selection.
        n_epochs_per_eval: evaluate every this many epochs.

    Returns:
        None. On return ``model`` holds the weights with the highest mAP seen
        (the initial weights if no evaluation scored above 0).
    """
    criterion = nn.BCEWithLogitsLoss()
    params = [{'params': model.base_model.parameters()}]
    optimizer = optim.Adam(params, lr=1e-3)
    # Decay LR by a factor of 0.1 every 100 epochs
    scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_ap = 0.0

    # Train and eval loaders live in separate dicts so that a split listed in
    # both roles keeps its shuffled loader for training.
    train_dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                             shuffle=True, num_workers=4) for x in train_splits}

    eval_dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                         shuffle=False, num_workers=4) for x in eval_splits}

    ###########evaluate init model###########
    _evaluate_splits(model, eval_dataloaders, eval_splits)
    print()
    #########################################

    for epoch in range(num_epochs):
        since = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # evaluate_model() leaves the model in eval mode, so switch back once per epoch.
        model.train()

        # Iterate over data.
        for train_split in train_splits:
            for images, questions, answers, labels in train_dataloaders[train_split]:
                images = images.to(device)
                questions = questions.to(device)
                labels = labels.to(device)
                answers = answers.to(device)
                outputs = model(images, questions, answers)
                loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Step the schedule after the epoch's optimizer updates; stepping it
        # first skips the initial learning-rate value on PyTorch >= 1.1.
        scheduler.step()

        # compute average precision
        if (epoch+1) % n_epochs_per_eval == 0:
            ap = _evaluate_splits(model, eval_dataloaders, eval_splits)
            # deep copy the model
            if ap is not None and ap > best_ap:
                best_ap = ap
                best_model_wts = copy.deepcopy(model.state_dict())

        time_elapsed = time.time() - since
        print('Epoch time: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        #print(flush=True)

    ###########evaluate final model###########
    ap = _evaluate_splits(model, eval_dataloaders, eval_splits)
    # deep copy the model
    if ap is not None and ap > best_ap:
        best_ap = ap
        best_model_wts = copy.deepcopy(model.state_dict())
    print()
    #########################################

    # The reported value is the best mAP on the last entry of eval_splits.
    print('Best val AP: {:2f}'.format(100*best_ap))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return

In [5]:
# Train one model per input combination on train+val, evaluating on test
# after every epoch, and save each model's best weights under saved_models/.
train_splits = ['train', 'val']
model_type_list = ['I', 'Q', 'Q+I', 'Q+A', 'A', 'Q+I+A']
eval_splits=['test']

for model_type in model_type_list:
    print(model_type)

    model = init_model(model_type)
    save_model_path = 'saved_models/{0}_train-on-({1}).pt'.format(model_type, ','.join(train_splits))

    # Create the output directory before training so that a missing folder
    # cannot make torch.save() fail after the training run has finished.
    save_dir = os.path.dirname(save_model_path)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    train_model(model, num_epochs=5, train_splits=train_splits, eval_splits=eval_splits, n_epochs_per_eval = 1)
    torch.save(model.state_dict(), save_model_path)
    print('\n')

I
(AP=[26.77 37.73 16.27  4.67 73.67  5.67 67.67 75.24   nan  0.4 ]) test
34.23283425319277

Epoch 0/4
----------


  recall = tps / tps[-1]


(AP=[52.85 49.39 28.51  6.88 82.39  8.7  78.93 85.52   nan  0.4 ]) test
43.73088021129832
Epoch time: 6m 52s
Epoch 1/4
----------
(AP=[53.91 50.03 29.78  7.84 83.17  8.71 79.16 85.86   nan  0.49]) test
44.32611540855959
Epoch time: 3m 52s
Epoch 2/4
----------
(AP=[54.92 50.37 29.93  7.78 83.11  8.81 79.62 85.99   nan  0.43]) test
44.55162305455126
Epoch time: 3m 53s
Epoch 3/4
----------
(AP=[55.23 50.38 29.85  8.17 83.42  9.19 79.96 86.34   nan  0.62]) test
44.79421825476641
Epoch time: 3m 54s
Epoch 4/4
----------
(AP=[55.42 50.95 29.91  8.05 83.34  8.72 79.78 86.04   nan  0.56]) test
44.753949298886376
Epoch time: 3m 53s
(AP=[55.42 50.95 29.91  8.05 83.34  8.72 79.78 86.04   nan  0.56]) test
44.753949298886376

Best val AP: 44.794218


Q
(AP=[26.26 35.   20.14  5.07 74.17  5.14 62.45 74.97   nan  0.53]) test
33.748423498377136

Epoch 0/4
----------
(AP=[34.01 53.77 38.3   9.02 84.41 10.17 79.6  85.02   nan  0.67]) test
43.88639239935925
Epoch time: 3m 52s
Epoch 1/4
----------
(AP=[35.

In [6]:
# Reload each saved model, score it on the test split, and collect the
# per-model results in `all_data`.
model_type_list = ['A', 'I', 'Q', 'Q+I', 'Q+A', 'Q+I+A']
train_splits = ['train', 'val']
eval_splits=['test']

dataloaders = {x: torch.utils.data.DataLoader(datasets[x], batch_size=32,
                                     shuffle=False, num_workers=4) for x in eval_splits}

# Running element-wise sum of the raw logits of all models.
score_all = 0
all_data= {}
for model_type in model_type_list:
    print(model_type)
    model = init_model(model_type)
    save_model_path = 'saved_models/{0}_train-on-({1}).pt'.format(model_type, ','.join(train_splits))
    # map_location lets checkpoints written on a GPU load when `device` is the CPU.
    model.load_state_dict(torch.load(save_model_path, map_location=device))

    eval_split = eval_splits[0]
    ap, label, score = evaluate_model(model, dataloaders[eval_split])

    # Stored as [per-class AP, labels, logits] under "<type>_train-on-<splits>_<eval split>".
    key = '%s_train-on-%s_%s'%(model_type, ','.join(train_splits), eval_split)
    value = [ap, label, score]
    all_data[key] = value

    print('AP: {1}\nmAP: {2}'.format(eval_split, 100*ap, 100*np.nanmean(ap)))
    print('\n')

    score_all += score

A
AP: [65.35 77.76 57.4   9.75 89.21 10.17 90.19 95.31   nan  2.51]
mAP: 55.2952884601


I
AP: [55.23 50.38 29.85  8.17 83.42  9.19 79.96 86.34   nan  0.62]
mAP: 44.7942182548


Q
AP: [35.38 54.43 38.91 13.59 84.44 10.59 79.68 85.15   nan  0.65]
mAP: 44.7572671316


Q+I
AP: [56.54 61.91 45.25 13.8  87.55 11.55 85.97 91.42   nan  1.36]
mAP: 50.5934886386


Q+A
AP: [65.21 77.7  57.37 10.89 88.76  9.8  90.05 95.26   nan  2.09]
mAP: 55.2366648431


Q+I+A
AP: [66.03 77.8  56.55 12.94 90.03 12.51 90.41 95.51   nan  1.97]
mAP: 55.9717905099




In [7]:
 # Arithmetic mean of ten hand-entered values; the notebook does not show where they come from.
 np.mean([44.82, 58.63,18.15,  5.7,  80.14,  5.14, 66.61, 71.94,  1.35,  0.62])

35.31

In [8]:
LQI	IVE	INV	DFF	AMB	SBJ	SYN	GRN	SPM	OTH
7.89 56.53 38.25 25.78 96.37 24.97 87.92 83.27  5.39  0.4

SyntaxError: invalid syntax (<ipython-input-8-6c81065af7b4>, line 1)