In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from attention_net import Attention_net 
from data_loader import load_questions_answers, load_image_features
from tensorboardX import SummaryWriter
import progressbar as pb

# Load QA Data

In [5]:
data_dir = 'data'
print("Reading QA DATA")
qa_data = load_questions_answers(token_type='word', version=2, data_dir=data_dir)
print("train questions", len(qa_data['training']))
print("val questions", len(qa_data['validation']))
print("answer vocab", len(qa_data['answer_vocab']))
print("question vocab", len(qa_data['question_vocab']))
print("max question length", qa_data['max_question_length'])

Reading QA DATA
train questions 412564
val questions 199148
answer vocab 3000
question vocab 15881
max question length 22


In [6]:
qa_data['training'][0]

{'image_id': 458752,
 'question': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2.,
        3., 4., 5., 6., 7.]),
 'answer': 824}

# Load Image Data

In [7]:
print("Reading Image DATA")
train_image_features ,train_image_id_list = load_image_features(data_dir, 'train')
print("train image features", train_image_features.shape)
print("train image_id_list", train_image_id_list.shape)
val_image_features, val_image_id_list = load_image_features(data_dir, 'val')
print("val image features", val_image_features.shape)
print("val image_id_list", val_image_id_list.shape)

Readings Image DATA


  from ._conv import register_converters as _register_converters


train image features (82723, 1024, 7, 7)
train image_id_list (82723,)
val image features (40481, 1024, 7, 7)
val image_id_list (40481,)


# Change Image Feature Dimension 

In [8]:
val_image_features = torch.from_numpy(val_image_features)
val_image_features = val_image_features.permute(0, 2, 3, 1)
val_image_features = val_image_features.view(val_image_features.size(0), -1, val_image_features.size(3))
val_image_features.size()

torch.Size([40481, 49, 1024])

In [10]:
train_image_features = torch.from_numpy(train_image_features)
train_image_features = train_image_features.permute(0, 2, 3, 1)
train_image_features = train_image_features.view(train_image_features.size(0), -1, train_image_features.size(3))
train_image_features.size()

TypeError: expected np.ndarray (got Tensor)

In [11]:
train_image_features = train_image_features.view(train_image_features.size(0), -1, train_image_features.size(3))
train_image_features.size()

torch.Size([82723, 49, 1024])

# Define Data Loader 

In [16]:
def sample_batch(batch_no, batch_size, features, image_id_map, qa_data, split):
  qa = None
  if split == 'train':
    qa = qa_data['training']
  else:
    qa = qa_data['validation']

  si = (batch_no * batch_size)%len(qa)
  ei = min(len(qa), si + batch_size)
  n = ei - si
  sentence = np.ndarray( (n, qa_data['max_question_length']), dtype = 'int32') # [N, 22]
  answer = np.zeros( (n, len(qa_data['answer_vocab']))) # [N, 3000]
  fc7 = np.ndarray( (n,49,1024) ) # [N, 49, 1024]

  count = 0
  for i in range(si, ei):
    sentence[count,:] = qa[i]['question'][:]
    answer[count, qa[i]['answer']] = 1.0
    fc7_index = image_id_map[ qa[i]['image_id'] ]
    fc7[count,:,:] = features[fc7_index, :, :]
    count += 1
  
  return fc7, sentence, answers

In [13]:
train_image_id_map = {image_id: i for i, image_id in enumerate(train_image_id_list)}
val_image_id_map = {image_id: i for i, image_id in enumerate(val_image_id_list)}

# Train 

In [14]:
model = Attention_net()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
num_params = 0
for param in model.parameters():
    num_params += param.numel()
print("Num parameters {}".format(num_params))

Num parameters 21433694


In [15]:
num_epoch = 16
batch_size = 128
writer = SummaryWriter()

In [None]:
for epoch in range(args.num_epoch):
    pbar = pb.ProgressBar()
    model.train()
    loss_value = 0.0
    correct = 0.0

    # Train
    for j in pbar(range(len(train_data) // batch_size)):
        img_features, que_features, answers = sample_batch(batch_no, batch_size, features, image_id_map, qa_data, split)

        outputs, que_att, img_att = model(img_features, que_features)

        loss = criterion(outputs, answers)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value += loss.data[0]
        pred = outputs.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(targets.data).cpu().sum()
        
    print("Train epoch {}, loss {}, acc {}".format(epoch,
            loss_value / (len(train_data) // args.batch_size),
            correct / (len(train_data) // args.batch_size * args.batch_size)))

#     if epoch > 20 and epoch % 10 == 0:
#         for param_group in early_optimizer.param_groups:
#             param_group['lr'] *= 0.5

    model.eval()

#     for module in model.modules():
#         if module.__class__.__name__.find("BatchNorm") > -1:
#             module.train()
#             # BatchNorm for some reasons is not stable in eval

    loss_value = 0.0
    correct = 0.0
    pbar = pb.ProgressBar()

    # Evaluate
    for j in pbar(range(len(test_data) // args.batch_size)):
        inputs, targets, mask, laplacian, Di, DiA = sample_batch(test_data, is_training=False)

        if args.model in ["lap", "avg", "mlp"]:
            outputs = model(inputs, laplacian, mask)
        else:
            outputs = model(inputs, Di, DiA, mask)

        loss = F.nll_loss(outputs, targets)

        loss.backward()

        loss_value += loss.data[0]
        pred = outputs.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(targets.data).cpu().sum()

    print("Test epoch {}, loss {}, acc {}".format(epoch,
                    loss_value / (len(test_data) /args.batch_size),
                    correct / (len(test_data) // args.batch_size * args.batch_size)))