In [1]:
import argparse
import json
import os
import pickle
import re
import numpy as np

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn as nn
from torch.nn.utils import clip_grad_norm
from torch.utils.data import DataLoader
from tqdm import tqdm, trange

import utils
import math
from dataset import ClevrDataset
from model import RN

In [14]:
# Load the GQA validation "choices" JSON into a dict (the cells below index it
# by id strings such as '07785307').
# NOTE(review): the variable says "scene" but the file is val_choices.json; the
# name is kept because later cells reference `val_scene`.
val_scene_filename = 'D:\\272_project\\data\\GQA\\eval\\val_choices.json'
# Pass the encoding explicitly so decoding the JSON does not depend on the
# platform's default text encoding (a legacy code page on Windows).
with open(val_scene_filename, 'r', encoding='utf-8') as f:
    val_scene = json.load(f)

In [16]:
# Inspect a single entry; the output shows it holds a 'plausible' and a
# 'valid' list.
val_scene['07785307']

{'plausible': ['yes', 'no'], 'valid': ['yes', 'no']}

In [13]:
# Check whether this id is a key of the loaded dict (membership on a dict
# tests its keys).
'n336443' in val_scene

False

In [10]:
val_scene['1159721']['o

12

In [2]:
# Show the kernel's working directory; relative paths used in this notebook
# (e.g. './test_results/test.pickle') are resolved against it.
os.getcwd()

'c:\\Users\\mpnau\\Documents\\School\\CS272\\Relation-Network-VQA\\clevr'

In [3]:
# Root folder of the CLEVR data; the question, scene and dictionary cells
# below all build their paths from it.
# NOTE(review): machine-specific absolute path — point it at your own copy.
clevr_dir = 'D:\\272_project\\data\\CLEVR'
# List the folder's top-level contents.
os.listdir(clevr_dir)

['COPYRIGHT.txt', 'images', 'LICENSE.txt', 'questions', 'README.txt', 'scenes']

### Question Data

In [4]:
# Load the training questions; only the list stored under the top-level
# 'questions' key is kept.
train_q_filename = os.path.join(clevr_dir, 'questions', 'CLEVR_trainA_questions.json')
# Explicit UTF-8 so JSON decoding does not depend on the platform's default
# text encoding (a legacy code page on Windows).
with open(train_q_filename, 'r', encoding='utf-8') as f:
    train_q = json.load(f)['questions']

In [5]:
# Peek at one training record: question text, answer, image reference and
# the functional 'program' that derives the answer.
train_q[50]

{'question_index': 50,
 'question_family_index': 62,
 'image_index': 5,
 'question': 'There is a tiny object that is the same shape as the large red metallic object; what color is it?',
 'answer': 'red',
 'image_filename': 'CLEVR_trainA_000005.png',
 'split': 'trainA',
 'program': [{'value_inputs': [], 'inputs': [], 'function': 'scene'},
  {'value_inputs': ['large'], 'inputs': [0], 'function': 'filter_size'},
  {'value_inputs': ['red'], 'inputs': [1], 'function': 'filter_color'},
  {'value_inputs': ['metal'], 'inputs': [2], 'function': 'filter_material'},
  {'value_inputs': [], 'inputs': [3], 'function': 'unique'},
  {'value_inputs': [], 'inputs': [4], 'function': 'same_shape'},
  {'value_inputs': ['small'], 'inputs': [5], 'function': 'filter_size'},
  {'value_inputs': [], 'inputs': [6], 'function': 'unique'},
  {'value_inputs': [], 'inputs': [7], 'function': 'query_color'}]}

In [10]:
# Report how many question records the training split contains.
print(f'Number of questions in training data: {len(train_q)}')

Number of questions in training data: 699960


In [6]:
# Load the test questions; only the list stored under the top-level
# 'questions' key is kept.
test_q_filename = os.path.join(clevr_dir, 'questions', 'CLEVR_testA_questions.json')
# Explicit UTF-8 so JSON decoding does not depend on the platform's default
# text encoding (a legacy code page on Windows).
with open(test_q_filename, 'r', encoding='utf-8') as f:
    test_q = json.load(f)['questions']

In [7]:
# Peek at one test record; unlike train_q[50], the output shows no 'answer'
# or 'program' field.
test_q[50]

{'question': 'There is a object that is both on the right side of the matte cylinder and in front of the large thing; what is its size?',
 'image_filename': 'CLEVR_testA_000005.png',
 'split': 'testA',
 'image_index': 5,
 'question_index': 50}

### Scene Data

In [8]:
# Load the training scene descriptions; only the list stored under the
# top-level 'scenes' key is kept.
train_scene_filename = os.path.join(clevr_dir, 'scenes', 'CLEVR_trainA_scenes.json')
# Explicit UTF-8 so JSON decoding does not depend on the platform's default
# text encoding (a legacy code page on Windows).
with open(train_scene_filename, 'r', encoding='utf-8') as f:
    train_s = json.load(f)['scenes']

In [11]:
# List the fields available on a single scene record.
train_s[5].keys()

dict_keys(['image_filename', 'objects', 'split', 'image_index', 'relationships', 'directions'])

In [9]:
# Show the full scene record for image index 5 (the image that train_q[50]
# refers to); each object lists size, color, material, shape and coordinates.
train_s[5]

{'image_filename': 'CLEVR_trainA_000005.png',
 'objects': [{'pixel_coords': [226, 132, 12.197944641113281],
   'size': 'small',
   'color': 'green',
   'material': 'metal',
   'shape': 'cylinder',
   '3d_coords': [-0.7854127883911133, 0.25492721796035767, 0.3499999940395355],
   'rotation': 91.08205168295301},
  {'pixel_coords': [321, 147, 11.443076133728027],
   'size': 'small',
   'color': 'gray',
   'material': 'metal',
   'shape': 'cube',
   '3d_coords': [1.2099950313568115, 1.27609121799469, 0.3499999940395355],
   'rotation': 164.42639326011357},
  {'pixel_coords': [250, 169, 10.52890682220459],
   'size': 'small',
   'color': 'blue',
   'material': 'rubber',
   'shape': 'cube',
   '3d_coords': [0.974945068359375, -0.5668714642524719, 0.3499999940395355],
   'rotation': 108.6414687081874},
  {'pixel_coords': [123, 180, 10.131351470947266],
   'size': 'small',
   'color': 'red',
   'material': 'rubber',
   'shape': 'sphere',
   '3d_coords': [-0.29078203439712524, -2.71969151496887

### Dictionaries

In [12]:
# Build the lookup dictionaries for the data under clevr_dir.
# When this cell was run, the helper logged that it reused a cached .pkl from
# the 'questions' folder; the caching logic lives in utils — not shown here.
dictionaries = utils.build_dictionaries(clevr_dir)

==> using cached dictionaries: D:\272_project\data\CLEVR\questions\CLEVR_built_dictionaries.pkl


In [13]:
# Display the dictionaries; the output is a tuple whose first element maps
# question tokens to integer ids starting at 1.
# NOTE(review): this dumps the whole structure — consider showing a slice.
dictionaries

({'are': 1,
  'there': 2,
  'any': 3,
  'gray': 4,
  'things': 5,
  'made': 6,
  'of': 7,
  'the': 8,
  'same': 9,
  'material': 10,
  'as': 11,
  'big': 12,
  'cyan': 13,
  'cylinder': 14,
  '?': 15,
  'number': 16,
  'balls': 17,
  'that': 18,
  'in': 19,
  'front': 20,
  'metal': 21,
  'and': 22,
  'large': 23,
  'spheres': 24,
  'behind': 25,
  'thing': 26,
  'tiny': 27,
  'have': 28,
  'color': 29,
  'metallic': 30,
  'is': 31,
  'a': 32,
  'small': 33,
  'yellow': 34,
  'object': 35,
  ';': 36,
  'what': 37,
  'its': 38,
  'shape': 39,
  'how': 40,
  'many': 41,
  'objects': 42,
  'matte': 43,
  'or': 44,
  'green': 45,
  'rubber': 46,
  'sphere': 47,
  'size': 48,
  'either': 49,
  'to': 50,
  'left': 51,
  'ball': 52,
  'shiny': 53,
  'block': 54,
  'on': 55,
  'right': 56,
  'side': 57,
  'both': 58,
  'red': 59,
  'cylinders': 60,
  'blocks': 61,
  'blue': 62,
  'purple': 63,
  'other': 64,
  'do': 65,
  'cube': 66,
  'anything': 67,
  'else': 68,
  'has': 69,
  'does': 70,
 

`dictionaries` is a tuple of three dictionaries: the first holds the question vocabulary (word → integer id), the second holds the answers, and the third holds the category of each answer.

### Dataset and DataLoader

In [14]:
# Dataset over the CLEVR data, encoded with the dictionaries built above.
# NOTE(review): the positional `True` presumably selects the training split —
# confirm against ClevrDataset's signature.
clevr_dataset_train = ClevrDataset(clevr_dir, True, dictionaries)

# Small shuffled batches (3 samples) keep the inspection cells below readable;
# utils.collate_samples assembles the per-sample dicts into one batch.
clevr_train_loader = DataLoader(
    clevr_dataset_train,
    batch_size=3,
    shuffle=True,
    collate_fn=utils.collate_samples,
)

loading questions...
==> using cached scenes: D:\272_project\data\CLEVR\scenes\CLEVR_trainA_scenes.pkl


In [15]:
# Inspect one encoded sample: a dict with an 'image' tensor (one 7-value row
# per object), a 'question' tensor of token ids and a one-element 'answer'.
clevr_dataset_train[50]

{'image': tensor([[-0.7854,  0.2549,  0.3500,  6.0000,  2.0000,  3.0000,  2.0000],
         [ 1.2100,  1.2761,  0.3500,  7.0000,  2.0000,  2.0000,  2.0000],
         [ 0.9749, -0.5669,  0.3500,  2.0000,  1.0000,  2.0000,  2.0000],
         [-0.2908, -2.7197,  0.3500,  5.0000,  1.0000,  1.0000,  2.0000],
         [ 1.8245, -2.6589,  0.7000,  5.0000,  2.0000,  1.0000,  1.0000],
         [-1.4880,  2.7624,  0.3500,  1.0000,  1.0000,  3.0000,  2.0000],
         [ 0.4916,  2.8697,  0.3500,  8.0000,  1.0000,  2.0000,  2.0000],
         [-2.8349,  0.2255,  0.3500,  7.0000,  1.0000,  2.0000,  2.0000],
         [ 2.9251,  2.4406,  0.7000,  8.0000,  1.0000,  1.0000,  1.0000]]),
 'question': tensor([ 2, 31, 32, 27, 35, 18, 31,  8,  9, 39, 11,  8, 23, 59, 30, 35, 36, 37,
         29, 31, 71, 15]),
 'answer': tensor([16])}

In [16]:
# Pull a single batch from the loader for inspection.
dataiter = iter(clevr_train_loader)
# Use the built-in next(): the Python-2-style `.next()` method is not part of
# the iterator protocol and is missing from current PyTorch DataLoader
# iterators, where calling it raises AttributeError.
sample_batched = next(dataiter)
sample_batched

{'image': tensor([[[ 2.6672,  0.7097,  0.3500,  2.0000,  2.0000,  2.0000,  2.0000],
          [ 2.2848, -2.8362,  0.3500,  1.0000,  1.0000,  3.0000,  2.0000],
          [-2.9901, -0.2639,  0.7000,  5.0000,  1.0000,  3.0000,  1.0000],
          [ 0.9244,  0.6390,  0.3500,  8.0000,  2.0000,  2.0000,  2.0000],
          [-0.7456,  1.2459,  0.3500,  6.0000,  1.0000,  3.0000,  2.0000],
          [ 0.6022, -2.4559,  0.7000,  6.0000,  2.0000,  1.0000,  1.0000],
          [ 1.1887,  2.7453,  0.7000,  7.0000,  1.0000,  2.0000,  1.0000],
          [-1.0821, -2.8627,  0.7000,  6.0000,  2.0000,  3.0000,  1.0000],
          [-2.6200,  2.2461,  0.3500,  6.0000,  1.0000,  3.0000,  2.0000],
          [-1.0013,  2.9526,  0.7000,  4.0000,  2.0000,  1.0000,  1.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],
 
         [[-1.1363, -0.2567,  0.7000,  2.0000,  1.0000,  2.0000,  1.0000],
          [ 1

In [17]:
# Type and shape of the batched 'image' entry, one per line.
image_batch = sample_batched['image']
print(type(image_batch), image_batch.shape, sep='\n')

<class 'torch.Tensor'>
torch.Size([3, 12, 7])


In [18]:
# Type and shape of the batched 'answer' entry, one per line.
answer_batch = sample_batched['answer']
print(type(answer_batch), answer_batch.shape, sep='\n')

<class 'torch.Tensor'>
torch.Size([3, 1])


In [19]:
# Type and shape of the batched 'question' entry, one per line.
question_batch = sample_batched['question']
print(type(question_batch), question_batch.shape, sep='\n')

<class 'torch.Tensor'>
torch.Size([3, 26])


In [20]:
# Unpack the batch dict into separate image / question / label tensors.
# NOTE(review): the two positional booleans are not explained here. Judging by
# the outputs below, `label` comes back 1-D and the question rows start with
# the '?' token (id 15), i.e. look token-reversed — confirm both against
# utils.load_tensor_data.
img, qst, label = utils.load_tensor_data(sample_batched, False, True)

In [21]:
# Object-feature tensor for the batch; all-zero rows appear where a scene has
# fewer objects than the largest one in the batch.
img

tensor([[[ 2.6672,  0.7097,  0.3500,  2.0000,  2.0000,  2.0000,  2.0000],
         [ 2.2848, -2.8362,  0.3500,  1.0000,  1.0000,  3.0000,  2.0000],
         [-2.9901, -0.2639,  0.7000,  5.0000,  1.0000,  3.0000,  1.0000],
         [ 0.9244,  0.6390,  0.3500,  8.0000,  2.0000,  2.0000,  2.0000],
         [-0.7456,  1.2459,  0.3500,  6.0000,  1.0000,  3.0000,  2.0000],
         [ 0.6022, -2.4559,  0.7000,  6.0000,  2.0000,  1.0000,  1.0000],
         [ 1.1887,  2.7453,  0.7000,  7.0000,  1.0000,  2.0000,  1.0000],
         [-1.0821, -2.8627,  0.7000,  6.0000,  2.0000,  3.0000,  1.0000],
         [-2.6200,  2.2461,  0.3500,  6.0000,  1.0000,  3.0000,  2.0000],
         [-1.0013,  2.9526,  0.7000,  4.0000,  2.0000,  1.0000,  1.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-1.1363, -0.2567,  0.7000,  2.0000,  1.0000,  2.0000,  1.0000],
         [ 1.0704,  0.9271,  0.3500,

In [22]:
# Question token ids for the batch; shorter questions show leading zeros so
# that all rows have the same length.
qst

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 24, 21, 44, 73,
         23, 49,  1, 42, 63,  7, 16, 37],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 42, 33, 44,  5,
         43, 13, 23,  1,  5,  7, 16, 37],
        [15, 14,  8,  7, 56, 31, 18, 35, 53, 27,  8,  7, 20, 19, 26, 59,  8,  7,
         51,  8, 50, 52,  8, 31, 10, 37]])

In [23]:
# One label value per sample in the batch.
label

tensor([ 5, 24,  6])

In [24]:
# Load previously saved test statistics (path is relative to the working
# directory). A context manager closes the file handle deterministically
# instead of leaving it to garbage collection.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# you produced yourself. The stored values include CUDA tensors (see the
# output of the next cell), so this presumably needs a CUDA-enabled torch.
with open('./test_results/test.pickle', 'rb') as f:
    test_stats = pickle.load(f)

In [25]:
# Overall accuracy recorded in the saved test statistics (stored as a tensor).
test_stats['global_accuracy']

tensor(0.8583, device='cuda:0')