In [1]:
import pandas as pd
import numpy as np
import json
import ujson
import toolz
import pathlib
from IPython.display import Image, HTML

In [2]:
vg_base = pathlib.Path('/Data/VisualGenome')
image_objects = ujson.load(open(vg_base / 'objects.json'))


In [3]:
img_by_id = {img['image_id']: img for img in image_objects}

In [4]:
image_objects_df = pd.DataFrame(image_objects)

In [6]:
image_objects_df.to_feather('visual_genome_objects.feather')

ImportError: the feather-format library is not installed
you can install via conda
conda install feather-format -c conda-forge
or via pip
pip install -U feather-format


In [4]:
if False:
    def show_images(imgids):
        return HTML('\n'.join('<img src="{}">'.format(img_by_id[imgid]['image_url']) for imgid in imgids))

    obj_synsets = json.load(open(vg_base / 'object_synsets.json'))
    obj_attributes = json.load(open(vg_base / 'attributes.json'))

    attributes_by_img = {att['image_id']: att['attributes'] for att in obj_attributes}

    def has_object(imgid, obj_name):
        return any(obj_name in '\n'.join(obj['names']) for obj in attributes_by_img[imgid])

    def has_synset(imgid, obj_synset):
        return any(obj_synset in obj['synsets'] for obj in attributes_by_img[imgid])

    def has_attr(imgid, attr):
        return any(attr in obj['attributes'] for obj in attributes_by_img[imgid])

    def has_obj_with_attr(imgid, obj_name, attr):
        return any(
            (obj_name in '\n'.join(obj['names'])) and (attr in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_synset_with_attr(imgid, obj_synset, attr):
        return any(
            (obj_synset in obj['synsets']) and (attr in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_obj_without_attr(imgid, obj_name, attr):
        return any(
            (obj_name in '\n'.join(obj['names'])) and (attr not in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_synset_without_attr(imgid, obj_synset, attr):
        return any(
            (obj_synset in obj['synsets']) and (attr not in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    candidates = {
        imgid for imgid in attributes_by_img.keys()
        if (
            (
                has_object(imgid, 'pedestrian sign') or
                has_object(imgid, 'pedestrian crossing sign') or
                has_object(imgid, 'crossing sign') or
                has_obj_with_attr(imgid, 'sign', 'yellow')
            ) and (
                has_obj_without_attr(imgid, 'traffic light', 'red') or
                has_synset_without_attr(imgid, 'traffic_light.n.01', 'red')
            ))}
    len(candidates)


    show_images(list(candidates)[:10])
    show_images(candidates)

    candidates = {
        imgid for imgid in attributes_by_img.keys()
        if has_obj_with_attr(imgid, "train", "brown")
    }

In [7]:
region_descs = ujson.load(open(vg_base / 'region_descriptions.json'))

In [8]:
region_descs[0].keys()

dict_keys(['regions', 'id'])

In [9]:
regions_by_img = {r['id']: r['regions'] for r in region_descs}

In [10]:
phrases_by_img = {id: {region['phrase'].strip() for region in regions} for id, regions in regions_by_img.items()}

In [11]:
phrases_by_img[2408456]

{'7760 on back of train',
 'MARC  on back of train',
 'a blue metal pipe',
 'a bright green light',
 'a bunch of wires and plugs',
 'a few telephone posts',
 'a red traffic light',
 'a small grey outbuilding',
 'a small grey shack',
 'a small puddle of water',
 'a straigh train track',
 'a tall electrical pole with powerlines',
 'a train in motion',
 'a white train with blue and orange stripes',
 'a yellow and blue logo sticker',
 'blue and orange stripes',
 'bright red lights on the back of a train',
 'brown train tracks',
 'green and red stoplights',
 'green dense trees',
 'green traffic light on right',
 'orange and blue stripes on back',
 'red light on the left',
 'red light on the right',
 'red traffic light on right',
 'rough rocky ground',
 'some piles of gravel',
 'the door is white',
 'the light is red',
 'the lights are red',
 'the nummber 7760 is on the train',
 'the photo was taken during the day',
 'the pole is white',
 'the portable is standing',
 'the rail road is brown'

In [12]:
question_answers = ujson.load(open(vg_base / 'question_answers.json'))

In [13]:
def show_images(imgids, max_width=200):
    def img(idx):
        img = img_by_id[idx]
#         attrs = {
#             '<b>{}</b>=<i>{}</i>'.format(', '.join(obj['names']), ', '.join(obj.get('attributes', [])))
#             for obj in attributes_by_img[idx]}
        attrs = sorted(phrases_by_img[idx])
        return '<div style="display: inline-block;">{}<img src="{}" style="max-width: {}px">{}</div>'.format(
            idx, img['image_url'], max_width, '; '.join(sorted(attrs)))
    return '\n'.join(img(idx) for idx in imgids)

In [14]:
len(question_answers)

108077

In [15]:
len(question_answers[0]['qas'])

91

In [16]:
question_answers[0]['qas'][0].keys()

dict_keys(['a_objects', 'question', 'image_id', 'qa_id', 'answer', 'q_objects'])

In [17]:
question_answers = {qa['id']: qa['qas'] for qa in question_answers if len(qa['qas'])}

In [18]:
import random
img_qa = random.choice(list(question_answers.values()))
list(toolz.pluck(['question', 'answer'], img_qa))

[('How many people are walking on the sidewalk?', 'Two.'),
 ('Where are the two women walking?', 'On the sidewalk.'),
 ('When is the photograph taken?', 'Daytime.'),
 ('What does the sign in the far left say?', '"Made Frames".'),
 ('What color shirt is the woman on the left wearing?', 'White.'),
 ('What is the color of the face of the clock?', 'Black.'),
 ('Who is walking faster?', 'The woman in the white shirt.'),
 ('Why is it bright outside?', 'It is daytime.')]

In [19]:
[qa for qa in toolz.concat(question_answers.values()) if 'better' in qa['answer'].lower()]

[{'a_objects': [],
  'question': 'Why are the lights on?',
  'image_id': 12,
  'qa_id': 988489,
  'answer': 'To see better.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Who us named on the clock face?',
  'image_id': 2411253,
  'qa_id': 166360,
  'answer': 'J. C. Leadbetter.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are these scissors in light?',
  'image_id': 2408221,
  'qa_id': 190623,
  'answer': 'To see better.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why do the players wear cleats?',
  'image_id': 2407189,
  'qa_id': 1201108,
  'answer': 'Better footing.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is there a light around the mirror?',
  'image_id': 2407180,
  'qa_id': 198948,
  'answer': 'For better visibility.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is the dog sleeping in the bed curled up in a ball?',
  'image_id': 2405369,
  'qa_id': 1158833,
  'answer': 'To sleep better.',
  'q_objects': []},
 {'a_

In [20]:
whys = [qa for qa in toolz.concat(question_answers.values()) if qa['question'].lower().startswith('why ')]
len(whys)

38672

In [19]:
random.sample(whys, 10)

[{'a_objects': [],
  'question': 'Why is there a toilet sitting on the grass?',
  'image_id': 2360916,
  'qa_id': 1651536,
  'answer': 'Remodeling.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why does the clock have a knob and ring on top?',
  'image_id': 2377631,
  'qa_id': 1988012,
  'answer': 'To make it resemble a pocket watch.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are there cowboys?',
  'image_id': 2327245,
  'qa_id': 853622,
  'answer': 'No cowboys.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why was the picture taken?',
  'image_id': 2330184,
  'qa_id': 851274,
  'answer': 'To capture the animals.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is the man squatting?',
  'image_id': 2365280,
  'qa_id': 1738567,
  'answer': 'To feed cats.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are both people wearing glasses?',
  'image_id': 2349942,
  'qa_id': 259666,
  'answer': 'So they can see properly.',
  'q_objec

In [21]:
HTML(show_images([2370308]))

In [22]:
HTML(show_images([img_qa[0]['image_id']]))

So the region descriptions contain more information than the objects/attributes.

# Image-Object Questions

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
object_names = [[name for obj in img['objects'] for name in obj['names']] for img in image_objects]

In [25]:
object_names[500]

['couch',
 'floor',
 'curtains',
 'curtains',
 'lamp',
 'cushion',
 'shade',
 'cord',
 'foot',
 'foot']

In [26]:
object_vectorizer = TfidfVectorizer(analyzer=lambda x: x)
object_vec_matrix = object_vectorizer.fit_transform(object_names)

In [27]:
object_vec_matrix

<108077x82827 sparse matrix of type '<class 'numpy.float64'>'
	with 1929060 stored elements in Compressed Sparse Row format>

In [28]:
object_vectorizer.vocabulary_['shower']

62957

In [29]:
id2objname = object_vectorizer.get_feature_names()

In [30]:
from sklearn.metrics.pairwise import pairwise_distances_argmin

In [31]:
#pairwise_distances_argmin()

In [32]:
sims = object_vec_matrix.T.dot(object_vec_matrix[:,object_vectorizer.vocabulary_['shower']]).A.ravel()
[id2objname[x] for x in np.argsort(sims)[-10:]]

['floor',
 'mirror',
 'faucet',
 'sink',
 'towel',
 'toilet',
 'wall',
 'bathroom',
 'tile',
 'shower']

In [33]:
sims = object_vec_matrix.T.dot(object_vec_matrix[:,object_vectorizer.vocabulary_['kite']]).A.ravel()
[id2objname[x] for x in np.argsort(sims)[-10:]]

['string',
 'shirt',
 'clouds',
 'kites',
 'people',
 'beach',
 'man',
 'person',
 'sky',
 'kite']

In [34]:
sims = object_vec_matrix.T.dot(object_vec_matrix[:,object_vectorizer.vocabulary_['office']]).A.ravel()
[id2objname[x] for x in np.argsort(sims)[-10:]]

['window',
 'laptop',
 'wall',
 'mouse',
 'keyboard',
 'chair',
 'computer',
 'monitor',
 'desk',
 'office']

In [35]:
sims = object_vec_matrix.T.dot(object_vec_matrix[:,object_vectorizer.vocabulary_['bus']]).A.ravel()
[id2objname[x] for x in np.argsort(sims)[-10:]]

['tree',
 'tire',
 'street',
 'light',
 'road',
 'car',
 'building',
 'sign',
 'window',
 'bus']

Ok great, I can generate "is there a \_" questions.

How about more general questions?

In [36]:
from collections import Counter

In [37]:
qs = [qa for qa in toolz.concat(question_answers.values()) if 'kite' in qa['question'].lower()]
Counter(toolz.pluck('question', qs)).most_common(20)

[('Where is the kite?', 397),
 ('What color is the kite?', 265),
 ('Where are the kites?', 223),
 ('How many kites are there?', 182),
 ('Who is flying the kite?', 164),
 ('Who is holding the kite?', 80),
 ('Who is flying a kite?', 63),
 ('How many kites are in the sky?', 59),
 ('How many kites?', 52),
 ('What is attached to the kite?', 41),
 ('How many kites are shown?', 37),
 ('What shape is the kite?', 37),
 ('What color are the kites?', 30),
 ('Who is flying the kites?', 30),
 ('How many kites are in the air?', 29),
 ('Where are the kites flying?', 27),
 ('How many kites are flying?', 25),
 ('What is on the kite?', 23),
 ('How many kites are in the photo?', 21),
 ('Where is the kite flying?', 21)]

In [38]:
qs = [qa for qa in toolz.concat(question_answers.values()) if 'bus' in qa['question'].lower()]
Counter(toolz.pluck('question', qs)).most_common(20)

[('What color is the bus?', 1102),
 ('Where is the bus?', 823),
 ('How many buses are there?', 381),
 ('What is behind the bus?', 202),
 ('Where are the bushes?', 177),
 ('What color are the bushes?', 157),
 ('Where is the bus parked?', 143),
 ('What color are the buses?', 126),
 ('Who is driving the bus?', 123),
 ('Where is the bus going?', 115),
 ('What is the bus doing?', 113),
 ('What is on the bus?', 111),
 ('What kind of bus is this?', 104),
 ('What is on the side of the bus?', 100),
 ('How many buses?', 98),
 ('Where are the buses?', 89),
 ('What number is on the bus?', 89),
 ('How many buses are in the picture?', 85),
 ('How many buses are shown?', 83),
 ('What type of bus is this?', 79)]

In [39]:
qs = [qa for qa in toolz.concat(question_answers.values()) if 'street' in qa['question'].lower()]
Counter(toolz.pluck('question', qs)).most_common(20)

[('What is on the street?', 395),
 ('What color is the street?', 371),
 ('What is the street made of?', 196),
 ('What color is the street sign?', 161),
 ('What color are the street signs?', 113),
 ('Where is the street sign?', 98),
 ('What is the name of the street?', 93),
 ('What is in the street?', 92),
 ('What is parked on the street?', 87),
 ('What is across the street?', 85),
 ('What does the street sign say?', 76),
 ('Where is the street light?', 73),
 ('How many street signs are there?', 71),
 ('What color are the lines on the street?', 70),
 ('What is painted on the street?', 67),
 ('Where are the street signs?', 65),
 ('Where are the street lights?', 55),
 ('What color is the street light?', 53),
 ('What street is this?', 51),
 ('Who is crossing the street?', 50)]

So thereare some questions that don't make sense, but surprisingly many seem like they will.

## Phrase Suggestions

In [41]:
query_words = 'wheelchair'.split()
query_vec = np.zeros(object_vec_matrix.shape[1])
for word in query_words:
    query_vec[object_vectorizer.vocabulary_[word]] = 1
HTML(show_images([image_objects[idx]['image_id'] for idx in np.argsort(object_vec_matrix.dot(query_vec))[-10:]]))

In [44]:
HTML(show_images([image_objects[idx]['image_id'] for idx in np.argsort(object_vec_matrix[:,object_vectorizer.vocabulary_['bus']].A.ravel())[-10:]]))

In [42]:
image_objects[48577]

{'image_id': 2372092,
 'objects': [{'synsets': ['arrow.n.01'],
   'h': 13,
   'object_id': 594085,
   'merged_object_ids': [],
   'names': ['arrow'],
   'w': 15,
   'y': 170,
   'x': 392},
  {'synsets': ['barrel.n.01'],
   'h': 22,
   'object_id': 594088,
   'merged_object_ids': [],
   'names': ['barrel'],
   'w': 23,
   'y': 156,
   'x': 6},
  {'synsets': ['building.n.01'],
   'h': 15,
   'object_id': 594084,
   'merged_object_ids': [],
   'names': ['building'],
   'w': 70,
   'y': 111,
   'x': 321},
  {'synsets': ['building.n.01'],
   'h': 24,
   'object_id': 594099,
   'merged_object_ids': [],
   'names': ['building'],
   'w': 91,
   'y': 92,
   'x': 78},
  {'synsets': ['bus.n.01'],
   'h': 24,
   'object_id': 594096,
   'merged_object_ids': [],
   'names': ['bus'],
   'w': 17,
   'y': 124,
   'x': 339},
  {'synsets': ['bus.n.01'],
   'h': 35,
   'object_id': 594093,
   'merged_object_ids': [],
   'names': ['bus'],
   'w': 32,
   'y': 116,
   'x': 275},
  {'synsets': ['bus.n.01'],
 

In [39]:
assert False

AssertionError: 

In [48]:
ngram_range = (1, 2)
min_df = 20
vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=.7)
joined_captions = ['\n'.join(phrases) for phrases in phrases_by_img.values()]
caption_vecs = vectorizer.fit_transform(joined_captions)


In [49]:
len(vectorizer.vocabulary_)

91408

In [50]:
voc = vectorizer.get_feature_names()

In [51]:
imgids = list(phrases_by_img.keys())

In [None]:
# query_caption = "a shower with a blue mat on the floor in front of it"
query_caption = "a red city bus"
query_vec = vectorizer.transform([query_caption])
similarity = caption_vecs.dot(query_vec.T).A.ravel()

In [None]:
sorted([(query_vec[0,x], voc[x]) for x in query_vec.tocoo().col])

CosMul similarity. https://tedboy.github.io/nlps/_modules/gensim/models/word2vec.html#Word2Vec.most_similar_cosmul

In [55]:
descs = 'shower,towel,hanging,handle'
descs = 'red bus,black car,street'
similarities = caption_vecs.dot(vectorizer.transform(descs.split(',')).T.A).T
similarity = np.prod(1 + similarities / 2, axis=0)

HTML(show_images([imgids[x] for x in np.argsort(similarity)[-10:]]))