In [2]:
import pandas as pd
import numpy as np
import json
import ujson
import toolz
import pathlib
from IPython.display import Image, HTML

In [3]:
vg_base = pathlib.Path('/Data/VisualGenome')
# Parse objects.json inside a context manager so the file handle is closed as
# soon as loading finishes instead of staying open for the kernel's lifetime.
with open(vg_base / 'objects.json') as objects_file:
    image_objects = ujson.load(objects_file)


In [4]:
# Index the image records by their 'image_id' so they can be looked up directly.
img_by_id = dict((record['image_id'], record) for record in image_objects)

In [5]:
# Disabled exploration cell: nothing below runs (and none of these names get
# defined) while the guard is False. Set it to True to search the
# object/attribute annotations for candidate images. A later cell defines
# another `show_images` that returns a plain string; the variant here returns
# an `HTML` object directly.
if False:
    def show_images(imgids):
        """Return an HTML object holding one bare <img> tag per id in `imgids`."""
        return HTML('\n'.join('<img src="{}">'.format(img_by_id[imgid]['image_url']) for imgid in imgids))

    # Context managers close the annotation files once they have been parsed.
    with open(vg_base / 'object_synsets.json') as synsets_file:
        obj_synsets = json.load(synsets_file)
    with open(vg_base / 'attributes.json') as attributes_file:
        obj_attributes = json.load(attributes_file)

    # image_id -> the list stored under that record's 'attributes' key
    # (one entry per annotated object, as the helpers below iterate it).
    attributes_by_img = {att['image_id']: att['attributes'] for att in obj_attributes}

    def has_object(imgid, obj_name):
        """True if `obj_name` is a substring of the newline-joined names of any object in the image."""
        return any(obj_name in '\n'.join(obj['names']) for obj in attributes_by_img[imgid])

    def has_synset(imgid, obj_synset):
        """True if any object in the image lists `obj_synset` among its synsets."""
        return any(obj_synset in obj['synsets'] for obj in attributes_by_img[imgid])

    def has_attr(imgid, attr):
        """True if any object in the image carries the attribute `attr`."""
        # The sibling helpers treat 'attributes' as an optional key; do the
        # same here so objects without it count as "no match", not a KeyError.
        return any(attr in obj.get('attributes', []) for obj in attributes_by_img[imgid])

    def has_obj_with_attr(imgid, obj_name, attr):
        """True if a single object both matches `obj_name` (substring) and has `attr`."""
        return any(
            (obj_name in '\n'.join(obj['names'])) and (attr in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_synset_with_attr(imgid, obj_synset, attr):
        """True if a single object both lists `obj_synset` and has `attr`."""
        return any(
            (obj_synset in obj['synsets']) and (attr in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_obj_without_attr(imgid, obj_name, attr):
        """True if some object matches `obj_name` (substring) and does NOT have `attr`."""
        return any(
            (obj_name in '\n'.join(obj['names'])) and (attr not in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    def has_synset_without_attr(imgid, obj_synset, attr):
        """True if some object lists `obj_synset` and does NOT have `attr`."""
        return any(
            (obj_synset in obj['synsets']) and (attr not in obj.get('attributes', []))
            for obj in attributes_by_img[imgid])

    # Images with a pedestrian/crossing sign (or any yellow sign) that also
    # contain a traffic light not annotated as red.
    candidates = {
        imgid for imgid in attributes_by_img.keys()
        if (
            (
                has_object(imgid, 'pedestrian sign') or
                has_object(imgid, 'pedestrian crossing sign') or
                has_object(imgid, 'crossing sign') or
                has_obj_with_attr(imgid, 'sign', 'yellow')
            ) and (
                has_obj_without_attr(imgid, 'traffic light', 'red') or
                has_synset_without_attr(imgid, 'traffic_light.n.01', 'red')
            ))}
    # NOTE(review): the bare expressions below are nested inside the `if`, so
    # the notebook presumably does not echo their values; wrap them in
    # display(...) if the output is wanted.
    len(candidates)


    show_images(list(candidates)[:10])
    show_images(candidates)

    # Second query: images containing a train annotated as brown.
    candidates = {
        imgid for imgid in attributes_by_img.keys()
        if has_obj_with_attr(imgid, "train", "brown")
    }

In [6]:
# Load the region descriptions; the context manager closes the file handle
# once parsing is done.
with open(vg_base / 'region_descriptions.json') as regions_file:
    region_descs = ujson.load(regions_file)

In [7]:
# Inspect which fields the first region-description record has.
region_descs[0].keys()

dict_keys(['regions', 'id'])

In [8]:
# Map each record's 'id' to its list of regions.
regions_by_img = dict((entry['id'], entry['regions']) for entry in region_descs)

In [9]:
# For every image, the set of distinct region phrases with surrounding
# whitespace stripped.
phrases_by_img = {
    image_id: set(region['phrase'].strip() for region in img_regions)
    for image_id, img_regions in regions_by_img.items()
}

In [10]:
# Spot-check: the distinct region phrases recorded for image 2408456.
phrases_by_img[2408456]

{'7760 on back of train',
 'MARC  on back of train',
 'a blue metal pipe',
 'a bright green light',
 'a bunch of wires and plugs',
 'a few telephone posts',
 'a red traffic light',
 'a small grey outbuilding',
 'a small grey shack',
 'a small puddle of water',
 'a straigh train track',
 'a tall electrical pole with powerlines',
 'a train in motion',
 'a white train with blue and orange stripes',
 'a yellow and blue logo sticker',
 'blue and orange stripes',
 'bright red lights on the back of a train',
 'brown train tracks',
 'green and red stoplights',
 'green dense trees',
 'green traffic light on right',
 'orange and blue stripes on back',
 'red light on the left',
 'red light on the right',
 'red traffic light on right',
 'rough rocky ground',
 'some piles of gravel',
 'the door is white',
 'the light is red',
 'the lights are red',
 'the nummber 7760 is on the train',
 'the photo was taken during the day',
 'the pole is white',
 'the portable is standing',
 'the rail road is brown'

In [11]:
# Load the question/answer annotations; the context manager closes the file
# handle once parsing is done.
with open(vg_base / 'question_answers.json') as qa_file:
    question_answers = ujson.load(qa_file)

In [20]:
def show_images(imgids, max_width=200):
    """Build an HTML snippet showing each image together with its region phrases.

    Args:
        imgids: iterable of image ids; each must be a key of both the
            module-level `img_by_id` and `phrases_by_img` mappings.
        max_width: number substituted into the CSS ``max-width: ...px`` of
            every ``<img>`` tag.

    Returns:
        str: newline-joined ``<div>`` elements, one per id, each containing
        the id, the ``<img>`` tag and the image's phrases sorted and joined
        with ``'; '``. Wrap the result in ``IPython.display.HTML`` to render.

    Raises:
        KeyError: if an id is missing from `img_by_id` or `phrases_by_img`.
    """
    import html  # local import keeps this cell self-contained

    def render(idx):
        record = img_by_id[idx]
        # Phrases are free text from the dataset: escape them so characters
        # such as '<' or '&' are shown literally instead of breaking the markup.
        phrases = '; '.join(
            html.escape(phrase, quote=False) for phrase in sorted(phrases_by_img[idx]))
        # The URL lands inside a double-quoted attribute, so quotes are escaped too.
        url = html.escape(str(record['image_url']))
        return '<div style="display: inline-block;">{}<img src="{}" style="max-width: {}px">{}</div>'.format(
            idx, url, max_width, phrases)

    return '\n'.join(render(idx) for idx in imgids)

In [12]:
# Number of top-level records in the question/answer file.
len(question_answers)

108077

In [13]:
# Number of Q&A entries attached to the first record.
len(question_answers[0]['qas'])

91

In [14]:
# Field names of a single Q&A entry.
question_answers[0]['qas'][0].keys()

dict_keys(['a_objects', 'question', 'image_id', 'qa_id', 'answer', 'q_objects'])

In [15]:
# Re-key the Q&A records by their 'id', dropping records with no questions.
# `question_answers` is rebound from a list to a dict here, so guard on the
# type: re-running this cell would otherwise iterate the dict's keys and fail
# on `qa['id']`.
if isinstance(question_answers, list):
    question_answers = {qa['id']: qa['qas'] for qa in question_answers if qa['qas']}

In [17]:
import random

# Pick the Q&A list of one image at random, then show its
# (question, answer) pairs.
img_qa = random.choice(list(question_answers.values()))
[(entry['question'], entry['answer']) for entry in img_qa]

[('Where was the picture taken?', 'At a motorcycle convention.'),
 ('Who is sitting on the motorcycle?', 'A man.'),
 ('Where is the motorcycle?', 'A store.'),
 ('What has two wheels?', 'The motorcycle.'),
 ('What is the man holding onto?', 'Handles.'),
 ('What year is on the sign?', '2011.'),
 ('What is the motorcycle parked for?', 'Display.'),
 ('What is the man looking at?', 'The camera.'),
 ('Why is a plate on the bottom?', 'To hold the bike.'),
 ('What is advertised behind the man?', 'Posters.'),
 ('What is the poster of?', 'Bike 2011.'),
 ('What are the two people standing doing?', 'Hugging.'),
 ('What is the person carrying?', 'Bags.'),
 ('What is the man sitting on?', 'Motorcycles.'),
 ('What is the man sitting on?', 'Showroom bike.'),
 ('What is the man wearing?', 'White hoodie and gray design.'),
 ('What number is on the motorcycle?', 'Number 54 is on it.'),
 ('What type of carpet?', 'A red carpet.'),
 ('Who is sitting on the bike?', 'A man is sitting.'),
 ('What is black on t

In [34]:
# Every Q&A entry, across all images, whose answer contains the substring
# "better" in any letter case.
[
    entry
    for img_entries in question_answers.values()
    for entry in img_entries
    if 'better' in entry['answer'].lower()
]

[{'a_objects': [],
  'question': 'Why are the lights on?',
  'image_id': 12,
  'qa_id': 988489,
  'answer': 'To see better.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Who us named on the clock face?',
  'image_id': 2411253,
  'qa_id': 166360,
  'answer': 'J. C. Leadbetter.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are these scissors in light?',
  'image_id': 2408221,
  'qa_id': 190623,
  'answer': 'To see better.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why do the players wear cleats?',
  'image_id': 2407189,
  'qa_id': 1201108,
  'answer': 'Better footing.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is there a light around the mirror?',
  'image_id': 2407180,
  'qa_id': 198948,
  'answer': 'For better visibility.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is the dog sleeping in the bed curled up in a ball?',
  'image_id': 2405369,
  'qa_id': 1158833,
  'answer': 'To sleep better.',
  'q_objects': []},
 {'a_

In [36]:
# Collect the Q&A entries whose question starts with "why " (any letter case)
# and report how many there are.
whys = [
    entry
    for img_entries in question_answers.values()
    for entry in img_entries
    if entry['question'].lower().startswith('why ')
]
len(whys)

38672

In [38]:
# Show ten randomly chosen "why" entries. No seed is set in the visible cells,
# so the sample presumably differs on every run.
random.sample(whys, 10)

[{'a_objects': [],
  'question': 'Why is the hand on the street light?',
  'image_id': 2413300,
  'qa_id': 149990,
  'answer': "Don't walk.",
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are his arms out?',
  'image_id': 2323698,
  'qa_id': 866769,
  'answer': 'For balance.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is the car there?',
  'image_id': 2370224,
  'qa_id': 819243,
  'answer': "It's the cat's toy.",
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is the airplane sitting?',
  'image_id': 2401632,
  'qa_id': 1247966,
  'answer': 'Loading.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is shirt so large?',
  'image_id': 2399643,
  'qa_id': 1469960,
  'answer': 'Wrong size.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why is this man surfing?',
  'image_id': 2347722,
  'qa_id': 261441,
  'answer': 'For fun.',
  'q_objects': []},
 {'a_objects': [],
  'question': 'Why are the boats stationary?',
  'image_id': 2380028

In [33]:
# Render image 2370308 together with its region phrases.
HTML(show_images([2370308]))

In [19]:
# Render the image that the sampled Q&A list belongs to. `show_images` must
# already be defined when this runs: the recorded NameError comes from this
# cell (execution count 19) having run before the defining cell (count 20).
HTML(show_images([img_qa[0]['image_id']]))

NameError: name 'show_images' is not defined

So the region descriptions contain more information than the objects/attributes.

# Image-Object Questions

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# One list per image: the names of all its annotated objects, flattened in
# annotation order (duplicates kept).
object_names = [
    list(toolz.concat(obj['names'] for obj in record['objects']))
    for record in image_objects
]

In [46]:
# Spot-check: the flattened object names of the image at position 500.
object_names[500]

['couch',
 'floor',
 'curtains',
 'curtains',
 'lamp',
 'cushion',
 'shade',
 'cord',
 'foot',
 'foot']

In [47]:
def _names_as_tokens(names):
    """Identity analyzer: each document is already a list of object names."""
    return names


# Fit tf-idf over the per-image name lists, treating every name as one token.
object_vectorizer = TfidfVectorizer(analyzer=_names_as_tokens)
object_vec_matrix = object_vectorizer.fit_transform(object_names)

In [48]:
# Display the fitted tf-idf matrix (its repr reports shape and stored elements).
object_vec_matrix

<108077x82827 sparse matrix of type '<class 'numpy.float64'>'
	with 1929060 stored elements in Compressed Sparse Row format>

In [49]:
# Column index that the vectorizer assigned to the object name 'shower'.
object_vectorizer.vocabulary_['shower']

62957

In [61]:
# Column index -> object name. Newer scikit-learn releases replaced
# `get_feature_names()` with `get_feature_names_out()`; use the new method
# when it exists and fall back to the old one otherwise, so the cell runs on
# either version.
if hasattr(object_vectorizer, 'get_feature_names_out'):
    id2objname = list(object_vectorizer.get_feature_names_out())
else:
    id2objname = object_vectorizer.get_feature_names()

In [52]:
from sklearn.metrics.pairwise import pairwise_distances_argmin

In [None]:
#pairwise_distances_argmin()

In [63]:
# Co-occurrence score of every object name with 'shower': dot the 'shower'
# tf-idf column (one weight per image) against all columns of the matrix.
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
sims = object_vec_matrix.T.dot(object_vec_matrix[:, object_vectorizer.vocabulary_['shower']]).toarray().ravel()
# Ten best-scoring names, strongest last.
[id2objname[x] for x in np.argsort(sims)[-10:]]

['floor',
 'mirror',
 'faucet',
 'sink',
 'towel',
 'toilet',
 'wall',
 'bathroom',
 'tile',
 'shower']

In [64]:
# Co-occurrence score of every object name with 'kite': dot the 'kite' tf-idf
# column (one weight per image) against all columns of the matrix.
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
sims = object_vec_matrix.T.dot(object_vec_matrix[:, object_vectorizer.vocabulary_['kite']]).toarray().ravel()
# Ten best-scoring names, strongest last.
[id2objname[x] for x in np.argsort(sims)[-10:]]

['string',
 'shirt',
 'clouds',
 'kites',
 'people',
 'beach',
 'man',
 'person',
 'sky',
 'kite']

In [65]:
# Co-occurrence score of every object name with 'office': dot the 'office'
# tf-idf column (one weight per image) against all columns of the matrix.
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
sims = object_vec_matrix.T.dot(object_vec_matrix[:, object_vectorizer.vocabulary_['office']]).toarray().ravel()
# Ten best-scoring names, strongest last.
[id2objname[x] for x in np.argsort(sims)[-10:]]

['window',
 'laptop',
 'wall',
 'mouse',
 'keyboard',
 'chair',
 'computer',
 'monitor',
 'desk',
 'office']

In [66]:
# Co-occurrence score of every object name with 'bus': dot the 'bus' tf-idf
# column (one weight per image) against all columns of the matrix.
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
sims = object_vec_matrix.T.dot(object_vec_matrix[:, object_vectorizer.vocabulary_['bus']]).toarray().ravel()
# Ten best-scoring names, strongest last.
[id2objname[x] for x in np.argsort(sims)[-10:]]

['tree',
 'tire',
 'street',
 'light',
 'road',
 'car',
 'building',
 'sign',
 'window',
 'bus']

Ok great, I can generate "is there a \_" questions.

How about more general questions?

In [72]:
from collections import Counter

In [78]:
# Twenty most frequent question texts among the questions that contain the
# substring 'kite' (any letter case).
qs = [
    entry
    for img_entries in question_answers.values()
    for entry in img_entries
    if 'kite' in entry['question'].lower()
]
Counter(entry['question'] for entry in qs).most_common(20)

[('Where is the kite?', 397),
 ('What color is the kite?', 265),
 ('Where are the kites?', 223),
 ('How many kites are there?', 182),
 ('Who is flying the kite?', 164),
 ('Who is holding the kite?', 80),
 ('Who is flying a kite?', 63),
 ('How many kites are in the sky?', 59),
 ('How many kites?', 52),
 ('What is attached to the kite?', 41),
 ('How many kites are shown?', 37),
 ('What shape is the kite?', 37),
 ('What color are the kites?', 30),
 ('Who is flying the kites?', 30),
 ('How many kites are in the air?', 29),
 ('Where are the kites flying?', 27),
 ('How many kites are flying?', 25),
 ('What is on the kite?', 23),
 ('How many kites are in the photo?', 21),
 ('Where is the kite flying?', 21)]

In [79]:
# Twenty most frequent question texts among the questions that contain the
# substring 'bus' (any letter case). This is a plain substring test, so words
# like "bushes" match as well, as the recorded output shows.
qs = [
    entry
    for img_entries in question_answers.values()
    for entry in img_entries
    if 'bus' in entry['question'].lower()
]
Counter(entry['question'] for entry in qs).most_common(20)

[('What color is the bus?', 1102),
 ('Where is the bus?', 823),
 ('How many buses are there?', 381),
 ('What is behind the bus?', 202),
 ('Where are the bushes?', 177),
 ('What color are the bushes?', 157),
 ('Where is the bus parked?', 143),
 ('What color are the buses?', 126),
 ('Who is driving the bus?', 123),
 ('Where is the bus going?', 115),
 ('What is the bus doing?', 113),
 ('What is on the bus?', 111),
 ('What kind of bus is this?', 104),
 ('What is on the side of the bus?', 100),
 ('How many buses?', 98),
 ('Where are the buses?', 89),
 ('What number is on the bus?', 89),
 ('How many buses are in the picture?', 85),
 ('How many buses are shown?', 83),
 ('What type of bus is this?', 79)]

In [80]:
# Twenty most frequent question texts among the questions that contain the
# substring 'street' (any letter case).
qs = [
    entry
    for img_entries in question_answers.values()
    for entry in img_entries
    if 'street' in entry['question'].lower()
]
Counter(entry['question'] for entry in qs).most_common(20)

[('What is on the street?', 395),
 ('What color is the street?', 371),
 ('What is the street made of?', 196),
 ('What color is the street sign?', 161),
 ('What color are the street signs?', 113),
 ('Where is the street sign?', 98),
 ('What is the name of the street?', 93),
 ('What is in the street?', 92),
 ('What is parked on the street?', 87),
 ('What is across the street?', 85),
 ('What does the street sign say?', 76),
 ('Where is the street light?', 73),
 ('How many street signs are there?', 71),
 ('What color are the lines on the street?', 70),
 ('What is painted on the street?', 67),
 ('Where are the street signs?', 65),
 ('Where are the street lights?', 55),
 ('What color is the street light?', 53),
 ('What street is this?', 51),
 ('Who is crossing the street?', 50)]

So there are some questions that don't make sense, but surprisingly many seem like they will.

In [None]:
# NOTE(review): unconditional failure. Presumably a deliberate stop so that
# "Run All" halts before the cells below; confirm before removing.
assert False

In [None]:
# Vectorizer settings: unigrams and bigrams, with document-frequency bounds
# passed as min_df (integer) and max_df (fraction).
ngram_range = (1, 2)
min_df = 20

# One document per image: its region phrases joined by newlines.
joined_captions = list(map('\n'.join, phrases_by_img.values()))

vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df, max_df=.7)
caption_vecs = vectorizer.fit_transform(joined_captions)


In [None]:
# Number of n-gram features kept in the fitted vocabulary.
len(vectorizer.vocabulary_)

In [None]:
# Column index -> n-gram. Newer scikit-learn releases replaced
# `get_feature_names()` with `get_feature_names_out()`; use the new method
# when it exists and fall back to the old one otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    voc = list(vectorizer.get_feature_names_out())
else:
    voc = vectorizer.get_feature_names()

In [None]:
# Image ids in the dict's iteration order, the same order in which
# `joined_captions` (and thus the rows of `caption_vecs`) were built.
imgids = list(phrases_by_img)

In [None]:
# Score every image's caption document against a free-text query by the dot
# product of their tf-idf vectors.
query_caption = "a shower with a blue mat on the floor in front of it"
query_vec = vectorizer.transform([query_caption])
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
similarity = caption_vecs.dot(query_vec.T).toarray().ravel()

In [None]:
# List the query's non-zero terms as (weight, term) pairs, lowest weight first.
sorted((query_vec[0, col], voc[col]) for col in query_vec.tocoo().col)

CosMul similarity. https://tedboy.github.io/nlps/_modules/gensim/models/word2vec.html#Word2Vec.most_similar_cosmul

In [None]:
# Score every image against each query term separately; after the final
# transpose there is one row per term and one column per image.
# `.toarray()` is the explicit form of the `.A` shorthand, which is not
# available on recent SciPy sparse types.
similarities = caption_vecs.dot(vectorizer.transform(['shower', 'towel', 'hanging', 'handle']).T.toarray()).T
similarities.shape

In [None]:
# Combine the per-term scores multiplicatively: each term contributes a factor
# (1 + score / 2), and the factors are multiplied along axis 0 (the query-term
# axis of `similarities`). Presumably modelled on the CosMul note above.
similarity = np.prod(1 + similarities / 2, axis=0)

In [None]:
# Render the ten images with the highest combined score (argsort is ascending,
# so the best match comes last).
HTML(show_images([imgids[x] for x in np.argsort(similarity)[-10:]]))