In [1]:
import html
import json
from collections import Counter

import numpy as np
import toolz
import tqdm
from IPython.display import Image, HTML

In [2]:
# Load the per-image caption records (the 'images' list of the dataset JSON).
# The context manager closes the file as soon as parsing finishes instead of
# leaving the handle open for the lifetime of the kernel.
with open('/Users/kcarnold/src/ImageCaptioning.pytorch/data/dataset_coco.json') as dataset_file:
    images = json.load(dataset_file)['images']

In [3]:
def _load_coco_image_records(annotations_path):
    """Return the 'images' list from one COCO annotations JSON file.

    The file is opened in a ``with`` block so its handle is closed once the
    JSON has been parsed.
    """
    with open(annotations_path) as annotations_file:
        return json.load(annotations_file)['images']


# Map image id -> 'coco_url' using both the train2017 and val2017 annotation
# files. Files are read in the listed order, so if an id appeared in both,
# the val2017 entry would win (same precedence as concatenating the lists).
id2url = {
    img['id']: img['coco_url']
    for annotations_path in (
        '/Data/COCO/annotations/captions_train2017.json',
        '/Data/COCO/annotations/captions_val2017.json',
    )
    for img in _load_coco_image_records(annotations_path)
}

In [4]:
Counter(img['split'] for img in images)

Counter({'restval': 30504, 'test': 5000, 'train': 82783, 'val': 5000})

In [5]:
images_by_split = toolz.groupby('split', images)
valid_images = images_by_split['val']
len(valid_images)

5000

In [6]:
def coco_url(cocoid):
    """Return the URL stored for ``cocoid`` in the ``id2url`` lookup table.

    Raises ``KeyError`` if the id is not present in ``id2url``.
    """
    # The URL is looked up rather than formatted from the id, because the
    # directory component (train2017/ vs val2017/) is not derivable from the
    # id alone.
    url = id2url[cocoid]
    return url

In [7]:
coco_url(valid_images[0]['cocoid'])

'http://images.cocodataset.org/train2017/000000184613.jpg'

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [11]:
def join_captions(image):
    """Flatten all captions of one image record into a single string.

    Each entry of ``image['sentences']`` contributes one line made of its
    ``'tokens'`` joined by single spaces; lines are separated by newlines.
    An image with no sentences yields the empty string.
    """
    caption_lines = []
    for sentence in image['sentences']:
        caption_lines.append(' '.join(sentence['tokens']))
    return '\n'.join(caption_lines)

In [12]:
joined_captions = [join_captions(image) for image in images]

In [42]:
# TF-IDF over unigrams and bigrams, fitted on the joined captions of every
# image (all splits). min_df=5 drops n-grams that occur in fewer than five
# caption documents. fit() returns the fitted vectorizer itself.
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2)).fit(joined_captions)

# One row per validation image, in the order of images_by_split['val'].
valid_img_by_word = vectorizer.transform(
    list(map(join_captions, images_by_split['val'])))

In [43]:
valid_img_by_word

<5000x96019 sparse matrix of type '<class 'numpy.float64'>'
	with 294544 stored elements in Compressed Sparse Row format>

In [31]:
def show_images(indices):
    """Build an HTML snippet showing validation images with ids and captions.

    Parameters
    ----------
    indices : iterable of int
        Positions into ``images_by_split['val']``.

    Returns
    -------
    str
        One inline-block ``<div>`` per index, containing the COCO id, an
        ``<img>`` tag pointing at ``coco_url(cocoid)``, and one ``<div>`` per
        raw caption. Blocks are joined with newlines; an empty ``indices``
        gives the empty string. Wrap the result in ``HTML(...)`` to render it.
    """
    valid_images = images_by_split['val']

    def render_image(idx):
        image = valid_images[idx]
        # Captions and URLs come straight from dataset files. Escape them so
        # characters such as '<' or '&' are shown literally instead of being
        # interpreted as markup by the browser.
        captions = '\n'.join(
            '<div>{}</div>'.format(html.escape(sent['raw'], quote=False))
            for sent in image['sentences']
        )
        return '<div style="display: inline-block;"><div>{}</div><img src="{}">{}</div>'.format(
            image['cocoid'], html.escape(coco_url(image['cocoid'])), captions)

    return '\n'.join(render_image(idx) for idx in indices)


In [69]:
# Lookup from feature (column) index to its n-gram string.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    id2str = vectorizer.get_feature_names_out()
else:
    id2str = vectorizer.get_feature_names()

In [73]:
# Alternative example queries; swap the active assignment below to try one.
# query_sent = 'a busy city street with cars and people along the streets with high-rise buildings on both sides'
# query_sent = 'a bath room with a white toilet and a white washbasin'
# query_sent = 'a busy street in a historic town with a red bus driving on the street.'
# query_sent = 'a tennis player hits a ball during a game'
query_sent = 'a bride and groom cutting their wedding cake, while a photographer guides them'
# Project the query into the fitted TF-IDF space (a single-row sparse matrix).
query_vec = vectorizer.transform([query_sent])
# Print every stored entry of the query vector as "<n-gram> <weight>".
# DOK keys are (row, column) tuples, so k[1] is the feature index.
for k, v in query_vec.todok().items():
    print(id2str[k[1]], v)

and 0.049313356601274304
and groom 0.24176942501198595
bride 0.2359050975189118
bride and 0.24083625480048207
cake 0.15828851385295933
cake while 0.26344799436349564
cutting 0.17432786940553632
cutting their 0.27101955826310037
groom 0.23939787098188633
groom cutting 0.2866464364987647
guides 0.2965858315013846
photographer 0.24734421301588444
their 0.12373290091753768
their wedding 0.25553476029747846
them 0.1477141345464511
wedding 0.2207734177910256
wedding cake 0.23606300763758728
while 0.10336581020863053
while photographer 0.34779625254339697


In [74]:
# Score every validation image against the query with a TF-IDF dot product and
# show the ten highest-scoring images, best first.
# .toarray() is used instead of the legacy `.A` alias, which only exists on the
# older sparse *matrix* classes and not on SciPy sparse arrays.
HTML(show_images(
    np.argsort(valid_img_by_word.dot(query_vec.T).toarray().ravel())[-10:][::-1]
))

# Similar Pairs

In [12]:
import scipy.sparse

In [13]:
# Caption similarity between every pair of validation images (dot product of
# their TF-IDF rows). Only the strict upper triangle (k=1) is kept, so each
# unordered pair appears once and self-similarities are dropped. COO format
# exposes the parallel .row / .col / .data arrays used further down.
sims = scipy.sparse.triu(valid_img_by_word @ valid_img_by_word.T, k=1).tocoo()

In [14]:
sims

<5000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 12497377 stored elements in COOrdinate format>

In [15]:
sorts = np.argsort(sims.data)[::-1]
sorts[-10:]

array([ 309873, 1407884, 2690014,  311103, 1407850, 2687441, 1407954,
       1407943,  312064, 1407857])

In [16]:
def get_most_similar_unique_pairs(sim_matrix_coo, num_pairs):
    """Greedily select the highest-scoring pairs so that no index is reused.

    Stored entries of ``sim_matrix_coo`` are visited from largest to smallest
    value. An entry is accepted only if its row and column differ and neither
    index already belongs to a previously accepted pair.

    Parameters
    ----------
    sim_matrix_coo : sparse matrix in COO format
        Must expose parallel ``.row``, ``.col`` and ``.data`` arrays.
    num_pairs : int
        Maximum number of pairs to return. Fewer are returned when the matrix
        runs out of usable entries; a non-positive value yields an empty list.

    Returns
    -------
    list of (int, int)
        ``(row, col)`` tuples of plain Python ints, most similar pair first.
    """
    similar_pairs = []
    # Guard explicitly: with an equality check alone, num_pairs == 0 would
    # never trigger the break and every greedy pair would be returned.
    if num_pairs <= 0:
        return similar_pairs
    used_images = set()
    for data_idx in np.argsort(sim_matrix_coo.data)[::-1]:
        # Convert NumPy scalars to ints so results print and serialize cleanly.
        a = int(sim_matrix_coo.row[data_idx])
        b = int(sim_matrix_coo.col[data_idx])
        # An item matched with itself is not a pair (only possible when the
        # caller passes a matrix that still contains its diagonal).
        if a == b or a in used_images or b in used_images:
            continue
        used_images.update((a, b))
        similar_pairs.append((a, b))
        if len(similar_pairs) >= num_pairs:
            break
    return similar_pairs
# The twelve most similar validation-image pairs with no image used twice,
# taken from the strict upper triangle of the caption-similarity matrix.
similar_pairs = get_most_similar_unique_pairs(
    scipy.sparse.triu(valid_img_by_word @ valid_img_by_word.T, k=1).tocoo(),
    num_pairs=12,
)

In [17]:
similar_pairs

[(836, 3414),
 (3322, 3323),
 (2622, 4295),
 (563, 1758),
 (1642, 3163),
 (3716, 3835),
 (4352, 4353),
 (489, 1614),
 (2024, 3063),
 (472, 1968),
 (1370, 3440),
 (1894, 3559)]

In [19]:
HTML('\n'.join(show_images(pair) for pair in similar_pairs))

In [115]:
def get_stimulus_images(num_pairs=12, seed=0):
    """Pick the most similar validation-image pairs and shuffle them reproducibly.

    Parameters
    ----------
    num_pairs : int, optional
        How many pairs to request from ``get_most_similar_unique_pairs``
        (default 12, the value previously hard-coded here).
    seed : int, optional
        Seed for the ``RandomState`` that drives the shuffling (default 0,
        the value previously hard-coded here).

    Returns
    -------
    list of list
        Two-element lists of indices into the validation images. Both the
        order of the pairs and the order within each pair are shuffled; the
        result is fully determined by the inputs and ``seed``.
    """
    sim_matrix_coo = scipy.sparse.triu(valid_img_by_word.dot(valid_img_by_word.T), 1).tocoo()
    # Lists rather than tuples, so each pair can be shuffled in place below.
    similar_pairs = [list(pair) for pair in get_most_similar_unique_pairs(sim_matrix_coo, num_pairs)]
    rs = np.random.RandomState(seed)
    # Randomize presentation order so it does not follow similarity rank...
    rs.shuffle(similar_pairs)
    # ...and which member of each pair comes first.
    for pair in similar_pairs:
        rs.shuffle(pair)
    return similar_pairs
stimuli = get_stimulus_images()

In [116]:
HTML('\n'.join(show_images(pair) for pair in stimuli))

# Here are the images that we're using in the experiment!

In [118]:
[[valid_images[i]['cocoid'] for i in pair] for pair in stimuli]

[[275449, 349130],
 [396295, 301595],
 [431140, 341011],
 [227326, 523217],
 [200451, 313214],
 [223777, 401320],
 [247576, 315976],
 [71815, 240739],
 [240275, 476864],
 [527375, 164170],
 [236272, 467791],
 [280480, 440500]]

In [100]:

vectorizer.idf_[vectorizer.vocabulary_['person']]

3.0180957701134536

In [77]:
def show_caps(indices):
    """Print the raw captions of the given validation images.

    ``indices`` are positions into ``images_by_split['val']``. Each image's
    captions are printed one per line, with a blank line between images.
    """
    caption_blocks = []
    for idx in indices:
        raw_captions = [sent['raw'] for sent in images_by_split['val'][idx]['sentences']]
        caption_blocks.append('\n'.join(raw_captions))
    print('\n\n'.join(caption_blocks))

In [104]:
valid_images[836]['cocoid']

236272

In [95]:
# HTML(show_images([153, 3323]))
# HTML(show_images([489, 1614]))
HTML(show_images([836, 1508]))
# HTML(show_images([4352, 4353]))

In [35]:
[sent['raw'] for sent in images_by_split['val'][0]['sentences']]

['A child holding a flowered umbrella and petting a yak.',
 'A young man holding an umbrella next to a herd of cattle.',
 'a young boy barefoot holding an umbrella touching the horn of a cow',
 'A young boy with an umbrella who is touching the horn of a cow.',
 'A boy holding an umbrella while standing next to livestock.']

In [42]:
# Indices of the ten validation images whose captions are most similar to
# image 0, in ascending order of similarity (image 0 itself comes last).
# The row is computed directly from the TF-IDF matrix: `sims` is an
# upper-triangular COO matrix, which cannot be indexed by row and does not
# contain the diagonal or lower triangle.
np.argsort(valid_img_by_word[0].dot(valid_img_by_word.T).toarray()[0])[-10:]

array([ 293, 1648,  755, 4121, 3980, 1459, 3607,  943, 2166,    0])

In [53]:
closest_data_indices = np.argsort(sims.data)

In [46]:
from sklearn.metrics import pairwise

In [50]:
# Nearest neighbour of validation image 0 among all the *other* validation
# images. Y starts at row 1, so the returned index is relative to that slice:
# add 1 to get the position in the full validation list.
pairwise.pairwise_distances_argmin_min(
    X=valid_img_by_word[0],
    Y=valid_img_by_word[1:],
)


(array([2165]), array([0.91825199]))

In [52]:
[sent['raw'] for sent in images_by_split['val'][2166]['sentences']]

['Young boy holding an umbrella on a rainy day.',
 'The boy is walking in the rain with an umbrella.',
 'a boy holding an umbrella as he walks down the road ',
 'a young boy walks towards the camera holding an umbrella ',
 'A young boy with an umbrella covering on a rainy day']

In [45]:
# Captions of the ten validation images most similar to image 0, most similar
# first (image 0 itself leads the list). The similarity row is computed from
# the TF-IDF matrix because `sims` is an upper-triangular COO matrix and
# cannot be indexed by row; printing is delegated to the show_caps helper.
show_caps(np.argsort(valid_img_by_word[0].dot(valid_img_by_word.T).toarray()[0])[-10:][::-1])

A child holding a flowered umbrella and petting a yak.
A young man holding an umbrella next to a herd of cattle.
a young boy barefoot holding an umbrella touching the horn of a cow
A young boy with an umbrella who is touching the horn of a cow.
A boy holding an umbrella while standing next to livestock.

Young boy holding an umbrella on a rainy day.
The boy is walking in the rain with an umbrella.
a boy holding an umbrella as he walks down the road 
a young boy walks towards the camera holding an umbrella 
A young boy with an umbrella covering on a rainy day

A woman walking down a street holding an umbrella.
A woman with an umbrella and a man walking.
A woman is standing beside an older gentleman while holding an umbrella.
A woman holding an umbrella while a man walks behind her.
A woman walking with a man and holding an umbrella looks off to the side. 

an image of a woman holding an umbrella with her coat on
A woman holding an umbrella in the rain
A woman holding a white umbrella wi

In [9]:
import h5py

  from ._conv import register_converters as _register_converters


In [30]:
# Open the feature store read-only. The mode is given explicitly because
# h5py versions before 3.0 defaulted to a read/write mode that could create
# or modify the file.
img_data_file = h5py.File('/Users/kcarnold/code/textrec/models-aside/feats_by_imgid.h5', 'r')

In [31]:
def get_vec_for_image(cocoid):
    """Return the stored features for ``cocoid`` averaged over their first axis.

    The dataset is looked up in ``img_data_file`` under the key
    ``str(cocoid)`` and read fully into memory before averaging.
    """
    # presumably each row is one region/location feature vector — TODO confirm
    stored_features = img_data_file[str(cocoid)][:]
    return stored_features.mean(axis=0)

In [32]:
# One mean feature vector per *validation* image, in the order of valid_images.
# show_similar_images uses the same index for rows of the pairwise-distance
# matrix built from this array and for valid_images, so the rows must follow
# valid_images; iterating over every image in `images` would misalign them.
mean_features = np.array([
    get_vec_for_image(img['cocoid'])
    for img in tqdm.tqdm_notebook(valid_images)
])




In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
pdists = pairwise_distances(mean_features)

In [None]:
def show_similar_images(valid_idx, n=10):
    """Return ``<img>`` tags for the ``n`` images nearest to ``valid_idx``.

    Neighbours are the ``n`` smallest entries of row ``valid_idx`` of
    ``pdists``, closest first. Each index is resolved through
    ``valid_images`` and ``coco_url``; the tags are joined with newlines.
    """
    nearest_indices = np.argsort(pdists[valid_idx])[:n]
    img_tags = []
    for idx in nearest_indices:
        cocoid = valid_images[idx]['cocoid']
        img_tags.append('<img src="{}">'.format(coco_url(cocoid)))
    return '\n'.join(img_tags)
HTML(show_similar_images(1))

In [18]:
HTML(show_similar_images(3))