In [1]:
#imports
from pathlib import Path
import json
from collections import defaultdict
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
%matplotlib inline
import re, string
import pickle
from collections import Counter

In [2]:
# load COCO metadata
filename = "data/captions_train2014.json"
with Path(filename).open() as f:
    coco_data = json.load(f)

In [3]:
class Embedder:
    def __init__(self, model, idf):
        self.model = model
        self.idf = idf
    def __call__(self, text):
        x = sum(_get(self.model, w, 0.0) * self.idf.get(w, 0) for w in get_words(text))
        return x / np.linalg.norm(x)
    

In [14]:
class coco_values:
    def __init__(self, coco_data):
        #make variables for images and captions
        self.coco_annotations = coco_data['annotations']
        self.coco_images = coco_data['images']
        self.coco_img_ids = [i['id'] for i in self.coco_images if i['id'] in resnet18_features.keys()]
        self.coco_cap_ids = [i['id'] for i in self.coco_annotations if i['image_id'] in resnet18_features.keys()]
        self.image2cap = {}
                
        for i in self.coco_annotations:
            if i['image_id'] in self.image2cap.keys():
                ids = self.image2cap[i['image_id']]
                ids.append(i['id'])
                self.image2cap[i['image_id']] = ids
            else:
                self.image2cap[i['image_id']] = [i['id']]
        self.cap2image = {i['id']: i['image_id'] for i in self.coco_annotations}
        self.cap2text = {i['id']: i['caption'] for i in self.coco_annotations}
        self.image2url = {i['id']: i['coco_url'] for i in self.coco_images}
        self.capID2embeded = None
        self.captions = list(self.cap2text.values())
    def get_captions(self, image_id):
        #Find the caption IDs by returning all caption IDs associated with an image ID
        return self.image2cap[image_id]
    def get_image_from_cap(self, caption_id):
        #Find the image ID by seeing if any image had a caption ID match
        return self.cap2image[caption_id]
    def get_caption_text(self, caption_id):
        #Take the text from a given caption ID
        return self.cap2text[caption_id]
    def get_url(self, image_id):
        return self.image2url[image_id]
    def init_id2vec(self, model):
        self.capID2embeded = {}
        idf = inverse_doc_freq(self.captions)
        self.text_embedder = Embedder(model,idf)
        self.capID2embeded = {_id: self.text_embedder(self.get_caption_text(_id)) for _id in self.coco_cap_ids}

    def compute_id2vec(self, caption_id):
        if self.capID2embeded is None:
            print("No caption to embeded dictionary created! Run init_id2vec before running")
            return None
        
        if not isinstance(caption_id,type(np.array(1))):
            return np.array(self.capID2embeded[caption_id])
        return [self.compute_id2vec(i) for i in caption_id]
    

In [5]:
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
def comp_inverse_doc_freq(documents):
    df = Counter()
    for doc in documents:
        df.update(set(get_words(doc)))
    return df
def inverse_doc_freq(documents):
    df = comp_inverse_doc_freq(documents)
    return {word: np.log10(len(documents) / count) for word, count in df.items()}
def get_words(text):
    return punc_regex.sub('', text.lower()).split()
def _get(model, k, value = None, verbose = None):
    try:
        return model[k]
    except KeyError:
        if verbose:
            print(f"'(k)' is not in the word-embedding vocabulary")
        return value

In [6]:
with Path('data/resnet18_features.pkl').open('rb') as f:
    resnet18_features = pickle.load(f)

In [15]:
x = coco_values(coco_data)

In [8]:
x.get_captions(318556)

[48, 126, 219, 255, 3555]

In [9]:
x.get_image_from_cap(48)

318556

In [10]:
x.get_caption_text(48)

'A very clean and well decorated empty bathroom'

In [11]:
#load glove dataset
path = r"data/glove.6B.200d.txt.w2v"
t0 = time.time()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("elapsed %ss" % (t1 - t0))

elapsed 68.24052047729492s


In [16]:
x.init_id2vec(glove)

In [12]:
class ImageToWord:
    """A linear encoder network that produces normalized vectors
    in the embedded space"""

    def __init__(self, in_dim: int, out_dim: int):
        self.dense = dense(
            in_dim, out_dim, weight_initializer=glorot_normal, bias=False
        )

    def __call__(self, x: np.ndarray) -> mg.Tensor:
        """
        Parameters
        ----------
        x : numpy.ndarray, shape-(N, C)

        Returns
        -------
        mygrad.Tensor, shape-(N, D)
            The normalized linear projection of each of the N data
        """
        out = self.dense(x)
        return out / mg.sqrt(mg.einsum("ij, ij -> i", out, out)).reshape(-1, 1)

    @property
    def parameters(self) -> Tuple[mg.Tensor, ...]:
        return self.dense.parameters

    def save_model(self, path):
        """Path to .npz file where model parameters will be saved."""
        with open(path, "wb") as f:
            np.savez(f, *(x.data for x in self.parameters))

    def load_model(self, path):
        with open(path, "rb") as f:
            for param, (name, array) in zip(self.parameters, np.load(f).items()):
                param.data[:] = array

In [13]:
def resnetID2vectors(N,resnet_model=None):
    if resnet_model is None:
        print("no resnet model given, using ResNet18")
        with Path('data/resnet18_features.pkl').open('rb') as f:
            resnet_model = pickle.load(f)
    return [resnet_model[i] for i in N if i in resnet_model.keys()]

In [14]:
#resnetID2vectors([318556])

In [28]:
def generate_data(image_ids, coco, num_good_captions: int, triples_per_caption: int = 10,tournment_size=25):
    triples = []
    for i in range(triples_per_caption):
        #getting good image
        good_image_id = np.random.choice(image_ids)
        
        #getting caption
        good_caption_id: int = np.random.choice(coco.get_captions(good_image_id))
        good_caption_embedding = coco.compute_id2vec(good_caption_id)
        bad_image_id = caption2badimage(good_caption_embedding, good_image_id, coco.coco_img_ids, coco, tournment_size)
        
        #add triples
        triples.append((good_caption_id, good_image_id, bad_image_id))
        
    np.random.shuffle(triples)
    
    return triples

In [127]:
def caption2badimage(caption_embedding, good_image_id, possible_im_ids, coco, tournament_size):
    best_score = -1.0
    best_bad_img_id = -1
    for _ in range(tournament_size):
        bad_image_id = good_image_id
        while bad_image_id == good_image_id: #choosing a distinct bad image id from the set of available 
            bad_image_id = np.random.choice(possible_im_ids)
        
        bad_caption_id = np.random.choice(coco.get_captions(bad_image_id))
        bad_embedding = coco.compute_id2vec(bad_caption_id)
        sim = np.matmul(caption_embedding,bad_embedding)
        
        if sim > best_score:
            best_score = sim
            best_bad_img_id = bad_image_id
            
    print(best_score)
    return best_bad_img_id

In [144]:
ID = -1
while ID not in x.coco_cap_ids:
    ID = int(np.random.choice(x.coco_cap_ids))
print(x.get_url(x.get_image_from_cap(ID)))
print(x.get_url(caption2badimage(x.compute_id2vec(ID), x.get_image_from_cap(ID),x.coco_img_ids, x,1000)))

http://images.cocodataset.org/train2014/COCO_train2014_000000571395.jpg


TypeError: isinstance() arg 2 must be a type or tuple of types

In [30]:
generate_data(x.coco_img_ids, x, 100)

[(716588, 303617, 36029),
 (666909, 559544, 230462),
 (49896, 431628, 74429),
 (64868, 244530, 113857),
 (374390, 265008, 24782),
 (829308, 181584, 558584),
 (490160, 266436, 525876),
 (407049, 557974, 190219),
 (525853, 42819, 187277),
 (746675, 482798, 77195)]