In [1]:
#imports
from pathlib import Path
import json
from collections import defaultdict
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
%matplotlib inline
import re, string
import pickle

In [2]:
# load COCO metadata
filename = "data/captions_train2014.json"
with Path(filename).open() as f:
    coco_data = json.load(f)

In [3]:
with Path('data/resnet18_features.pkl').open('rb') as f:
    resnet18_features = pickle.load(f)

In [54]:
class coco_values:
    def __init__(self, coco_data):
        #make variables for images and captions
        self.coco_annotations = coco_data['annotations']
        self.coco_images = coco_data['images']
        self.coco_img_ids = [i['id'] for i in self.coco_images if i['id'] in resnet18_features.keys()]
        self.coco_cap_ids = [i['id'] for i in self.coco_annotations if i['image_id'] in resnet18_features.keys()]
        self.image2cap = {}
        for i in self.coco_annotations:
            if i['image_id'] in self.image2cap.keys():
                ids = self.image2cap[i['image_id']]
                ids.append(i['id'])
                self.image2cap[i['image_id']] = ids
            else:
                self.image2cap[i['image_id']] = [i['id']]
        self.cap2image = {i['id']: i['image_id'] for i in self.coco_annotations}
        self.cap2text = {i['id']: i['caption'] for i in self.coco_annotations}
        self.image2url = {i['id']: i['coco_url'] for i in self.coco_images}
        self.capID2embeded = None
    def get_captions(self, image_id):
        #Find the caption IDs by returning all caption IDs associated with an image ID
        return self.image2cap[image_id]
    def get_image_from_cap(self, caption_id):
        #Find the image ID by seeing if any image had a caption ID match
        return self.cap2image[caption_id]
    def get_caption_text(self, caption_id):
        #Take the text from a given caption ID
        return self.cap2text[caption_id]
    def get_url(self, image_id):
        return self.image2url[image_id]
    def init_id2vec(self, IDF):
        self.capID2embeded = {}
        punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
        for i in self.coco_annotations:
            given_str = punc_regex.sub('', i['caption'])
            str_list = given_str.lower().split()
            self.capID2embeded[i['id']] = [IDF[word] if word in IDF else np.zeros((200,)) for word in str_list]
            
    def compute_id2vec(self, caption_id):
        if self.capID2embeded is None:
            print("No caption to embeded dictionary created! Run init_id2vec before running")
            return None
        return self.capID2embeded[caption_id]

In [55]:
x = coco_values(coco_data)

In [47]:
x.get_captions(318556)

[48, 126, 219, 255, 3555]

In [46]:
x.get_image_from_cap(48)

318556

In [45]:
x.get_caption_text(48)

'A very clean and well decorated empty bathroom'

In [56]:
x.init_id2vec(glove)

In [9]:
#load glove dataset
path = r"data/glove.6B.200d.txt.w2v"
t0 = time.time()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("elapsed %ss" % (t1 - t0))

elapsed 102.4459068775177s


In [11]:
#string2vectors("My dog loves me",glove)

In [12]:
class ImageToWord:
    def __init__(self, context_words, d):
        """ Initializes all of the encoder and decoder layers in our model, setting them
        as attributes of the model.
        
        Parameters
        ----------
        context_words : int
            The number of context words included in our vocabulary
            
        d : int
            The dimensionality of our word embeddings
        """
        
        # STUDENT CODE:
        self.encode = dense(context_words, d, weight_initializer=glorot_normal, bias=False)
        self.decode = dense(d, context_words, weight_initializer=glorot_normal, bias=False)
        
    
    
    def __call__(self, x):
        ''' Passes data as input to our model, performing a "forward-pass".
        
        This allows us to conveniently initialize a model `m` and then send data through it
        to be classified by calling `m(x)`.
        
        Parameters
        ----------
        x : Union[numpy.ndarray, mygrad.Tensor], shape=(M, context_words)
            A batch of data consisting of M words from the context matrix,
                each tracking the number of co-occurences with `context_words` words.
                
        Returns
        -------
        mygrad.Tensor, shape=(M, context_words)
            The result of passing the data through borth the encoder and decoder.
        '''
        
        # STUDENT CODE:
        return self.decode(self.encode(x))
    
    
    @property
    def parameters(self):
        """ A convenience function for getting all the parameters of our model.
        
        This can be accessed as an attribute, via `model.parameters` 
        
        Returns
        -------
        Tuple[Tensor, ...]
            A tuple containing all of the learnable parameters for our model"""
        
        # STUDENT CODE:
        return self.encode.parameters + self.decode.parameters

In [13]:
def resnetID2vectors(N,resnet_model=None):
    if resnet_model is None:
        print("no resnet model given, using ResNet18")
        with Path('data/resnet18_features.pkl').open('rb') as f:
            resnet_model = pickle.load(f)
    return [resnet_model[i] for i in N if i in resnet_model.keys()]

In [14]:
#resnetID2vectors([318556])

In [15]:
def generate_data(image_ids, coco, num_good_captions: int, triples_per_caption: int = 10,tournment_size=25):
    triples = []
    for i in range(triples_per_caption):
        #getting good image
        good_image_id = random.choice(image_ids)
        
        #getting caption
        good_caption_id: int = random.choice(coco.get_captions(good_image_id))
        good_caption_embedding = string2vectors(good_caption_id,glove)
        
        #add triples (pairs for now)
        triples.append((good_caption_id, good_image_id))
        
    np.random.shuffle(triples)
    
    return triples

In [16]:
def cos_dist(di, dj):
    dot_prod = np.vdot(di, dj)
    denom = sum(i**2 for i in di.T)**0.5 * sum(j**2 for j in dj.T)**0.5
    cosine_dist = 1 - dot_prod/denom
    return float(cosine_dist)

In [70]:
def caption2badimage(embedded_caption, good_image_id, possible_im_ids, coco, tournament_size):
    best_score = -1.0
    best_bad_img_id = -1
    caption_embedding = coco.get_captions(good_image_id)
    for _ in range(tournament_size):
        bad_image_id = good_image_id
        while bad_image_id == good_image_id:
            bad_image_id = np.random.choice(possible_im_ids)
        
        bad_caption_id = np.random.choice(coco.get_captions(bad_image_id))
        
        bad_embedding = coco.compute_id2vec(bad_caption_id)
        sim = np.matmul(caption_embedding,bad_embedding)
        
        if sim > best_score:
            best_score = sim
            best_bad_img_id = bad_image_id        
    return best_bad_img_id

In [71]:
ID = 48
caption2badimage(x.compute_id2vec(ID), x.get_image_from_cap(ID),x.coco_img_ids, x,1)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 9 is different from 5)