In [18]:
#imports
from pathlib import Path
import json
from collections import defaultdict
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
%matplotlib inline
import re, string

In [9]:
# load COCO metadata
filename = "data/captions_train2014.json"
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's default text encoding, which differs between systems and
# can fail or mis-decode non-ASCII characters in the captions.
with Path(filename).open(encoding="utf-8") as f:
    coco_data = json.load(f)

In [10]:
class coco_values:
    """Lookup helper over parsed COCO caption metadata.

    The annotation records are indexed once at construction time so that each
    query is a dictionary lookup instead of a scan over every annotation.
    The indices are snapshots: annotations added to ``coco_annotations`` after
    construction are not visible to the lookup methods.
    """

    def __init__(self, coco_data):
        """Store the raw records and build the caption/image indices.

        Parameters
        ----------
        coco_data : dict
            Must provide ``coco_data['annotations']`` (records with ``'id'``
            and ``'image_id'`` keys, plus ``'caption'`` for text lookups) and
            ``coco_data['images']`` (records with an ``'id'`` key).
        """
        # make variables for images and captions
        self.coco_annotations = coco_data['annotations']
        self.coco_images = coco_data['images']
        self.coco_img_ids = [img['id'] for img in self.coco_images]
        self.coco_cap_ids = [ann['id'] for ann in self.coco_annotations]

        # caption ID -> annotation record. ``setdefault`` keeps the FIRST record
        # for a duplicated caption ID, matching the original "first match" scan.
        self._annotation_by_cap_id = {}
        # image ID -> caption IDs, in the order they appear in the annotations.
        self._cap_ids_by_image_id = {}
        for ann in self.coco_annotations:
            self._annotation_by_cap_id.setdefault(ann['id'], ann)
            self._cap_ids_by_image_id.setdefault(ann['image_id'], []).append(ann['id'])

    def _annotation_for(self, caption_id):
        """Return the annotation record for ``caption_id``.

        Raises
        ------
        IndexError
            If no annotation has that caption ID (the same exception type the
            previous list-scan implementation raised).
        """
        try:
            return self._annotation_by_cap_id[caption_id]
        except KeyError:
            raise IndexError("unknown caption id: %r" % (caption_id,)) from None

    def get_captions(self, image_id):
        """Return all caption IDs associated with ``image_id``.

        Returns a new list on every call (empty if the image ID is unknown),
        so callers may mutate the result without corrupting the index.
        """
        return list(self._cap_ids_by_image_id.get(image_id, ()))

    def get_image_from_cap(self, caption_id):
        """Return the image ID that ``caption_id`` belongs to.

        Raises ``IndexError`` if the caption ID is unknown.
        """
        return self._annotation_for(caption_id)['image_id']

    def get_caption_text(self, caption_id):
        """Return the caption text stored for ``caption_id``.

        Raises ``IndexError`` if the caption ID is unknown.
        """
        return self._annotation_for(caption_id)['caption']

In [11]:
# Wrap the parsed COCO metadata in the lookup helper queried by the cells below.
x = coco_values(coco_data)

In [12]:
# Example: list the caption IDs attached to image 318556.
x.get_captions(318556)

[48, 126, 219, 255, 3555]

In [13]:
# Example: look up the image ID that caption 48 belongs to.
x.get_image_from_cap(48)

318556

In [14]:
# Example: fetch the caption text stored for caption 48.
x.get_caption_text(48)

'A very clean and well decorated empty bathroom'

In [16]:
# Load the GloVe word vectors from a word2vec-format text file (binary=False)
# and time the load; the recorded run below took roughly 98 seconds.
path = r"data/glove.6B.200d.txt.w2v"
t0 = time.time()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
# Report the elapsed wall-clock seconds.
print("elapsed %ss" % (t1 - t0))

elapsed 97.68171954154968s


In [23]:
def string2vectors(given_str,IDF=None):
    """Convert a string into a list of word-embedding vectors, one per token.

    The string is stripped of ASCII punctuation, lower-cased and split on
    whitespace. Each token found in ``IDF`` maps to its embedding; each token
    not found maps to a fresh all-zero vector of the embedding dimensionality.

    Parameters
    ----------
    given_str : str
        Text to embed.
    IDF : mapping of word -> vector, optional
        Despite the name, this is the embedding lookup (e.g. the loaded GloVe
        ``KeyedVectors``). It must support ``word in IDF`` and ``IDF[word]``.
        If ``None``, the GloVe-200d file is loaded from disk on every call,
        which is slow; pass an already-loaded model when possible.

    Returns
    -------
    list of numpy.ndarray
        One vector per token, in token order; empty if the string has no tokens.
    """
    if IDF is None:
        print("No IDF given, loading GloVe-200d")
        path = r"data/glove.6B.200d.txt.w2v"
        t0 = time.time()
        IDF = KeyedVectors.load_word2vec_format(path, binary=False)
        t1 = time.time()
        print("GloVe loaded in: %ss" % (t1-t0))
    # Size of the zero vector used for out-of-vocabulary words: take it from
    # the model when it exposes ``vector_size``, else fall back to the 200
    # dimensions of the default GloVe file.
    dim = getattr(IDF, "vector_size", 200)
    punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    tokens = punc_regex.sub('', given_str).lower().split()
    # ``np.zeros`` (the original called the non-existent ``np.zeroes``, which
    # raised AttributeError on the first out-of-vocabulary word).
    return [IDF[word] if word in IDF else np.zeros((dim,)) for word in tokens]

In [24]:
# Sanity check: embed a short sentence using the GloVe vectors loaded above.
string2vectors("My dog loves me",glove)

[array([ 3.0380e-01,  1.8126e-01,  4.6583e-01, -6.6440e-01, -4.4070e-01,
         1.7174e-01, -5.0796e-01, -4.2103e-01,  1.6000e-01,  6.5258e-01,
        -5.7537e-01,  3.7265e-01,  6.9735e-01,  7.1328e-01,  1.7069e-01,
         4.0841e-01, -6.1980e-01,  5.2908e-01,  1.1537e-01,  2.0981e-01,
         5.6525e-01,  2.9440e+00,  7.0009e-01, -1.8037e-01,  1.0374e-01,
        -4.3081e-01, -1.3472e-02,  1.5318e-01, -5.7869e-01, -3.2528e-01,
        -7.2414e-01, -1.4693e-01,  1.3082e-01, -4.4664e-01, -5.2502e-01,
         2.5720e-01, -2.1991e-01, -6.1173e-02, -1.5098e-01,  2.5422e-01,
        -3.6608e-01,  3.5592e-01, -3.4717e-01,  5.6783e-01, -3.9235e-01,
         4.1060e-01,  5.7588e-01,  4.0124e-02, -5.8766e-02,  4.0908e-01,
         2.6878e-01, -1.2518e-01,  1.8262e-01,  8.3374e-02,  2.3665e-01,
        -2.9179e-01,  4.0927e-01, -3.1596e-01, -1.2123e-01, -1.2644e-01,
         2.1737e-01, -4.0186e-01, -7.3033e-01, -1.1869e-01, -7.8917e-01,
        -5.7036e-02, -4.6895e-01,  6.6060e-02,  5.5