# Imagenet Experiment

## Goal 
This notebook first extracts the 1,000 most frequent caption keywords that are also available as ImageNet synsets. It then downloads the images for each keyword and generates a caption for every image.

In [None]:
import json
import os
import pandas as pd
import numpy as np
import cPickle as pickle
import hickle
from collections import Counter
from nltk.corpus import stopwords 
from nltk.corpus import wordnet as wn
import urllib
import tarfile
from PIL import Image
from core.vggnet import Vgg19
import tensorflow as tf
from scipy import ndimage
from core.solver import CaptioningSolver
from core.model import CaptionGenerator

%load_ext autoreload
%autoreload 2

# Constants

In [None]:
# Caption annotation JSON read by _process_caption_data.
caption_file = 'data/annotations/captions_train2014.json'
# Directory prefix joined onto every image file name in the caption table.
image_dir = 'image/train2014_resized'
# Captions with more words than this are dropped.
max_length = 15
# Minimum number of occurrences a word needs to enter the vocabulary.
word_count_threshold = 100
# Path handed to Vgg19 when the feature extractor is built.
vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'
# Number of images fed through the VGG network per sess.run call.
batch_size = 50


# Dictionary Building 

In [None]:
def _process_caption_data(caption_file, image_dir, max_length):
    with open(caption_file) as f:
        caption_data = json.load(f)

    # id_to_filename is a dictionary such as {image_id: filename]} 
    id_to_filename = {image['id']: image['file_name'] for image in caption_data['images']}

    # data is a list of dictionary which contains 'captions', 'file_name' and 'image_id' as key.
    data = []
    for annotation in caption_data['annotations']:
        image_id = annotation['image_id']
        annotation['file_name'] = os.path.join(image_dir, id_to_filename[image_id])
        data += [annotation]

    # convert to pandas dataframe (for later visualization or debugging)
    caption_data = pd.DataFrame.from_dict(data)
    del caption_data['id']
    caption_data.sort_values(by='image_id', inplace=True)
    caption_data = caption_data.reset_index(drop=True)

    del_idx = []
    for i, caption in enumerate(caption_data['caption']):
        caption = caption.replace('.', '').replace(',', '').replace("'", "").replace('"', '')
        caption = caption.replace('&', 'and').replace('(', '').replace(")", "").replace('-', ' ')
        caption = " ".join(caption.split())  # replace multiple spaces

        caption_data.set_value(i, 'caption', caption.lower())
        if len(caption.split(" ")) > max_length:
            del_idx.append(i)

    # delete captions if size is larger than max_length
    print "The number of captions before deletion: %d" % len(caption_data)
    caption_data = caption_data.drop(caption_data.index[del_idx])
    caption_data = caption_data.reset_index(drop=True)
    print "The number of captions after deletion: %d" % len(caption_data)
    return caption_data


In [None]:
def _build_vocab(annotations, threshold=1):
    counter = Counter()
    max_len = 0
    for i, caption in enumerate(annotations['caption']):
        words = caption.split(' ') # caption contrains only lower-case words
        for w in words:
            counter[w] +=1
        
        if len(caption.split(" ")) > max_len:
            max_len = len(caption.split(" "))

    vocab = [word for word in counter if counter[word] >= threshold]
    print ('Filtered %d words to %d words with word count threshold %d.' % (len(counter), len(vocab), threshold))

    word_to_idx = {u'<NULL>': 0, u'<START>': 1, u'<END>': 2}
    idx = 3
    for word in vocab:
        word_to_idx[word] = idx
        idx += 1
    print "Max length of caption: ", max_len
    return word_to_idx

In [None]:
def _build_top1k_vocab(annotations):
    counter = Counter()
    for i, caption in enumerate(annotations['caption']):
        words = caption.split(' ') # caption contrains only lower-case words
        for w in words:
            counter[w] +=1
    #Read imagenet synsets
    with open('./data/imagenet.synsets','r') as f:
        synsets = f.readlines()
    imagenet_synsets = { w.rstrip():True for w in synsets}
    
    top1k = []
    frequentWords = counter.most_common()
    i = 0
    
    while len(top1k) < 1000 and i< len(frequentWords):
        w = frequentWords[i][0]
        ss = wn.synsets(frequentWords[i][0])
        j = 0
        while j< len(ss) and ss[j].pos() != 'n': j += 1
        if j<len(ss):
            wnid = ss[j].pos() + str(ss[j].offset()).zfill(8)
            if wnid in imagenet_synsets:
                top1k.append((wnid, w))
        i += 1
    return top1k

In [None]:

# NOTE(review): train_dataset is assigned in a later cell of this notebook,
# so that cell has to be executed before this one.
a = _build_top1k_vocab(train_dataset)

In [None]:
# Display the value returned by _build_top1k_vocab.
a

In [None]:
# Look up the WordNet synsets for the word 'his'.
wn.synsets('his')

In [None]:
# Load and clean the training captions using the notebook-level constants.
train_dataset = _process_caption_data(caption_file=caption_file,
                                      image_dir=image_dir,
                                      max_length=max_length)
                                      

In [None]:
# Build the word -> index vocabulary from the cleaned training captions.
word_to_idx = _build_vocab(annotations=train_dataset, threshold=word_count_threshold)
# save_pickle is neither defined nor imported in this notebook, so the
# vocabulary is written with the already-imported pickle module instead.
with open('./data/word_to_idx.pkl', 'wb') as f:
    pickle.dump(word_to_idx, f, pickle.HIGHEST_PROTOCOL)

            

In [None]:
# Build the keyword list from the cleaned training captions.
top1k = _build_top1k_vocab(train_dataset)

In [None]:
# Display the keyword list.
top1k

In [None]:
# Derive the WordNet id (part-of-speech letter + 8-digit offset) for 'hats'.
ss = wn.synsets('hats')
# Take the first noun sense explicitly rather than whatever happens to be
# first in the list, mirroring the noun-only lookup in _build_top1k_vocab.
noun = next(s for s in ss if s.pos() == 'n')
wnid = noun.pos() + str(noun.offset()).zfill(8)
print(wnid)

In [None]:
# Show every WordNet synset returned for 'hats'.
ss= wn.synsets('hats')
ss

In [None]:
# Download the image archive of synset `wnid` from image-net.org and store it
# as <wnid>.tar in the current working directory.
# NOTE(review): the username and access key are hard-coded in the query string
# below, so anyone who can read this notebook can read them. Consider loading
# them from the environment and rotating the key.
pre_url = 'http://www.image-net.org/download/synset?wnid='
post_url = '&username=intuinno&accesskey=6be8155ee3d56b5120241b3bda13412d3cc0cd42&release=latest&src=stanford'
testfile = urllib.URLopener()
testfile.retrieve(pre_url+wnid+post_url, wnid+'.tar')

In [None]:
# Per-synset working directories for the raw and the resized images.
cur_dir = os.getcwd()
original_dir = './data/imagenet/%s/original/'%wnid
resized_dir = './data/imagenet/%s/resized/'%wnid

# Test the directory that is actually created; the former check on `wnid`
# made a re-run fail inside os.makedirs because the directory already existed.
if not os.path.exists(original_dir):
    os.makedirs(original_dir)
# Move the downloaded archive only when it is present, so the cell can be
# re-run after the archive has already been moved.
if os.path.exists(wnid+'.tar'):
    os.rename(wnid+'.tar', original_dir + 'data.tar' )
    


In [None]:
# Unpack the downloaded archive into original_dir and delete it afterwards.
# Explicit paths replace the chdir round trip, so a failed extraction can no
# longer leave the notebook in the wrong working directory.
archive_path = os.path.join(original_dir, 'data.tar')
print(os.path.abspath(original_dir))
# NOTE(review): extractall trusts the member names inside the downloaded
# archive; a crafted archive could write outside original_dir.
with tarfile.open(archive_path) as tar:
    tar.extractall(original_dir)
os.remove(archive_path)

In [None]:
def resize_image(image):
    """Center-crop `image` to a square and scale it to 224x224.

    Args:
        image: a PIL image (any object exposing `size`, `crop` and `resize`).

    Returns:
        A new 224x224 image cut from the middle of the input.
    """
    width, height = image.size
    side = min(width, height)
    # Floor division keeps the box integral on Python 2 and 3. Deriving
    # right/bottom from `side` keeps the crop exactly square even when the
    # width/height difference is odd.
    left = (width - side) // 2
    top = (height - side) // 2
    image = image.crop((left, top, left + side, top + side))
    # LANCZOS is the filter ANTIALIAS used to alias; newer Pillow releases
    # no longer provide the ANTIALIAS name.
    return image.resize((224, 224), Image.LANCZOS)

In [None]:
# Resize every downloaded image of the synset into resized_dir.
if not os.path.exists(resized_dir):
    os.makedirs(resized_dir)
print('Start resizing %s images.' % wnid)
image_files = os.listdir(original_dir)
for i, image_file in enumerate(image_files):
    # The source is only read, so plain read-only binary mode is enough.
    with open(os.path.join(original_dir, image_file), 'rb') as f:
        image = Image.open(f)
        # Remember the file format before resizing: images produced by
        # crop/resize no longer carry the format of the source file.
        source_format = image.format
        image = resize_image(image)
        image.save(os.path.join(resized_dir, image_file), source_format)
    if i % 100 == 0:
        print('Resized images: %d/%d' % (i, len(image_files)))
            


In [None]:
# Instantiate the VGG-19 wrapper from the weights file and build its graph.
vggnet = Vgg19(vgg_model_path)
vggnet.build()

In [None]:
# Run every resized image through the VGG network and collect its features.
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    n_examples = len(image_files)
    # One 196x512 feature map per image; every row is overwritten below.
    all_feats = np.ndarray([n_examples, 196,512], dtype=np.float32)

    for start in range(0, n_examples, batch_size):
        # `end` may run past n_examples on the last batch; slicing clips it.
        end = start + batch_size
        image_batch_file = image_files[start:end]
        images = [ndimage.imread(os.path.join(resized_dir, name), mode='RGB')
                  for name in image_batch_file]
        image_batch = np.array(images).astype(np.float32)
        feats = sess.run(vggnet.features, feed_dict={vggnet.images: image_batch})
        all_feats[start:end, :] = feats
        print ("Processed %d %s features" %(end, wnid))
        
        

In [None]:
# Persist the extracted features of the synset as ./data/<wnid>.hkl.
save_path = './data/%s.hkl' %wnid
hickle.dump(all_feats, save_path)
print "Saved %s.." % save_path



# Run model to generate Captions

In [None]:
import cPickle as pickle
import tensorflow as tf
from core.solver import CaptioningSolver
from core.model import CaptionGenerator
from nltk.corpus import wordnet as wn
import hickle

In [None]:
# Derive the WordNet id (part-of-speech letter + 8-digit offset) for 'hats'.
ss = wn.synsets('hats')
# Take the first noun sense explicitly rather than whatever happens to be
# first in the list, mirroring the noun-only lookup in _build_top1k_vocab.
noun = next(s for s in ss if s.pos() == 'n')
wnid = noun.pos() + str(noun.offset()).zfill(8)
print(wnid)

In [None]:
# Restore the vocabulary that was pickled when the dictionary was built.
with open('./data/word_to_idx.pkl','rb') as f:
    word_to_idx = pickle.load(f)

# Load the extracted features of the synset; `data` wraps them under the
# 'features' key for the solver.
with open('./data/%s.hkl'%wnid, 'r') as f:
    features = hickle.load(f)
    data = {'features': features}

In [None]:
# Caption generator configuration; dim_feature matches the 196x512 feature
# maps extracted per image.
model_options = dict(
    dim_feature=[196, 512],
    dim_embed=512,
    dim_hidden=1024,
    n_time_step=16,
    prev2out=True,
    ctx2out=True,
    alpha_c=1.0,
    selector=True,
    dropout=True,
)
model = CaptionGenerator(word_to_idx, **model_options)

In [None]:
# Solver configuration; the same `data` dict is passed for both data arguments.
# NOTE(review): image_path is a hard-coded synset directory rather than being
# derived from `wnid` - confirm the two refer to the same synset.
solver_options = dict(
    n_epochs=15,
    batch_size=128,
    update_rule='adam',
    learning_rate=0.0025,
    print_every=2000,
    save_every=1,
    image_path='./data/imagenet/n03487657',
    pretrained_model=None,
    model_path='./data/model/attention',
    test_model='./data/model/attention/model-18',
    print_bleu=False,
    log_path='./log/',
)
solver = CaptioningSolver(model, data, data, **solver_options)

In [None]:
# NOTE(review): leftover breakpoint - execution pauses in the IPython debugger
# before the captions are generated.
from IPython.core.debugger import Tracer
Tracer()() #this one triggers the debugger
# Generate captions for the loaded features.
captions = solver.test_imagenet(features)

In [None]:
# Only the last expression of a cell is displayed, so the length is computed
# but not shown; the cell displays the final caption.
len(captions)
captions[-1]

In [None]:
# Switch the current variable scope to reuse mode before calling
# test_imagenet a second time - presumably so the second call reuses the
# existing variables instead of creating them again (TODO confirm).
tf.get_variable_scope().reuse_variables()
caption2  = solver.test_imagenet(features)
caption2

In [None]:
import ipdb

# Top 1k Noun Dictionary

In [None]:
def _build_top1k_vocab(annotations):
    counter = Counter()
    for i, caption in enumerate(annotations['caption']):
        words = caption.split(' ') # caption contrains only lower-case words
        for w in words:
            counter[w] +=1

    return sorted(Counter[w], key=lambda (k, v): v)[:1000]

In [None]:
# Reload and clean the training captions, then build the keyword list.
train_dataset = _process_caption_data(caption_file=caption_file,
                                      image_dir=image_dir,
                                      max_length=max_length)
top1k = _build_top1k_vocab(train_dataset)