# Notes

1. The IAPR-TC12 results reported on the Wiki do not reproduce here, even on the same corpus.
2. The neural network runs 8x slower here than in my interactive IPython session.


In [1]:
import os, sys
from enum import Enum
import gzip
import time

import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
import tensorflow as tf

# Attalos Imports
sys.path.append('/home/kni/local-kni/attalos/')
import attalos.util.log.log as l
from attalos.dataset.dataset import Dataset
from attalos.evaluation.evaluation import Evaluation
import attalos.imgtxt_algorithms.util.readw2v as readw2v
from attalos.imgtxt_algorithms.util.readw2v import initVo, readvocab

# Local models
# from mse import MSEModel
# from negsampling import NegSamplingModel
# from fast0tag import FastZeroTagModel

import negsampling
reload(negsampling)

# Setup global objects
logger = l.getLogger(__name__)


class ModelTypes(Enum):
    """Selects which model/loss the training cells build and train."""
    # Mean square error loss (MSEModel)
    mse = 1
    # Negative sampling loss (negsampling.NegSamplingModel)
    negsampling = 2
    # Fast zero tag loss (FastZeroTagModel)
    fast0tag = 3

In [2]:
def evaluate_regressor(sess, model, val_image_feats, val_one_hot, wordmatrix, k=5, verbose=False):
    """
    Score a model on held-out images and wrap the result in an evaluator.

    Args:
        sess: A tensorflow session
        model: Object exposing predict(sess, image_feats); its output is
            multiplied against the transposed word matrix
        val_image_feats: Image features to run through the model
        val_one_hot: Ground-truth tag matrix handed to the evaluator
            (presumably one column per row of wordmatrix -- confirm against caller)
        wordmatrix: Matrix of word vectors, one row per candidate tag
        k: Cut-off passed through to the evaluator
        verbose: Accepted for call compatibility; not used by this function

    Returns:
        An attalos.evaluation.evaluation.Evaluation object
    """
    # Score every candidate tag by the dot product between the predicted
    # vector for an image and that tag's word vector.
    tag_scores = np.dot(model.predict(sess, val_image_feats), wordmatrix.T)
    return Evaluation(val_one_hot, tag_scores, k)

In [3]:
def create_wordmatrix(w2v_model, dataset=None):
    """
    Take a w2v dictionary and return matrix/index lookup
    Args:
        w2v_model: Dictionary where keys are words and values are word vectors
        dataset: If specified (i.e. not None), limits the words in the matrix
            to tags that occur in dataset.text_feats

    Returns:
        w2ind: Mapping of word to its row index in wordmatrix
        wordmatrix: float32 numpy matrix of word vectors, one row per word,
            in the iteration order of w2v_model
    """
    # Test against None explicitly: a dataset object may define its own
    # truthiness, and "no dataset given" is the only case that should
    # select the full vocabulary.
    if dataset is not None:
        dataset_tags = set()
        for tags in dataset.text_feats.values():
            dataset_tags.update(tags)
        vocab = [word for word in w2v_model if word in dataset_tags]
    else:
        vocab = list(w2v_model)

    # Vector width comes from the first entry of the model. next(iter(...))
    # works for both list-returning (Python 2) and view-returning (Python 3)
    # dictionaries, and an empty model yields a width of 0 instead of raising.
    first_word = next(iter(w2v_model), None)
    vector_size = 0 if first_word is None else len(w2v_model[first_word])

    # Create word vector matrix to allow for embedding lookup
    w2ind = {}
    wordmatrix = np.zeros((len(vocab), vector_size), dtype=np.float32)
    for i, word in enumerate(vocab):
        w2ind[word] = i
        wordmatrix[i, :] = w2v_model[word]
    return w2ind, wordmatrix

In [4]:
def dataset_to_onehot(dataset, w2ind):
    """
    Flatten a dataset into dense image features plus a sparse tag indicator matrix.

    Args:
        dataset: attalos.dataset.dataset object; must provide get_index(0),
            num_images, iteration over item indices and indexing by item index
        w2ind: a dictionary like object mapping words to their column index

    Returns:
        image_feats: (num_images, feature_length) matrix of image features
        one_hot: CSR sparse matrix of shape (num_images, len(w2ind)) holding a 1
            wherever an image carries a tag that is present in w2ind
    """
    # Peek at the first item only to learn the feature vector length
    first_feat, _ = dataset.get_index(0)
    n_images = dataset.num_images

    feature_matrix = np.zeros((n_images, first_feat.shape[0]))
    # DOK format is cheap to fill element by element; converted to CSR on return
    tag_indicator = dok_matrix((n_images, len(w2ind)), dtype=np.int32)

    for idx in dataset:
        feat, item_tags = dataset[idx]
        feature_matrix[idx, :] = feat
        # Tags missing from the vocabulary are silently skipped
        for column in (w2ind[tag] for tag in item_tags if tag in w2ind):
            tag_indicator[idx, column] = 1

    return feature_matrix, csr_matrix(tag_indicator)

In [5]:
# Locations of the precomputed train/test image features and text tags
datadir = '/data/fs4/teams/attalos/features/'
trimdata = datadir + 'image/iaprtc_train_20160816_inception.hdf5'
trtxdata = datadir + 'text/iaprtc_train_20160816_text.json.gz'
teimdata = datadir + 'image/iaprtc_test_20160816_inception.hdf5'
tetxdata = datadir + 'text/iaprtc_test_20160816_text.json.gz'

# Only the training set is asked to keep its image features in memory
train_dataset = Dataset(trimdata, trtxdata, load_image_feats_in_mem=True)
test_dataset = Dataset(teimdata, tetxdata)

# Union of every tag used by either split.
# NOTE(review): dataset_tags is not read again by any cell visible here.
dataset_tags = set()
for dataset in (train_dataset, test_dataset):
    for tags in dataset.text_feats.values():
        dataset_tags.update(tags)

# Word vectors: read from the binary word2vec file (readlines is called with 100000)
w2vfile = readw2v.ReadW2V('/local_data/kni/data/vectors-phrase.bin')
w2v_model = w2vfile.readlines(100000)
# Placeholder for the rescaling that is meant to avoid NaNs.
# NOTE(review): a factor of 1.0 leaves floating-point vectors unchanged, and the
# recorded run below still reports a NaN loss -- confirm the intended scale.
for word in w2v_model.keys():
    w2v_model[word] *= 1.0
Wd, Id = readvocab('/local_data/kni/data/vectors-phrase.vocab')

In [6]:
# Allocate GPU memory as needed (vs. allocating all the memory)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Single session reused by the later cells for model initialization,
# training, evaluation and the manual debugging step.
sess = tf.Session(config=config)

In [7]:
# Training configuration read by the cells below
batch_size = 1024                    # items requested per training batch
num_epochs = 1                       # passes of the outer training loop
learning_rate = 0.01                 # handed to the model constructor
network_size = [2048, 1024, 200]     # handed to the model as hidden_units
model_input_path = None              # when set, the model is loaded from here before training
model_output_path = None             # when set, the model is saved here after training
verbose = True                       # evaluate and log after every epoch
model_type = ModelTypes.negsampling  # which model/loss to build
max_pos = 5                          # positive tag slots per image in a batch
max_neg = 10                         # negative samples drawn per image in a batch

In [8]:
# def train_model(train_dataset,
#                 test_dataset,
#                 w2v_model,
#                 batch_size=128,
#                 num_epochs=200,
#                 learning_rate=1.001,
#                 network_size=[200,200],
#                 model_input_path = None,
#                 model_output_path = None,
#                 verbose=True,
#                 model_type=ModelTypes.negsampling,
#                 max_pos=5,
#                 max_neg=10):
"""
Train a regression model to map image features into the word vector space
Args:
    train_dataset: Training attalos.dataset.dataset object
    test_dataset: Test attalos.dataset.dataset object
    w2v_model: A dictionary like object where the keys are words and the values are word vectors
    batch_size: Batch size to use for training
    num_epochs: Number of epochs to train for
    learning_rate: The learning rate for the network
    network_size: A list defining the size of each layer of the neural network
    model_input_path: Path to a file containing initial weights
    model_output_path: Path to save final weights
    verbose: Amount of debug information to output
Returns:
"""
# Get validation data
#  Extract features from first image (only used to learn the feature length)
image_feats, tags = test_dataset.get_index(0)
# Length of a single image feature vector; passed to the model constructor below
image_feat_size = image_feats.shape[0]


# Turn w2v dictionary into a matrix
#  w2ind/word_matrix cover the whole w2v vocabulary and are used for training;
#  val_w2ind/val_word_matrix are restricted to tags present in the test set
#  and are used only for evaluation.
w2ind, word_matrix = create_wordmatrix(w2v_model)
val_w2ind, val_word_matrix = create_wordmatrix(w2v_model, test_dataset)

# Precompute onehot representation for evaluation
val_image_feats, val_one_hot = dataset_to_onehot(test_dataset, val_w2ind)

# Setup data structures for negative sampling
if model_type == ModelTypes.negsampling or model_type == ModelTypes.fast0tag:
    # Count how often each vocabulary word is used as a tag in the training set
    word_counts = np.zeros(word_matrix.shape[0])
    for item_id in train_dataset:
        _, tags = train_dataset[item_id]
        for tag in tags:
            if tag in w2ind:
                word_counts[w2ind[tag]] += 1

    total_count = word_counts.sum()
    if total_count == 0:
        # Dividing by zero would silently turn the whole distribution into NaN
        # and only fail later, inside np.random.choice.
        raise ValueError('No training tag is present in the word vector vocabulary')
    # Empirical tag distribution that negatives are drawn from
    labelpdf = word_counts / total_count
    vocabsize = word_matrix.shape[0]

    def negsamp(ignored_inds, num2samp):
        """
        Draw negative word indices from the training tag distribution.

        Args:
            ignored_inds: Word indices (any shape) that must not be drawn,
                typically the positive tags. Negative values are treated as
                padding and skipped.
            num2samp: Number of indices to draw

        Returns:
            Array of num2samp word indices, sampled with replacement
        """
        # Drop padding before indexing: a -1 would otherwise wrap around and
        # zero out the probability of the last word in the vocabulary.
        ignored = np.asarray(ignored_inds, dtype=np.intp).ravel()
        ignored = ignored[ignored >= 0]

        # Create new probability vector excluding positive samples
        nlabelpdf = np.copy(labelpdf)
        nlabelpdf[ignored] = 0
        nlabelpdf /= nlabelpdf.sum()

        return np.random.choice(vocabsize, size=num2samp, p=nlabelpdf)

# Time to start building our graph
# NOTE: the previous bare `tf.Graph().as_default()` call was removed. It only
# returns a context manager and has no effect unless used in a `with` block,
# so the model has always been built in the default graph -- the same graph
# the existing `sess` was created against.

# Build regressor
# Printing the value helps spot a stale enum: re-running the cell that defines
# ModelTypes creates a new class, and a model_type assigned before that no
# longer compares equal to any member.
print(model_type)
if model_type == ModelTypes.mse:
    # Imported here because the module-level import is commented out
    from mse import MSEModel
    logger.info('Building regressor with mean square error loss')
    model = MSEModel(image_feat_size,
                     word_matrix,
                     learning_rate=learning_rate,
                     hidden_units=network_size,
                     use_batch_norm=True)
elif model_type == ModelTypes.negsampling:
    logger.info('Building regressor with negative sampling loss')
    model = negsampling.NegSamplingModel(image_feat_size,
                                         word_matrix,
                                         learning_rate=learning_rate,
                                         hidden_units=network_size,
                                         optim_words=True,
                                         use_batch_norm=True)
    print("Negative sampling model created")
elif model_type == ModelTypes.fast0tag:
    # Imported here because the module-level import is commented out
    from fast0tag import FastZeroTagModel
    logger.info('Building model with fast zero tag loss')
    model = FastZeroTagModel(image_feat_size,
                             word_matrix,
                             learning_rate=learning_rate,
                             hidden_units=network_size,
                             use_batch_norm=True)
else:
    # Fail loudly instead of silently building some other model
    raise ValueError('Unsupported model type: {}'.format(model_type))
# Initialize model
model.initialize_model(sess)

[2016-09-09 21:39:16,025] [INFO] Building regressor with negative sampling loss


ModelTypes.negsampling
Optimization on GPU, word vectors stored separately
Negative sampling model created


In [9]:
# Optionally restore saved model
if model_input_path:
    model.load(sess, model_input_path)

# Reuse space for each iteration; unused slots are marked with -1
pos_word_ids = np.ones((batch_size, max_pos), dtype=np.int32)
neg_word_ids = np.ones((batch_size, max_neg), dtype=np.int32)
# One entry per evaluated epoch: whatever evaluator.evaluate() returns
performance = []
for epoch in range(num_epochs):
    # Wall-clock time spent preparing batches vs. running the model
    batch_time_total = 0
    run_time_total = 0

    loss = None
    # Only full batches are used; a trailing partial batch is skipped
    for batch in range(train_dataset.num_images // batch_size):
        batch_time = time.time()
        # Get raw data
        image_feats, text_tags = train_dataset.get_next_batch(batch_size)

        # Generate positive examples: up to max_pos in-vocabulary tags per image
        pos_word_ids.fill(-1)
        for i, tags in enumerate(text_tags):
            j = 0
            for tag in tags:
                if tag in w2ind and j < max_pos:
                    pos_word_ids[i, j] = w2ind[tag]
                    j += 1

        if model_type == ModelTypes.negsampling or model_type == ModelTypes.fast0tag:
            neg_word_ids.fill(-1)
            for i in range(neg_word_ids.shape[0]):
                # Exclude only this image's own positives, and drop the -1
                # padding first. Passing the whole batch matrix excluded every
                # positive in the batch from every image, and the padding value
                # indexed (and zeroed) the last word in the vocabulary.
                row_positives = pos_word_ids[i]
                neg_word_ids[i] = negsamp(row_positives[row_positives >= 0], max_neg)

        batch_time = time.time() - batch_time
        batch_time_total += batch_time

        run_time = time.time()
        if model_type == ModelTypes.mse:
            loss = model.fit(sess, image_feats, pos_word_ids)
        elif model_type == ModelTypes.negsampling or model_type == ModelTypes.fast0tag:
            loss = model.fit(sess, image_feats, pos_word_ids, neg_word_ids=neg_word_ids)
        run_time = time.time() - run_time
        run_time_total += run_time

    if verbose:
        eval_time = time.time()
        evaluator = evaluate_regressor(sess, model, val_image_feats, val_one_hot, val_word_matrix, verbose=False)
        performance.append(evaluator.evaluate())
        eval_time = time.time() - eval_time
        # Evaluate accuracy
        #print('Epoch {}: Loss: {} Timing: {} {} {}'.format(epoch, loss, batch_time_total, run_time_total, eval_time))
        logger.debug('Epoch {}: Loss: {} Perf: {} {} {}'.format(epoch, loss, *performance[-1]))

if model_output_path:
    model.save(sess, model_output_path)

# return performance

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[2016-09-09 21:45:47,141] [DEBUG] Epoch 0: Loss: nan Perf: 0.00199240107497 0.0174825174825 0.00333921439699


In [None]:
# Scratch cell: runs one optimizer step by hand through sess.run instead of
# model.fit, using whatever batch the globals below currently hold
# (presumably the last batch left over from the training loop).
x = image_feats
y = pos_word_ids
neg_ids = neg_word_ids
w2v = word_matrix

# Look up the word vector for every positive / negative id.
# NOTE(review): ids equal to -1 (the padding fill used in the training loop)
# select the LAST row of w2v rather than being skipped -- confirm this is
# what the model expects.
pvecs = np.zeros((y.shape[0], y.shape[1], w2v.shape[1]))                                                           
nvecs = np.zeros((neg_ids.shape[0], neg_ids.shape[1], w2v.shape[1]))      
for i, ids in enumerate(y):                                                                                             
    pvecs[i] = w2v[ids] 
for i, ids in enumerate(neg_ids): 
    nvecs[i] = w2v[ids]
# Swap the first two axes: (batch, ids-per-image, dim) -> (ids-per-image, batch, dim)
pvecs=pvecs.transpose((1,0,2))
nvecs=nvecs.transpose((1,0,2))

# Run the optimizer once and fetch the loss and predictions for inspection
_, loss, preds = sess.run([model.model_info['optimizer'], model.model_info['loss'], model.model_info['prediction']],
                          feed_dict={ model.model_info['input']: x,
                                     model.model_info['pos_vecs']: pvecs,
                                     model.model_info['neg_vecs']: nvecs  
                                    })                                                                                 
model.updatewords(y, neg_ids, preds) 
    
# First (positive ids, negative ids) pair; indexing the result of zip() only
# works on Python 2, where zip returns a list.
zip(y,neg_ids)[0]
# Last expression of the cell: the notebook displays the loss of this step
loss