In [1]:
import os
from enum import Enum
import gzip
import time

import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
import tensorflow as tf

# Attalos Imports
import sys
sys.path.append('/home/kni/local-kni/_update_negsamp/kyle_update/')
import attalos.util.log.log as l
from attalos.dataset.dataset import Dataset
from attalos.evaluation.evaluation import Evaluation

# Local models
from mse import MSEModel
from negsampling import NegSamplingModel
from fast0tag import FastZeroTagModel

In [2]:
# Setup global objects
# Notebook-wide logger obtained from the attalos logging helper (imported as `l`).
# train() and the dataset / model / training / evaluation cells all log through it.
logger = l.getLogger(__name__)

In [4]:
from attalos.imgtxt_algorithms.regress2sum.multihot import MultihotModel
from attalos.imgtxt_algorithms.regress2sum.naivesum import NaiveSumModel
from attalos.imgtxt_algorithms.regress2sum.wdv import WDVModel
# reload(attalos.imgtxt_algorithms.regress2sum.negsampling)
from attalos.imgtxt_algorithms.regress2sum.negsampling import NegSamplingModel
from attalos.dataset.wordvectors.glove import GloveWrapper

In [5]:
# Stand-in for parsed command line arguments.
# argparse.Namespace is a plain attribute container: attributes can be assigned
# freely and vars(args) yields them as a dict, which is how the settings are
# later forwarded to the model constructor (**vars(args)).
import argparse

args = argparse.Namespace()

# Alternative dataset (espgame). To use it, uncomment these four lines and
# comment out the iaprtc paths below.
#args.image_feature_file_train = "/local_data/teams/attalos/features/image/espgame_train_20160823_inception.hdf5"
#args.text_feature_file_train = "/local_data/teams/attalos/features/text/espgame_train_20160823_text.json.gz"
#args.image_feature_file_test = "/local_data/teams/attalos/features/image/espgame_test_20160823_inception.hdf5"
#args.text_feature_file_test = "/local_data/teams/attalos/features/text/espgame_test_20160823_text.json.gz"

# Active dataset (iaprtc): image features and text features for train / test.
args.image_feature_file_train = "/local_data/teams/attalos/features/image/iaprtc_train_20160816_inception.hdf5"
args.text_feature_file_train = "/local_data/teams/attalos/features/text/iaprtc_train_20160816_text.json.gz"
args.image_feature_file_test = "/local_data/teams/attalos/features/image/iaprtc_test_20160816_inception.hdf5"
args.text_feature_file_test = "/local_data/teams/attalos/features/text/iaprtc_test_20160816_text.json.gz"

# Word vectors: file path and the name of a WordVectorTypes member.
args.word_vector_file = "/local_data/kylez/glove.6B.200d.txt"
args.word_vector_type = "glove"

# Name of a ModelTypes member; used to look up the model class.
args.model_type = "negsamp"
args.cross_eval = False
# Forwarded as load_image_feats_in_mem when the training Dataset is built.
args.in_memory = True
args.model_input_path = None
args.model_output_path = None

# Optimisation settings.
args.num_epochs = 400
args.batch_size = 100
args.learning_rate = 0.0001

In [6]:
class WordVectorTypes(Enum):
    """Word-vector formats that load_wv_model distinguishes.

    load_wv_model compares against the member *name* (e.g. "glove"); the
    integer values are not read anywhere in the visible notebook.
    """
    # Handled by the fallback (word2vec) branch of load_wv_model.
    w2v = 1
    # Handled by the Glove.load_stanford branch of load_wv_model.
    glove = 2

In [7]:
class ModelTypes(Enum):
    """Lookup table from a model-type name (args.model_type) to a model.

    The model-construction cell indexes this enum by name and treats the
    member's ``.value`` as the model class (it reads ``.__name__`` and calls it).
    """
    # NOTE(review): these three members hold integer placeholders, not classes,
    # so selecting any of them would make the model-construction cell fail when
    # it reads ``.value.__name__``. Presumably they should map to MSEModel,
    # NegSamplingModel and FastZeroTagModel (imported at the top) -- confirm
    # those classes accept the same constructor arguments before wiring them in.
    mse = 1
    negsampling = 2
    fast0tag = 3
    # Members whose value is the model class itself.
    multihot = MultihotModel
    naivesum = NaiveSumModel
    wdv = WDVModel
    negsamp = NegSamplingModel

In [8]:
def train_batch(sess, model, batch):
    """Fit ``model`` on one batch and return whatever ``model.fit`` returns.

    Args:
        sess: Session object handed through to ``model.fit`` unchanged.
        model: Object exposing ``fit(sess, x, y)``.
        batch: Iterable that unpacks into exactly two items, passed to
            ``model.fit`` as its second and third arguments.

    Returns:
        The training loss reported by ``model.fit`` for this batch.
    """
    inputs, targets = batch
    return model.fit(sess, inputs, targets)

In [9]:
def train_epoch(sess, model, train_dataset, batch_size):
    """Run one pass over ``train_dataset`` and return the mean batch loss.

    Args:
        sess: Session object forwarded to ``train_batch``.
        model: Object exposing ``to_batches(dataset, batch_size)`` (an iterable
            of batches) and whatever ``train_batch`` needs.
        train_dataset: Dataset handed to ``model.to_batches``.
        batch_size: Batch size handed to ``model.to_batches``.

    Returns:
        The arithmetic mean of the per-batch training losses, or
        ``float("nan")`` when ``model.to_batches`` yields no batches (the
        previous implementation raised ZeroDivisionError in that case).
    """
    training_losses = [
        train_batch(sess, model, batch)
        for batch in model.to_batches(train_dataset, batch_size)
    ]
    if not training_losses:
        # No batches means there is no loss to average; NaN makes that visible
        # in the epoch log instead of aborting with a division by zero.
        return float("nan")
    return sum(training_losses) / float(len(training_losses))

In [10]:
def train(sess, model, num_epochs, train_dataset, batch_size, epoch_verbosity_rate=10):
    """Train ``model`` for ``num_epochs`` epochs, logging progress periodically.

    Args:
        sess: Session object forwarded to ``train_epoch``.
        model: Model forwarded to ``train_epoch``.
        num_epochs: Number of passes over ``train_dataset``.
        train_dataset: Dataset forwarded to ``train_epoch``.
        batch_size: Batch size forwarded to ``train_epoch``.
        epoch_verbosity_rate: Log the average loss every this many epochs
            (epoch 0 included). A falsy value (0 or None) disables the
            progress log instead of failing on a modulo by zero.

    Returns:
        None. Progress is reported through the notebook-level ``logger``.
    """
    # range() exists on both Python 2 and 3; xrange() is a NameError on Python 3.
    for cur_epoch in range(num_epochs):
        verbose = bool(epoch_verbosity_rate) and cur_epoch % epoch_verbosity_rate == 0
        avg_training_loss = train_epoch(sess, model, train_dataset, batch_size)
        if verbose:
            logger.info("Finished epoch %s. (Avg. training loss: %s)" % (cur_epoch, avg_training_loss))

In [11]:
def load_wv_model(word_vector_file, word_vector_type):
    """Load a word-vector model from ``word_vector_file`` and wrap it.

    Args:
        word_vector_file: Path of the word-vector file to load.
        word_vector_type: Name of a WordVectorTypes member. "glove" selects the
            GloVe loader; any other value falls through to the word2vec loader.

    Returns:
        A GloveWrapper around the loaded GloVe model, or a W2VWrapper around
        the loaded word2vec model.
    """
    # Dispatch on the argument rather than the notebook-global `args`, so the
    # function honours what the caller actually passes in.
    if word_vector_type == WordVectorTypes.glove.name:
        # Imported here so the glove package is only required for this branch.
        from glove import Glove
        wv_model = GloveWrapper(Glove.load_stanford(word_vector_file))
    else:  # treated as WordVectorTypes.w2v.name
        import word2vec
        # NOTE(review): W2VWrapper is not imported anywhere in the visible part
        # of this notebook, so this branch raises NameError unless the name is
        # brought into scope elsewhere -- add the import next to GloveWrapper's.
        wv_model = W2VWrapper(word2vec.load(word_vector_file))
    return wv_model

In [13]:
# Build the train / test datasets from the configured feature files.
# Only the training set is given the in-memory flag from the configuration.
logger.info("Parsing train and test datasets.")
train_dataset = Dataset(
    args.image_feature_file_train,
    args.text_feature_file_train,
    load_image_feats_in_mem=args.in_memory,
)
test_dataset = Dataset(
    args.image_feature_file_test,
    args.text_feature_file_test,
)

logger.info("Reading word vectors from file.")
# The load_wv_model path is currently disabled; word vectors are read by the
# readw2v cell further down instead.
# wv_model = load_wv_model(args.word_vector_file, args.word_vector_type)

[2016-09-09 23:21:47,660] [INFO] Parsing train and test datasets.
[2016-09-09 23:21:47,959] [INFO] Reading word vectors from file.


In [14]:
import attalos.imgtxt_algorithms.util.readw2v as readw2v
from attalos.imgtxt_algorithms.util.readw2v import initVo, readvocab
# Word vectors: read entries from the binary vectors file (readlines is
# called with 100000).
w2vfile = readw2v.ReadW2V('/local_data/kni/data/vectors-phrase.bin')
w2v_model = w2vfile.readlines(100000)

# Rescale step kept to avoid NaNs; the factor is currently 1.0, after which
# each vector is replaced by a float32 copy.
for word in w2v_model.keys():
    vector = w2v_model[word]
    vector *= 1.0
    w2v_model[word] = vector.astype(np.float32)

# Vocabulary companion file for the vectors above.
Wd, Id = readvocab('/local_data/kni/data/vectors-phrase.vocab')

In [15]:
# Create the TensorFlow session shared by the model, training and evaluation cells.
# Presumably the line below is uncommented to release an existing session
# before re-running this cell -- confirm.
#sess.close()
# Session configuration: device placement logging is switched on.
config = tf.ConfigProto(log_device_placement=True)
# Let GPU memory allocation grow on demand (see the TensorFlow GPUOptions docs).
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)

In [19]:
# Sanity check: display the element type of one loaded word vector
# (the recorded cell output is numpy.float32).
type(w2v_model['hello'][0])

numpy.float32

In [17]:
# Resolve the model class from its configured name, then build and initialise it.
model_cls = ModelTypes[args.model_type].value
logger.info("Selecting model class: %s" % model_cls.__name__)

# Earlier variant, kept for reference:
#datasets = [train_dataset] if args.cross_eval else [train_dataset, test_dataset]

# Every configuration attribute is forwarded as a keyword argument.
model = model_cls(
    w2v_model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    **vars(args)
)
model.initialize_model(sess)

[2016-09-09 23:22:07,465] [INFO] Selecting model class: NegSamplingModel
[2016-09-09 23:22:09,915] [INFO] Tensor("fully_connected_1/BiasAdd:0", shape=(?, 200), dtype=float32)
[2016-09-09 23:22:09,916] [INFO] Tensor("transpose:0", shape=(?, ?, ?), dtype=float64)


Tensor("fully_connected_1/BiasAdd:0", shape=(?, 200), dtype=float32)
Tensor("transpose:0", shape=(?, ?, ?), dtype=float64)


ValueError: Tensor conversion requested dtype float32 for Tensor with dtype float64: 'Tensor("transpose:0", shape=(?, ?, ?), dtype=float64)'

In [None]:
# Train with the configured epoch count and batch size; the progress-logging
# interval is left at train()'s default.
logger.info("Starting training phase.")
train(
    sess,
    model,
    num_epochs=args.num_epochs,
    train_dataset=train_dataset,
    batch_size=args.batch_size,
)

In [None]:
# Evaluate the trained model on the test set.
logger.info("Starting evaluation phase.")
# Convert the test dataset into the arrays the model predicts on / is scored against.
test_x, test_y = model.to_ndarrs(test_dataset)
predictions = model.predict(sess, test_x)
# k=5 is passed straight to Evaluation; presumably the top-k cutoff -- confirm.
evaluator = Evaluation(test_y, predictions, k=5)
# Wrap the result in a 1-tuple: if evaluate() returns a tuple, a bare
# `"%s" % result` would spread it over the format string and raise TypeError.
# For any non-tuple result the logged text is unchanged.
logger.info("Evaluation (precision, recall, f1): %s" % (evaluator.evaluate(),))