In [1]:
import readw2v as rw2v
import numpy as np
import tensorflow as tf
sess = tf.InteractiveSession()

In [2]:
F = rw2v.ReadW2V('wordvecs/text9Bvin.bin')
vectors = F.readlines()

### Variables

In [3]:
data = np.load('data/iaprtc_alexfc7.npz')
D = open('data/iaprtc_dictionary.txt').read().splitlines()
train_ims = [ im.split('/')[-1] for im in open('data/iaprtc_trainlist.txt').read().splitlines() ]
test_ims = [ im.split('/')[-1] for im in open('data/iaprtc_testlist.txt').read().splitlines() ]
xTr = data['xTr'].T
yTr = data['yTr'].T
xTe = data['xTe'].T
yTe = data['yTe'].T
wc = yTr.sum(axis=0)+0.00

imfeatsize = xTr.shape[1]
vocabsize = yTr.shape[1]
wordvecsize = vectors['</s>'].shape[0]

In [11]:
BATCH_SIZE = 256
HIDDEN_SIZE = 300
RATE = 100.0

### If you wanted to collect them into a matrix similar to IAPR-TC12, then this would be how you would do it.

matval = np.array(vectors.values())
words = vectors.keys()

In [5]:
targetwords = [ D[i] for i in np.nonzero( yTr[np.random.randint(len(yTr))] )[0] ]
targetvecs = np.array( [vectors[w] for w in targetwords ] )

### Things not in the Wikipedia right now:

'tee-shirt'

### For a single feature, get the related training vectors

Usage:  (im, veclist) = get_training_vectors( index_number )

### For multiple features (in the list of indices), for each image feature, sum the training vectors

In [6]:
# Given an index into the corpus, get the corresponding image feature and all word vectors from it
def get_training_vectors(index):
    yTrList = []
    for i in np.nonzero( yTr[index] )[0]:
        yTrList.append( vectors[ D[i] ] )
    return (xTr[index], np.array(yTrList))

def get_training_batch(indices):
    
    feats = np.zeros((len(indices),imfeatsize))
    labels = np.zeros((len(indices),wordvecsize))
    
    for j, i in enumerate(indices):
        feats[j], wordfeats = get_training_vectors(i)
        labels[j] = wordfeats.sum(axis=0)
    return (feats, labels)

In [7]:
print "Image feature size = {}".format( imfeatsize )
print "Image vocabulary size = {}".format( vocabsize )
print "Looking to extend that to {}".format( len(vectors) )

print "Initializing tensorflow variables"
# Tensorflow input output data
X = tf.placeholder(tf.float32, shape=[None, imfeatsize], name='Input')
Ytruth = tf.placeholder(tf.float32, shape=[None, wordvecsize], name='Truth')
# positive_examples = tf.placeholder(tf.float32, shape=[BATCH_SIZE, HIDDEN_SIZE], name="Positive_Examples")
# negative_examples = tf.placeholder(tf.float32, shape=[BATCH_SIZE, HIDDEN_SIZE], name="Negative_Examples")

# Tensorflow variable weights
W0 = tf.Variable(tf.random_normal([imfeatsize, HIDDEN_SIZE],stddev=1))
W1 = tf.Variable(tf.random_normal([HIDDEN_SIZE, wordvecsize], stddev=1))

Image feature size = 4096
Image vocabulary size = 291
Looking to extend that to 218317
Initializing tensorflow variables


In [12]:
# Two layer perceptron
def two_layer_perceptron(X_i, Wt0, Wt1):
    hidden_layer = tf.nn.sigmoid(tf.matmul(X_i, Wt0))
    return tf.nn.sigmoid( tf.matmul(hidden_layer, Wt1) )
Y = two_layer_perceptron(X, W0, W1)

binxentropy = tf.reduce_mean(-tf.reduce_sum(Ytruth * tf.log(Y) + (1 - Ytruth)*tf.log(Y), reduction_indices=[1]))


# Optimization
optimizer = tf.train.AdamOptimizer(learning_rate=RATE).minimize(binxentropy)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate=RATE).minimize(tf.nn.sigmoid_cross_entropy_with_logits)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate=RATE).minimize(binxentropy)

In [14]:
if True:
    # sess = tf.InteractiveSession()
    
    # Initialize the variables
    tf.initialize_all_variables().run()
    print "W0Prior: {}".format(W1.eval())
    
    # Get a batch for training
    imbatch, wordbatch = get_training_batch([0,1,2,3,4,5])
    indict={X:imbatch, Ytruth:wordbatch}
    
    # Run the session
    optval, lossval, yval = sess.run([optimizer, binxentropy, Y], feed_dict=indict)
    print "W0SessRun: {}".format(W1.eval())
    
    # Separately run the optimizer
    optimizer.run(feed_dict=indict)
    print "W0OptRun: {}".format(W1.eval())
    
    print optval
    print lossval
    print yval
    
    

W0Prior: [[ 0.21997467 -1.19393682 -0.18705678 ...,  0.42633942 -0.07044954
  -0.56638432]
 [-0.71226329  0.89928865 -0.83504802 ...,  0.51589924  1.18704534
   0.80860144]
 [-0.42435294 -0.85224849  1.41782522 ...,  0.37382987  1.4340049
  -0.19368374]
 ..., 
 [ 1.25981188 -0.78272468  1.41784775 ..., -0.24662268  0.55633718
  -0.8620255 ]
 [-1.62061107 -0.08030051  1.69193399 ...,  1.98095596  1.27665401
   1.27177298]
 [ 0.63576293  0.40372279  1.95382655 ..., -1.42940867 -1.50172448
   1.18037844]]
W0SessRun: [[ 100.2197876    98.80587769   99.8127594  ...,  100.42602539
    99.92936707   99.43343353]
 [  99.28760529  100.89924622   99.16490173 ...,  100.51585388
   101.18699646  100.80853271]
 [  99.57553101   99.14768982  101.41777039 ...,  100.37376404
   101.43395233   99.80625153]
 ..., 
 [ 101.25971985   99.21724701  101.41781616 ...,   99.75333405
   100.55630493   99.13793182]
 [  98.37915802   99.91962433  101.69186401 ...,  101.98088074
   101.27657318  101.27165985]
 [ 1