Building an Image Classifier

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
import time
import data_helpers

## Parameter definitions

In [2]:
# Hyperparameters of the mini-batch gradient descent run
max_steps = 1000       # number of training iterations
batch_size = 1000      # number of examples drawn for each iteration
learning_rate = 0.005  # step size handed to the gradient descent optimizer

# Opening time stamp; the notebook later subtracts it from endTime to report
# the total wall-clock duration
beginTime = time.time()

# Optional: uncomment the next line to fix NumPy's random seed, which makes the
# random batch selection repeat identically from run to run
# np.random.seed(1)

## Prepare data

In [3]:
# Load the dataset through the project-local data_helpers module. Later cells
# read the keys 'images_train', 'labels_train', 'images_test' and 'labels_test'
# from the returned object.
# NOTE(review): array shapes/dtypes are not visible here - presumably images are
# rows of 3072 floats and labels are integer class ids; confirm in data_helpers.
data_sets = data_helpers.load_data()

## Prepare the TensorFlow graph

In [4]:
# Graph inputs, supplied through feed_dict at run time.
# Images: float32 matrix with any number of rows and 3072 values per row
# (presumably a flattened 32x32x3 image - confirm against data_helpers).
images_placeholder = tf.placeholder(dtype=tf.float32, shape=[None, 3072])
# Labels: int64 vector holding one class index per image row.
labels_placeholder = tf.placeholder(dtype=tf.int64, shape=[None])

In [5]:
# Trainable parameters of the linear model, all starting at zero:
# a 3072x10 weight matrix (one column per class) and one bias per class
weights = tf.Variable(initial_value=tf.zeros([3072, 10]))
biases = tf.Variable(initial_value=tf.zeros([10]))

In [6]:
# Classifier output: one unnormalized score per class for every input row,
# computed as images * weights with the biases added to each row
weighted_sum = tf.matmul(images_placeholder, weights)
logits = weighted_sum + biases

In [7]:
# Loss: softmax cross-entropy between the class scores and the integer labels,
# computed per example and then averaged over the batch
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
  labels=labels_placeholder, logits=logits)
loss = tf.reduce_mean(cross_entropy)

In [8]:
# Training operation: each run of train_step applies one gradient descent
# update, scaled by learning_rate, that reduces the loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(loss)

In [9]:
# Predicted class = index of the highest score in each row of logits;
# correct_prediction is True wherever that index equals the true label
predicted_labels = tf.argmax(logits, 1)
correct_prediction = tf.equal(predicted_labels, labels_placeholder)

In [10]:
# Accuracy: cast the boolean hits to float32 (True -> 1.0, False -> 0.0) and
# average them, giving the fraction of correctly predicted examples
correct_as_float = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_as_float)

## Run the TensorFlow graph

In [11]:
with tf.Session() as sess:
  # Give every tf.Variable its initial value before any other op is evaluated
  sess.run(tf.global_variables_initializer())

  # Each iteration draws a random batch, optionally reports the accuracy on
  # it, and then applies one optimizer update using that same batch
  for i in range(max_steps):

    # Pick batch_size random row indices from the training set;
    # np.random.choice samples with replacement by default, so an index can
    # occur more than once within a batch
    indices = np.random.choice(data_sets['images_train'].shape[0], size=batch_size)
    images_batch = data_sets['images_train'][indices]
    labels_batch = data_sets['labels_train'][indices]
    batch_feed = {
      images_placeholder: images_batch,
      labels_placeholder: labels_batch,
    }

    # Every 100th step: print the accuracy on the current batch, measured
    # before this step's update is applied
    if not i % 100:
      train_accuracy = sess.run(accuracy, feed_dict=batch_feed)
      print('Step {:5d}: training accuracy {:g}'.format(i, train_accuracy))

    # Apply a single training update
    sess.run(train_step, feed_dict=batch_feed)

  # Training finished: evaluate accuracy on the whole test set in one run
  test_feed = {
    images_placeholder: data_sets['images_test'],
    labels_placeholder: data_sets['labels_test'],
  }
  test_accuracy = sess.run(accuracy, feed_dict=test_feed)
  print('Test accuracy {:g}'.format(test_accuracy))

# Closing time stamp, taken once the session block has exited
endTime = time.time()

Step     0: training accuracy 0.108
Step   100: training accuracy 0.282
Step   200: training accuracy 0.229
Step   300: training accuracy 0.281
Step   400: training accuracy 0.293
Step   500: training accuracy 0.297
Step   600: training accuracy 0.318
Step   700: training accuracy 0.313
Step   800: training accuracy 0.272
Step   900: training accuracy 0.281
Test accuracy 0.2715


In [12]:
# Report the wall-clock seconds between the beginTime and endTime stamps
elapsed_seconds = endTime - beginTime
print('Total time: {:5.2f}s'.format(elapsed_seconds))

Total time: 44.58s
