In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import tensorflow as tf

import image_embedding
import image_processing
import inputs as input_ops

In [None]:
# Show which TensorFlow build is installed; the cells below use tf.contrib,
# so the printed version is worth checking before running them.
print(tf.__version__)

In [None]:
class ShowAndTellModel(object):
    """Image-to-text implementation based on http://arxiv.org/abs/1411.4555.

    "Show and Tell: A Neural Image Caption Generator"
    Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan

    Work in progress: `build()` currently wires up only the input pipeline,
    the image embeddings and the sequence embeddings. `build_model` and
    `setup_inception_initializer` are explicit stubs.
    """

    def __init__(self, config, mode, train_inception=False):
        """Basic setup.

        Args:
          config: Object containing configuration parameters.
          mode: "train", "eval" or "inference".
          train_inception: Whether the inception submodel variables are
            trainable.
        """
        assert mode in ["train", "eval", "inference"]
        self.config = config
        self.mode = mode
        self.train_inception = train_inception

        # Reader for the input data.
        self.reader = tf.TFRecordReader()

        # The original im2txt code initialized every variable with
        # tf.random_uniform_initializer(-config.initializer_scale,
        # config.initializer_scale) to match the "Show and Tell" paper. This
        # version uses He initialization (variance scaling) instead.
        self.initializer = tf.contrib.layers.variance_scaling_initializer()

        # A float32 Tensor with shape [batch_size, height, width, channels].
        self.images = None

        # An int32 Tensor with shape [batch_size, padded_length].
        self.input_seqs = None

        # An int32 Tensor with shape [batch_size, padded_length].
        self.target_seqs = None

        # An int32 0/1 Tensor with shape [batch_size, padded_length].
        self.input_mask = None

        # A float32 Tensor with shape [batch_size, embedding_size].
        self.image_embeddings = None

        # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
        self.seq_embeddings = None

        # A float32 scalar Tensor; the total loss for the trainer to optimize.
        self.total_loss = None

        # A float32 Tensor with shape [batch_size * padded_length].
        self.target_cross_entropy_losses = None

        # A float32 Tensor with shape [batch_size * padded_length].
        self.target_cross_entropy_loss_weights = None

        # Collection of variables from the inception submodel.
        self.inception_variables = []

        # Function to restore the inception submodel from checkpoint.
        self.init_fn = None

        # Global step Tensor.
        self.global_step = None

    def is_training(self):
        """Returns true if the model is built for training mode."""
        return self.mode == "train"

    def process_image(self, encoded_image, thread_id=0):
        """Decodes and processes an image string.

        Args:
          encoded_image: A scalar string Tensor; the encoded image.
          thread_id: Preprocessing thread id used to select the ordering of
            color distortions.

        Returns:
          A float32 Tensor of shape [height, width, 3]; the processed image.
        """
        return image_processing.process_image(
            encoded_image,
            is_training=self.is_training(),
            height=self.config.image_height,
            width=self.config.image_width,
            thread_id=thread_id,
            image_format=self.config.image_format)

    def build_inputs(self):
        """Input prefetching, preprocessing and batching.

        Outputs:
          self.images
          self.input_seqs
          self.target_seqs (training and eval only)
          self.input_mask (training and eval only)
        """
        if self.mode == "inference":
            # In inference mode, images and inputs are fed via placeholders.
            image_feed = tf.placeholder(
                dtype=tf.string, shape=[], name="image_feed")
            input_feed = tf.placeholder(
                dtype=tf.int64,
                shape=[None],  # batch_size
                name="input_feed")

            # Process image and insert batch dimensions.
            images = tf.expand_dims(self.process_image(image_feed), 0)
            input_seqs = tf.expand_dims(input_feed, 1)

            # No target sequences or input mask in inference mode.
            target_seqs = None
            input_mask = None
        else:
            # Prefetch serialized SequenceExample protos.
            input_queue = input_ops.prefetch_input_data(
                self.reader,
                self.config.input_file_pattern,
                is_training=self.is_training(),
                batch_size=self.config.batch_size,
                values_per_shard=self.config.values_per_input_shard,
                input_queue_capacity_factor=(
                    self.config.input_queue_capacity_factor),
                num_reader_threads=self.config.num_input_reader_threads)

            # Image processing and random distortion. Split across multiple
            # threads with each thread applying a slightly different
            # distortion.
            assert self.config.num_preprocess_threads % 2 == 0
            images_and_captions = []
            for thread_id in range(self.config.num_preprocess_threads):
                serialized_sequence_example = input_queue.dequeue()
                encoded_image, caption = input_ops.parse_sequence_example(
                    serialized_sequence_example,
                    image_feature=self.config.image_feature_name,
                    caption_feature=self.config.caption_feature_name)
                image = self.process_image(encoded_image, thread_id=thread_id)
                images_and_captions.append([image, caption])

            # Batch inputs.
            queue_capacity = (2 * self.config.num_preprocess_threads *
                              self.config.batch_size)
            images, input_seqs, target_seqs, input_mask = (
                input_ops.batch_with_dynamic_pad(
                    images_and_captions,
                    batch_size=self.config.batch_size,
                    queue_capacity=queue_capacity))

        self.images = images
        self.input_seqs = input_seqs
        self.target_seqs = target_seqs
        self.input_mask = input_mask

    def build_image_embeddings(self):
        """Builds the image submodel and maps its output to embeddings.

        Inputs:
          self.images

        Outputs:
          self.inception_variables
          self.image_embeddings
        """
        # TODO: replace Inception with a custom convolutional network.
        inception_output = image_embedding.inception_v3(
            self.images,
            trainable=self.train_inception,
            is_training=self.is_training())
        self.inception_variables = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")

        # Map inception output into embedding space (linear, no bias).
        with tf.variable_scope("image_embedding") as scope:
            image_embeddings = tf.contrib.layers.fully_connected(
                inputs=inception_output,
                num_outputs=self.config.embedding_size,
                activation_fn=None,
                weights_initializer=self.initializer,
                biases_initializer=None,
                scope=scope)

        # Save the embedding size in the graph.
        tf.constant(self.config.embedding_size, name="embedding_size")

        self.image_embeddings = image_embeddings

    def build_seq_embeddings(self):
        """Builds the input sequence embeddings.

        Inputs:
          self.input_seqs

        Outputs:
          self.seq_embeddings
        """
        with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
            embedding_map = tf.get_variable(
                name="map",
                shape=[self.config.vocab_size, self.config.embedding_size],
                initializer=self.initializer)
            seq_embeddings = tf.nn.embedding_lookup(
                embedding_map, self.input_seqs)

        self.seq_embeddings = seq_embeddings

    def build_model(self):
        """Placeholder for the caption model; not implemented yet.

        NOTE(review): presumably this should populate self.total_loss,
        self.target_cross_entropy_losses and
        self.target_cross_entropy_loss_weights, which __init__ declares but
        no other method assigns -- confirm before implementing.

        Raises:
          NotImplementedError: Always, until the model is written.
        """
        # The original method had no body, which made the whole class a
        # syntax error. Fail loudly instead of silently doing nothing.
        raise NotImplementedError("build_model is not implemented yet.")

    def setup_inception_initializer(self):
        """Placeholder for the inception restore hook; not implemented yet.

        NOTE(review): presumably this should set self.init_fn, which __init__
        describes as the function restoring the inception submodel from a
        checkpoint -- confirm before implementing.

        Raises:
          NotImplementedError: Always, until the initializer is written.
        """
        raise NotImplementedError(
            "setup_inception_initializer is not implemented yet.")

    def setup_global_step(self):
        """Sets up the global step Tensor."""
        global_step = tf.Variable(
            initial_value=0,
            name="global_step",
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_STEP,
                         tf.GraphKeys.GLOBAL_VARIABLES])

        self.global_step = global_step

    def build(self):
        """Creates the ops that are implemented so far.

        Only the inputs and the two embedding subgraphs are built. The
        remaining stages stay disabled: build_model and
        setup_inception_initializer are stubs that raise NotImplementedError.
        """
        self.build_inputs()
        self.build_image_embeddings()
        self.build_seq_embeddings()
        # Disabled until implemented / needed:
        # self.build_model()
        # self.setup_inception_initializer()
        # self.setup_global_step()

In [None]:
with tf.Graph().as_default():

    global_step = tf.contrib.framework.get_or_create_global_step()

    # NOTE(review): `images`, `n_classes`, `dropout`, `NUM_CLASSES` and the
    # helpers `_activation_summary`, `_variable_with_weight_decay` and
    # `_variable_on_cpu` are not defined in the visible cells; they must be
    # provided before this cell runs. `n_classes` and `NUM_CLASSES` look like
    # two names for the same quantity -- confirm and unify.
    #
    # The shape comments below assume 299 x 299 x 3 input images (image width
    # 299, image height 299) -- TODO confirm against the actual `images`.
    # With padding='SAME' a stride-1 convolution keeps the spatial size and a
    # stride-2 pool yields ceil(size / 2).

    weights = {
        # 5x5 conv, 3 input channels, 64 output channels.
        'wc1': tf.Variable(tf.random_normal([5, 5, 3, 64])),
        # 5x5 conv, 64 input channels, 192 output channels.
        'wc2': tf.Variable(tf.random_normal([5, 5, 64, 192])),
        # 5x5 conv, 192 input channels, 1024 output channels.
        'wc3': tf.Variable(tf.random_normal([5, 5, 192, 1024])),
        # 1024 inputs, n_classes outputs (class prediction).
        'out': tf.Variable(tf.random_normal([1024, n_classes])),
        # 'wd1' is added further down, once the size of the flattened pool3
        # output is known.
    }

    biases = {
        'bc1': tf.Variable(tf.random_normal([64])),
        'bc2': tf.Variable(tf.random_normal([192])),
        'bc3': tf.Variable(tf.random_normal([1024])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([n_classes])),
    }

    # conv1: 299 x 299 x 3 -> 299 x 299 x 64.
    with tf.variable_scope('conv1') as scope:
        conv = tf.nn.conv2d(images, weights['wc1'], [1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases['bc1'])
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1: 3x3 window, stride 2 -> 150 x 150 x 64.
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1: local response normalization keeps the shape.
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2: 150 x 150 x 64 -> 150 x 150 x 192.
    with tf.variable_scope('conv2') as scope:
        conv = tf.nn.conv2d(norm1, weights['wc2'], [1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases['bc2'])
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2: shape unchanged.
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2: 3x3 window, stride 2 -> 75 x 75 x 192.
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    # conv3: 75 x 75 x 192 -> 75 x 75 x 1024.
    with tf.variable_scope('conv3') as scope:
        # Must consume pool2 (192 channels); the earlier code passed norm1
        # (64 channels), which does not match the 192 input channels of wc3.
        conv = tf.nn.conv2d(pool2, weights['wc3'], [1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases['bc3'])
        conv3 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv3)

    # norm3: shape unchanged.
    norm3 = tf.nn.lrn(conv3, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm3')

    # pool3: 2x2 window, stride 2 -> 38 x 38 x 1024.
    pool3 = tf.nn.max_pool(norm3, ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool3')

    # Size the first dense layer from pool3's static shape instead of a
    # hard-coded 37 * 37 * 1024, which does not match 'SAME' pooling.
    pool3_dims = pool3.get_shape().as_list()[1:]
    if None in pool3_dims:
        raise ValueError(
            "pool3 needs a fully defined height/width/depth to size the "
            "dense layer, got %s" % pool3_dims)
    flat_dim = pool3_dims[0] * pool3_dims[1] * pool3_dims[2]
    # NOTE(review): for 299 x 299 inputs this matrix has 38*38*1024*1024
    # entries, which is very large -- consider more pooling before it.
    weights['wd1'] = tf.Variable(tf.random_normal([flat_dim, 1024]))

    # Fully connected layer followed by the class-prediction layer.
    with tf.variable_scope('fulllayer1') as scope:
        # Flatten the pool3 output (not conv2) to feed the dense layer.
        fc1 = tf.reshape(pool3, [-1, flat_dim])
        fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
        fc1 = tf.nn.relu(fc1)
        # Apply dropout; `dropout` is passed as the second positional
        # argument of tf.nn.dropout.
        fc1 = tf.nn.dropout(fc1, dropout)

        # Output, class prediction: [batch, n_classes].
        local3 = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled
    # logits and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        # Take the input width from local3 instead of a hard-coded 192, and
        # use separate names so the `weights`/`biases` dicts are not
        # shadowed.
        softmax_in_dim = local3.get_shape().as_list()[-1]
        softmax_weights = _variable_with_weight_decay(
            'weights', [softmax_in_dim, NUM_CLASSES],
            stddev=1.0 / softmax_in_dim, wd=0.0)
        softmax_biases = _variable_on_cpu(
            'biases', [NUM_CLASSES], tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local3, softmax_weights),
                                softmax_biases, name=scope.name)
        _activation_summary(softmax_linear)

    softmax_linear
