In [2]:
import tensorflow as tf 
from tensorflow import gfile
import tensorflow.contrib.slim as slim
from tensorflow import logging

import os 
import time 
import utils 
import eval_util
import readers 
import losses 

import video_level_models
import frame_level_models 

In [3]:
flags = {} 

model_name = "test_1"

flags["feature_names"]          =  "mean_rgb" 
flags["feature_sizes"]          =  "1024" 
flags["train_dir"]              =  "/Users/mohsenkiskani/yt8m/v2/models/video/"+model_name   
flags["train_data_pattern"]     =  "/Users/mohsenkiskani/yt8m/v2/video/train*.tfrecord" 
flags["batch_size"]             =  1024 
flags["regularization_penalty"] =  1.0 
flags["base_learning_rate"]     =  0.01 
flags["learning_rate_decay"]    =  0.95 
flags["num_epochs"]             =  5
flags["num_readers"]            =  8
flags["clip_gradient_norm"]     =  1.0
flags["log_device_placement"]   =  False
flags["learning_rate_decay_examples"] =  4000000 

In [4]:
def get_input_data_tensors(reader, data_pattern, batch_size=1000, num_epochs = None, num_readers=1):
    files = gfile.Glob(data_pattern)
    filename_queue = tf.train.string_input_producer(files, num_epochs=num_epochs, shuffle=True)
    training_data = [reader.prepare_reader(filename_queue) for _ in range(num_readers)]
    return tf.train.shuffle_batch_join(training_data,
                                batch_size=batch_size, 
                                capacity=5 * batch_size, 
                                min_after_dequeue= batch_size,
                                allow_smaller_final_batch=True,
                                enqueue_many=True) 

In [5]:
def build_graph(reader, model, train_data_pattern, label_loss = losses.CrossEntropyLoss(), 
                batch_size=1000, base_learning_rate=0.01, learning_rate_decay_examples=1000000,
                learning_rate_decay=0.95, optimizer_class=tf.train.AdamOptimizer, clip_gradient_norm=1.0,
                regularization_penalty=1, num_readers=1, num_epochs=None):
    
    global_step   = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(base_learning_rate, global_step * batch_size,
                                               learning_rate_decay_examples, learning_rate_decay, staircase=True)
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer     = tf.train.AdamOptimizer(learning_rate)
    
    unused_video_id, model_input_raw, labels_batch, num_frames = (get_input_data_tensors(
          reader,
          train_data_pattern,
          batch_size=batch_size ,
          num_readers=num_readers,
          num_epochs=num_epochs))
    
    tf.summary.histogram("model/input_raw", model_input_raw)
    feature_dim   = len(model_input_raw.get_shape()) - 1
    model_input   = tf.nn.l2_normalize(model_input_raw, feature_dim)
    
    tower_label_losses = []
    tower_reg_losses   = []
    tower_gradients    = []
    tower_predictions  = []

    with (slim.arg_scope([slim.model_variable, slim.variable])):
        result = model.create_model(
                    model_input=model_input_raw,
                    num_frames=num_frames,
                    vocab_size=reader.num_classes,
                    labels=labels_batch)

        for variable in slim.get_model_variables():
            tf.summary.histogram(variable.op.name, variable)

        predictions   = result["predictions"]
        tower_predictions.append(predictions)
        label_loss_fn = losses.CrossEntropyLoss()
        label_loss    = label_loss_fn.calculate_loss(predictions, labels_batch)
        reg_loss      = tf.constant(0.0)
        reg_losses    = tf.losses.get_regularization_losses()

        if reg_losses:
            reg_loss += tf.add_n(reg_losses)

        tower_reg_losses.append(reg_loss)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if update_ops:
                with tf.control_dependencies(update_ops):
                    barrier = tf.no_op(name="gradient_barrier")
                    with tf.control_dependencies([barrier]):
                        label_loss = tf.identity(label_loss)

        tower_label_losses.append(label_loss)
        final_loss = flags["regularization_penalty"] * reg_loss + label_loss
        gradients = optimizer.compute_gradients(final_loss, colocate_gradients_with_ops=False)
        tower_gradients.append(gradients)

    label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
    tf.summary.scalar("label_loss", label_loss)
    if flags["regularization_penalty"] != 0:
        reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
        tf.summary.scalar("reg_loss", reg_loss)
    merged_gradients = utils.combine_gradients(tower_gradients)

    if flags["clip_gradient_norm"] > 0:
        merged_gradients = utils.clip_gradient_norms(merged_gradients, flags["clip_gradient_norm"])

    train_op = optimizer.apply_gradients(merged_gradients, global_step=global_step)

    tf.add_to_collection("global_step", global_step)
    tf.add_to_collection("loss", label_loss)
    tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
    tf.add_to_collection("input_batch_raw", model_input_raw)
    tf.add_to_collection("input_batch", model_input)
    tf.add_to_collection("num_frames", num_frames)
    tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
    tf.add_to_collection("train_op", train_op)

In [6]:
def train(model, max_steps_reached = False, max_steps = None ):
    
    feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
                                   flags["feature_names"],flags["feature_sizes"])
    reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names, 
                                                 feature_sizes=feature_sizes)
    
    train_dir = flags["train_dir"]
    if not os.path.exists(train_dir):
          os.makedirs(train_dir)
    
    with tf.Graph().as_default() as graph:

        saver =  build_graph(reader=reader,
                             model=model,
                             clip_gradient_norm=flags["clip_gradient_norm"],
                             train_data_pattern=flags["train_data_pattern"],
                             base_learning_rate=flags["base_learning_rate"],
                             learning_rate_decay=flags["learning_rate_decay"],
                             learning_rate_decay_examples=flags["learning_rate_decay_examples"],
                             regularization_penalty=flags["regularization_penalty"],
                             num_readers=flags["num_readers"],
                             batch_size=flags["batch_size"],
                             num_epochs=flags["num_epochs"])
        
        global_step = tf.get_collection("global_step")[0]
        loss = tf.get_collection("loss")[0]
        predictions = tf.get_collection("predictions")[0]
        labels = tf.get_collection("labels")[0]
        train_op = tf.get_collection("train_op")[0]
        init_op = tf.global_variables_initializer()
        
    sv = tf.train.Supervisor(graph, logdir= train_dir, init_op=init_op, global_step=global_step,
                             save_model_secs=15 * 60, save_summaries_secs=120, saver=saver)

    config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=flags["log_device_placement"])
    
    with sv.managed_session("", config=config) as sess:
        try:
            while (not sv.should_stop()) and (not max_steps_reached):
                batch_start_time = time.time()
                _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
                    [train_op, global_step, loss, predictions, labels])
                seconds_per_batch = time.time() - batch_start_time
                examples_per_second = labels_val.shape[0] / seconds_per_batch

                if max_steps and max_steps <= global_step_val:
                    max_steps_reached = True

                if global_step_val % 10 == 0 and train_dir:
                    eval_start_time = time.time()
                    hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
                    perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val)
                    gap = eval_util.calculate_gap(predictions_val, labels_val)
                    eval_end_time = time.time()
                    eval_time = eval_end_time - eval_start_time

                    logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
                    " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
                    ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
                    " GAP: " + ("%.2f" % gap))

                    sv.summary_writer.add_summary(utils.MakeSummary("model/Training_Hit@1", hit_at_one),
                                                  global_step_val)
                    sv.summary_writer.add_summary(utils.MakeSummary("model/Training_Perr", perr), 
                                                  global_step_val)
                    sv.summary_writer.add_summary(utils.MakeSummary("model/Training_GAP", gap), 
                                                  global_step_val)
                    sv.summary_writer.add_summary(utils.MakeSummary("global_step/Examples/Second", 
                                                                    examples_per_second), global_step_val)
                    sv.summary_writer.flush()

        except tf.errors.OutOfRangeError:
            logging.info("Done training -- epoch limit reached.")

    logging.info("Exited training loop.")
    sv.Stop()

In [10]:
class LogisticModel():
    def create_model(self, model_input, vocab_size, l2_penalty=1e-8, **unused_params):
        output = slim.fully_connected(model_input, vocab_size, activation_fn=tf.nn.sigmoid,
                                      weights_regularizer=slim.l2_regularizer(l2_penalty))
        return {"predictions": output}

In [41]:
class MoeModel():
    def create_model(self, model_input, vocab_size, num_mixtures=5, l2_penalty=1e-8, **unused_params):
        
        gate_activations = slim.fully_connected(model_input,
                                                vocab_size * (num_mixtures + 1),
                                                activation_fn=None,
                                                biases_initializer=None,
                                                weights_regularizer=slim.l2_regularizer(l2_penalty),
                                                scope="gates")
        
        expert_activations = slim.fully_connected(model_input,
                                                  vocab_size * num_mixtures,
                                                  activation_fn=None,
                                                  weights_regularizer=slim.l2_regularizer(l2_penalty),
                                                  scope="experts")
        
        gating_distribution = tf.nn.softmax(tf.reshape(gate_activations,
                                            [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(tf.reshape(expert_activations,
                                            [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
        
        final_probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        
        final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, [-1, vocab_size])
        
        return {"predictions": final_probabilities}

In [53]:
class KerasMoeModel():
    def create_model(self, model_input, vocab_size, num_mixtures=5, l2_penalty=1e-8, **unused_params):
        
        gate_activations = Dense(vocab_size * (num_mixtures + 1), activation='softmax', 
                                 kernel_regularizer=regularizers.l2(l2_penalty))(model_input)
        
        expert_activations = Dense(vocab_size * num_mixtures, activation='sigmoid',
                                   kernel_regularizer=regularizers.l2(l2_penalty))(model_input)
        
        #gating_distribution = keras.layers.Flatten(gate_activations)
        #expert_distribution = keras.layers.Flatten(expert_activations)
        
        final_probabilities_by_class_and_batch = keras.layers.add(keras.layers.Multiply([
            gate_activations[:, :num_mixtures], expert_activations]))
        
        final_probabilities = final_probabilities_by_class_and_batch
        
        #gating_distribution = tf.nn.softmax(tf.reshape(gate_activations,
        #                                    [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
        #expert_distribution = tf.nn.sigmoid(tf.reshape(expert_activations,
        #                                    [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
        
        #final_probabilities_by_class_and_batch = tf.reduce_sum(
        #    gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        
        #final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, [-1, vocab_size])
        
        return {"predictions": final_probabilities}

In [55]:
#from keras.utils import plot_model
#from keras.models import Model
#from keras.layers import Input
from keras.layers import Dense
from keras.layers import Conv2D

from keras import regularizers
import keras 

class KerasLogisticModel():
    def create_model(self, model_input, vocab_size, l2_penalty=1e-6, **unused_params):
        #output = Dense(vocab_size, activation='sigmoid', 
        #               kernel_regularizer=regularizers.l2(l2_penalty))(model_input)
        
        output_1 = Conv2D(filters = [32, 32], kernel_size= 4, strides=1, padding='same', 
                          kernel_regularizer=regularizers.l2(l2_penalty))(model_input)
        output_2 = Dense(vocab_size, activation='sigmoid', 
                         kernel_regularizer=regularizers.l2(l2_penalty))(output_1)
        return {"predictions": output_2}

In [56]:
logging.set_verbosity(tf.logging.INFO)
#model = LogisticModel()
#model = MoeModel()
model = KerasLogisticModel()
#model = KerasMoeModel()
train(model)

ValueError: Input 0 is incompatible with layer conv2d_1: expected ndim=4, found ndim=2

In [29]:
logging.set_verbosity(tf.logging.INFO)
model = LogisticModel()
#model = MoeModel()
#model = KerasLogisticModel()
train(model)

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.
INFO:tensorflow:global_step/sec: 0
INFO:tensorflow:Recording summary at step 0.


KeyboardInterrupt: 

In [None]:
None +1