In [108]:
import tensorflow as tf
import numpy as np
from common import *

import logging
import os, time
import tflearn
from io import StringIO
import copy
import pickle
from functools import partial

# Configure the root logger so that records of level DEBUG and above are emitted.
logging.basicConfig(level=logging.DEBUG)

# Prepare word vectors

In [2]:
# Load the saved Word2Vec model from DATA_FOLDER/vectors/w2v_model_300_w10.
# NOTE(review): Word2Vec, join and DATA_FOLDER are not imported explicitly in
# the visible cells; presumably they come from `from common import *` -- confirm.
w2v_model = Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))

2017-08-17 14:20:05,951 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-08-17 14:20:08,703 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-08-17 14:20:08,705 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-08-17 14:20:09,007 : INFO : setting ignored attribute syn0norm to None
2017-08-17 14:20:09,008 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-08-17 14:20:09,303 : INFO : setting ignored attribute cum_table to None
2017-08-17 14:20:09,304 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [3]:
# Take independent copies of the vocabulary list and of the embedding matrix,
# then drop the model object itself from the kernel namespace.
index2word = copy.deepcopy(w2v_model.wv.index2word)
word_embeddings = w2v_model.wv.syn0.copy()
del w2v_model

In [5]:
# Reserve vocabulary index 0 for the padding token and persist the vocabulary.
# The guard keeps this cell idempotent: re-running it must not push a second
# 'PAD' entry, which would shift every word index by one more position.
if not index2word or index2word[0] != 'PAD':
    index2word.insert(0, 'PAD')
with open(join(DATA_FOLDER, "dictionary.pickle"), "wb") as output_file:
    pickle.dump(index2word, output_file)

In [8]:
# Standard deviation of every word vector (one value per row), computed with a
# single vectorized reduction instead of one Python-level np.std call per row.
stds = np.std(word_embeddings, axis=1)
# Summary statistics of the per-word standard deviations.
pd.Series(stds).describe()

count    680760.000000
mean          0.193707
std           0.070564
min           0.000910
25%           0.137368
50%           0.178099
75%           0.244486
max           0.744395
dtype: float64

In [9]:
# 0.34 is chosen so the PAD vector has (approximately) the same standard
# deviation as the pre-trained vectors: uniform(-a, a) has std a / sqrt(3),
# i.e. about 0.196 for a = 0.34, against a mean per-word std of about 0.194.
# The vector is drawn from a dedicated, seeded generator so that the embedding
# file written later is reproducible and NumPy's global random state is left
# untouched.
pad_vec = np.random.RandomState(0).uniform(-0.34, 0.34, word_embeddings.shape[1])
np.std(pad_vec)

0.19733612657600905

In [10]:
# Put the PAD vector in front of the matrix as a new row 0.
# NOTE: every execution of this cell adds one more row.
word_embeddings = np.insert(arr=word_embeddings, obj=0, values=pad_vec, axis=0)

In [11]:
# Write the matrix to DATA_FOLDER; the file name carries the number of columns
# (the embedding dimension).
np.save(
    file=join(DATA_FOLDER, 'word_embeddings_%s.npy' % word_embeddings.shape[1]),
    arr=word_embeddings,
)

# Load input data

In [2]:
# Restore the vocabulary list and the embedding matrix from DATA_FOLDER.
# The pickle is only safe to load because it is a file this project wrote.
with open(join(DATA_FOLDER, "dictionary.pickle"), "rb") as input_file:
    index2word = pickle.load(input_file)
word_embeddings = np.load(join(DATA_FOLDER, 'word_embeddings_300.npy'))

In [3]:
# Collect the corpus text files and keep at most the first 1000 matches.
# NOTE(review): glob does not guarantee any particular order, so which files
# are kept (and how they are later grouped) may differ between machines --
# confirm whether a deterministic ordering is needed.
filenames = glob('../data/corpus/*.txt')
filenames = filenames[:1000]

In [4]:
# Drop the trailing files that do not fill a complete triple, then arrange the
# paths as one triple per row (shape [n_triples, 3]).
# The end index is computed explicitly: slicing with `[:-(n % 3)]` would turn
# into `[:0]` -- an empty list -- whenever n is already a multiple of 3.
filenames = np.reshape(filenames[:len(filenames) - len(filenames) % 3], (-1, 3))

# Input pipeline

In [5]:
def parse_csv(text):
    """Convert the text content of one file into a dense 2-D int32 tensor.

    The text is first split into lines on the newline character; every line
    is then split with the default delimiter of ``tf.string_split`` and the
    resulting tokens are parsed as int32 numbers.

    Args:
        text: scalar string tensor holding the whole file content.

    Returns:
        A dense tensor with one row per line. Positions for which a line has
        no token (lines shorter than the longest one) are filled with 0.
    """
    lines = tf.string_split([text], delimiter='\n')
    tokens = tf.string_split(lines.values)
    numbers = tf.string_to_number(tokens.values, tf.int32)
    # Scatter the parsed numbers back to their (line, position) coordinates.
    return tf.sparse_to_dense(tokens.indices,
                              tokens.dense_shape,
                              numbers,
                              default_value=0)

def read_input_tuple(filename_queue):
    """Dequeue one group of file names and parse every file in it.

    Args:
        filename_queue: queue whose ``dequeue()`` yields a tensor of file
            names; it is unstacked along its first axis.

    Returns:
        A list with one ``parse_csv`` result per file name, in the order the
        names appear in the dequeued tensor.
    """
    group = filename_queue.dequeue()
    return [parse_csv(tf.read_file(path)) for path in tf.unstack(group)]

def input_pipeline(filenames, batch_size, num_epochs=None):
    """Build the queue-based input graph that yields batches of triples.

    File-name groups are shuffled (fixed seed 0) through an input producer,
    each file of a group is parsed by ``read_input_tuple`` and the parsed
    tensors are batched with dynamic padding.

    Args:
        filenames: groups of file names handed to ``tf.train.input_producer``.
        batch_size: number of groups per batch.
        num_epochs: passed through to the input producer (None by default).

    Returns:
        ``(anchor, positive, negative)`` batch tensors. The files of a group
        are read in the order positive, negative, anchor and re-ordered here.
    """
    filename_queue = tf.train.input_producer(
        filenames, num_epochs=num_epochs, capacity=32, shuffle=True, seed=0)
    parsed_files = read_input_tuple(filename_queue)

    # Batching queue capacity: a fixed 10000 plus room for three batches.
    batch_capacity = 10000 + 3 * batch_size
    # allow_smaller_final_batch is left at its default here.
    batched = tf.train.batch(
        parsed_files,
        batch_size=batch_size,
        capacity=batch_capacity,
        dynamic_pad=True,
        num_threads=cpu_count)
    positive, negative, anchor = batched
    return anchor, positive, negative

# Model definition

In [10]:
class TextCNN(object):
    """Hierarchical CNN that embeds documents for triplet-loss training.

    Word ids are looked up in a non-trainable embedding matrix. Every sentence
    is encoded by convolutions + k-max pooling over its words, and every
    document is encoded the same way over its sentence vectors. The final
    document vectors are L2-normalized and split into (anchor, positive,
    negative) groups.

    Args:
        n_sents: number of sentences per document (second input dimension).
        n_words: number of words per sentence (third input dimension).
        vocab_size: number of rows of the embedding matrix.
        embedding_size: number of columns of the embedding matrix.
        sent_filter_sizes: convolution heights used on the word level.
        sent_nb_filter: number of filters per word-level filter size.
        doc_filter_sizes: convolution heights used on the sentence level.
        doc_nb_filter: number of filters per sentence-level filter size.
        sent_kmax: k of the k-max pooling on the word level.
        doc_kmax: k of the k-max pooling on the sentence level.

    Note:
        ``inference`` creates variables with ``tf.get_variable`` under fixed
        scope names, so it can be built only once per graph. ``loss`` and
        ``optimize`` both build it; call only one of them per graph.
    """

    def __init__(self, n_sents, n_words, vocab_size, embedding_size,
                 sent_filter_sizes=(3, 4, 5), sent_nb_filter=5,
                 doc_filter_sizes=(3,), doc_nb_filter=5,
                 sent_kmax=10, doc_kmax=10):
        self.n_sents = n_sents
        self.n_words = n_words
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        # Copy into fresh lists so instances never share (or mutate) a default.
        self.sent_filter_sizes = list(sent_filter_sizes)
        self.sent_nb_filter = sent_nb_filter
        self.doc_filter_sizes = list(doc_filter_sizes)
        self.doc_nb_filter = doc_nb_filter
        self.sent_kmax = sent_kmax
        self.doc_kmax = doc_kmax

    @property
    def sent_embedding_size(self):
        """Length of one sentence vector: kmax * nb_filter * number of filter sizes."""
        return self.sent_kmax * self.sent_nb_filter * len(self.sent_filter_sizes)

    @property
    def doc_embedding_size(self):
        """Length of one document vector: kmax * nb_filter * number of filter sizes."""
        return self.doc_kmax * self.doc_nb_filter * len(self.doc_filter_sizes)

    @staticmethod
    def _weights_scope(fsize):
        """Variable-scope name holding the shared conv weights of one filter size.

        Only characters accepted by TensorFlow scope names are used (no spaces
        and no '=').
        """
        return "shared_conv_weights_fsize_%s" % fsize

    def inference(self, batch_x):
        """Build the forward graph from word ids to normalized document vectors.

        Args:
            batch_x: integer tensor/array of shape [batch, n_sents, n_words],
                or None. When given, it becomes the default value of
                ``self.X`` (which can still be overridden through
                ``feed_dict``); when None, ``self.X`` is a plain placeholder.
                The batch has to hold the triples in consecutive rows
                (anchor, positive, negative, anchor, ...), so its size must be
                a multiple of 3.

        Returns:
            ``(anchor, positive, negative)``: three tensors of shape
            [batch / 3, doc_embedding_size].
        """
        # Placeholders for input and dropout
        x_shape = [None, self.n_sents, self.n_words]
        if batch_x is None:
            self.X = tf.placeholder(tf.int32, x_shape, name="X")
        else:
            self.X = tf.placeholder_with_default(batch_x, x_shape, name="X")
        # Kept for callers that feed it; no op of this graph consumes it yet.
        self.dropout_prob = tf.placeholder(tf.float32, name="dropout_prob")

        # Embedding layer: W starts as zeros and is filled with the pre-trained
        # vectors by running `embedding_init` with `embedding_placeholder` fed.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.constant(0.0, shape=[self.vocab_size, self.embedding_size]),
                trainable=False, name="W")

            self.embedding_placeholder = tf.placeholder(
                tf.float32, [self.vocab_size, self.embedding_size])
            self.embedding_init = self.W.assign(self.embedding_placeholder)

            self.embedded_words = tf.nn.embedding_lookup(self.W, self.X)
            # shape is [batch, n_sents, n_words, embedding_size, 1]
            self.embedded_words_expanded = tf.expand_dims(self.embedded_words, -1)

        with tf.variable_scope('sent'):
            self._create_sharable_weights(
                self.sent_filter_sizes, self.embedding_size, self.sent_nb_filter)

            # Iterate over documents: inside the map, the sentences of one
            # document play the role of the conv batch dimension.
            def convolv_on_sents(embeds):
                return self._convolv_on_embeddings(
                    embeds, self.sent_filter_sizes,
                    self.sent_nb_filter, self.sent_kmax)

            sent_embed = tf.map_fn(convolv_on_sents,
                                   self.embedded_words_expanded,
                                   parallel_iterations=10,
                                   name='iter_over_docs')
            # sent_embed shape is [batch, n_sents, sent_embedding_size, 1]

        with tf.variable_scope('doc'):
            # The doc-level filters span a whole sentence vector. Their width
            # must be a static Python int because it sizes a variable.
            self._create_sharable_weights(
                self.doc_filter_sizes, self.sent_embedding_size, self.doc_nb_filter)
            # Finally, convolve over the sentences of every document.
            doc_embed = self._convolv_on_embeddings(
                sent_embed, self.doc_filter_sizes,
                self.doc_nb_filter, self.doc_kmax)
            # doc_embed shape is [batch, doc_embedding_size, 1]

        with tf.name_scope('l2_normalization'):
            doc_embed = tf.reshape(doc_embed, [-1, self.doc_embedding_size])
            # Normalize every document vector (axis 1) to unit length.
            doc_embed = tf.nn.l2_normalize(doc_embed, 1)

        # Consecutive rows form one (anchor, positive, negative) triple.
        triplets = tf.reshape(doc_embed, [-1, 3, self.doc_embedding_size])
        anchor, positive, negative = tf.unstack(triplets, 3, 1)
        return anchor, positive, negative

    def loss(self, batch_x):
        """Build the forward graph for ``batch_x`` and return its triplet loss."""
        with tf.name_scope("loss"):
            anchor_embed, positive_embed, negative_embed = self.inference(batch_x)
        return self.triplet_loss(anchor_embed, positive_embed, negative_embed)

    def optimize(self, batch_x):
        """Return an Adam (learning rate 1e-3) op minimizing the triplet loss.

        The loss graph is built here, so do not also call ``loss`` on the
        same graph.
        """
        return tf.train.AdamOptimizer(1e-3).minimize(
            self.loss(batch_x), name="optimizer")

    @staticmethod
    def triplet_loss(anchor_embed, positive_embed, negative_embed, margin=0.2):
        """Mean hinge triplet loss over a batch.

        Args:
            anchor_embed, positive_embed, negative_embed: three L2 normalized
                tensors of shape [None, dim].
            margin: required gap between the squared anchor-negative and
                anchor-positive distances.

        Returns:
            Scalar tensor: mean over the batch of
            max(0, margin + d_pos - d_neg).
        """
        with tf.name_scope('triplet_loss'):
            d_pos = tf.reduce_sum(tf.square(anchor_embed - positive_embed), 1)
            d_neg = tf.reduce_sum(tf.square(anchor_embed - negative_embed), 1)

            loss = tf.maximum(0., margin + d_pos - d_neg)
            loss = tf.reduce_mean(loss)

        return loss

    def _convolv_on_embeddings(self, embeds, filter_sizes, nb_filter, kmax):
        """Convolution + k-max pooling per filter size, concatenated and flattened.

        The weights must already exist in the current variable scope (see
        ``_create_sharable_weights``); they are fetched with ``reuse=True``.

        Args:
            embeds: tensor of shape
                [batch, (n_words or n_sents), embedding_size, 1].
            filter_sizes: convolution heights to apply.
            nb_filter: number of filters per filter size.
            kmax: number of largest activations kept per filter.

        Returns:
            Tensor of shape [batch, kmax * nb_filter * len(filter_sizes), 1].
        """
        pooled_outputs = []
        for fsize in filter_sizes:
            with tf.name_scope("conv-%s" % fsize):
                with tf.variable_scope(self._weights_scope(fsize), reuse=True):
                    weights = tf.get_variable('W')
                    bias = tf.get_variable('b')
                conv = tf.nn.conv2d(
                    embeds,
                    weights,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")

                h = tf.nn.relu(tf.nn.bias_add(conv, bias), name="relu")
                # h shape is [batch, n_words - fsize + 1, 1, nb_filter]
            with tf.name_scope('k-maxpool'):
                # top_k works on the last axis, so move the position axis
                # there, keep the k largest values and move it back.
                trans = tf.transpose(h, perm=[0, 2, 3, 1])
                values, _ = tf.nn.top_k(trans, k=kmax, sorted=False)
                pooled = tf.transpose(values, perm=[0, 3, 1, 2])
                # pooled shape is [batch, kmax, 1, nb_filter]
                pooled_outputs.append(pooled)

        with tf.name_scope('concat_and_vectorize'):
            # Combine all the pooled features
            h_pool = tf.concat(pooled_outputs, 3)
            # h_pool shape is [batch, kmax, 1, nb_filter*len(filter_sizes)]

            # Flatten the pooled features of every row into one vector. The
            # flattened size is a static int, which keeps the static shape
            # known for the layers stacked on top.
            trans = tf.transpose(h_pool, perm=[0, 2, 3, 1])
            flat_size = kmax * nb_filter * len(filter_sizes)
            sent_embed = tf.reshape(trans, [-1, flat_size, 1])
            # sent_embed shape is [batch, kmax*nb_filter*len(filter_sizes), 1]

        return sent_embed

    @staticmethod
    def _create_sharable_weights(filter_sizes, embedding_size, nb_filter):
        """Create the conv weights 'W' and bias 'b' for every filter size.

        Variables are created in the variable scope that is active at call
        time, one sub-scope per filter size.

        Args:
            filter_sizes: convolution heights.
            embedding_size: static (Python int) width of the convolved rows.
            nb_filter: number of filters per filter size.
        """
        for fsize in filter_sizes:
            with tf.variable_scope(TextCNN._weights_scope(fsize)):
                filter_shape = [fsize, embedding_size, 1, nb_filter]
                tf.get_variable(
                    'W',
                    initializer=tf.truncated_normal(filter_shape, stddev=0.1))
                # The initializer has to be passed by keyword: the second
                # positional parameter of tf.get_variable is `shape`.
                tf.get_variable(
                    'b',
                    initializer=tf.constant(0.1, shape=[nb_filter]))
                        

In [113]:
# Sanity check of the triple split: a [9, 3] tensor is reshaped to [3, 3, 3]
# and unstacked along axis 1, so rows 0, 3, 6 form `anchor`, rows 1, 4, 7
# `positive` and rows 2, 5, 8 `negative`.
g = tf.Graph()
with g.as_default():
    tf.set_random_seed(0)
    # The context manager installs the session as default and closes it on
    # exit, releasing its resources.
    with tf.Session() as sess:

        t = tf.convert_to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9],
                                  [10, 21, 32], [34, 55, 66], [71, 85, 59],
                                  [12, 223, 34], [468, 85, 61], [73, 3843, 94]])
        anchor, positive, negative = tf.unstack(tf.reshape(t, [-1, 3, 3]), 3, 1)

        res = sess.run([t, anchor, positive, negative])
        print(res)

        # Dump the graph for TensorBoard, then close the writer so the event
        # file is flushed to disk.
        train_writer = tf.summary.FileWriter('../data/summary',
                                             sess.graph)
        train_writer.close()

[array([[   1,    2,    3],
       [   4,    5,    6],
       [   7,    8,    9],
       [  10,   21,   32],
       [  34,   55,   66],
       [  71,   85,   59],
       [  12,  223,   34],
       [ 468,   85,   61],
       [  73, 3843,   94]], dtype=int32), array([[  1,   2,   3],
       [ 10,  21,  32],
       [ 12, 223,  34]], dtype=int32), array([[  4,   5,   6],
       [ 34,  55,  66],
       [468,  85,  61]], dtype=int32), array([[   7,    8,    9],
       [  71,   85,   59],
       [  73, 3843,   94]], dtype=int32)]


In [96]:
# Scratch cell: evaluates a nested list literal (2 rows x 3 columns); nothing is bound to a name.
[[1,2,3],[4,5,6]]

[[1, 2, 3], [4, 5, 6]]

In [None]:
# Command-line style hyper-parameters, defined through tf.flags.

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")

# Model Hyperparameters
# The default must be a plain integer: the number of columns of the embedding
# matrix loaded in the "Load input data" section (not a row of that matrix).
tf.flags.DEFINE_integer("embedding_dim", int(word_embeddings.shape[1]), "Dimensionality of word embedding")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): `_parse_flags` and `__flags` are private members of the flags
# object; they may not exist in other TensorFlow versions -- confirm against
# the installed release.
FLAGS._parse_flags()
# Echo every parameter in alphabetical order.
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================


# train/dev split here


# Training
# ==================================================

# Training script: builds a fresh graph and session, constructs the model, an
# Adam train op with per-gradient summaries, summary writers and a checkpoint
# saver below runs/<timestamp>, then loops over batches with periodic dev
# evaluation and checkpointing.
#
# NOTE(review): this cell has never been executed (In [None]) and does not
# match the TextCNN class defined in this notebook:
#   * TextCNN.__init__ takes n_sents, n_words, vocab_size, embedding_size,
#     sent_filter_sizes, ...; the keywords num_classes, filter_sizes,
#     num_filters and l2_reg_lambda used below are not accepted by it.
#   * `cnn.loss` is a method of that class, not a tensor, and the class
#     defines no `accuracy`, `input_x`, `input_y` or `dropout_keep_prob`
#     attribute (its placeholders are named `X` and `dropout_prob`).
#   * y_train, vocab_processor, x_train, x_dev, data_helpers, datetime and wv
#     are not defined or imported in the visible cells (they may come from
#     `from common import *` -- confirm).
#   * The session created here is never closed.
with tf.Graph().as_default(): 
#  If you would like TensorFlow to automatically choose an existing and supported device to 
#  run the operations in case the specified one doesn't exist, you can set allow_soft_placement to True
    
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        # global_step is incremented by apply_gradients on every training step.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        # NOTE(review): the loop variable `g` rebinds the name `g` that other
        # cells of this notebook use for a tf.Graph.
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        # One directory per run, named after the current Unix timestamp.
        timestamp = str(int(time.time()))
        out_dir = abspath(join(curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = abspath(join(out_dir, "checkpoints"))
        checkpoint_prefix = join(checkpoint_dir, "model")
        if not exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        
        # Assign word embeddings to variable W
        # NOTE(review): the embedding matrix saved earlier in this notebook has
        # an extra PAD row at index 0; presumably that matrix (not wv.syn0) is
        # what should be fed here -- confirm.
        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: wv.syn0}) #!!!! index is shifted by 1

        def train_step(x_batch, y_batch):
            """
            Run one optimization step on a batch, print the step, loss and
            accuracy, and write the training summaries for that step.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluate the model on a dev batch without running the train op
            (dropout keep probability fed as 1.0); summaries are written only
            when a writer is given.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            # Evaluate and checkpoint every `evaluate_every` / `checkpoint_every` steps.
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

In [92]:
# Smoke run of the input pipeline: read one epoch of triples in batches of 64,
# print the shapes of every batch and report the elapsed wall-clock time.
start_time = time.time()

g = tf.Graph()
with g.as_default():
    tf.set_random_seed(0)
    # The context manager installs the session as default and closes it on
    # exit, after the queue-runner threads have been joined.
    with tf.Session() as sess:
        res = input_pipeline(filenames, batch_size=64, num_epochs=1)

        # Local variables are initialized too: the input producer is built
        # with num_epochs=1 here.
        init_local = tf.local_variables_initializer()
        init_global = tf.global_variables_initializer()
        sess.run([init_global, init_local])

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                [fnq] = sess.run([res])
                print([arr.shape for arr in fnq])
        except tf.errors.OutOfRangeError:
            # Raised once the input queues are exhausted.
            print('Done training -- epoch limit reached')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

print("--- %s seconds ---" % (time.time() - start_time))

[(64, 123, 40), (64, 123, 40), (64, 123, 40)]
[(64, 123, 40), (64, 123, 40), (64, 123, 40)]
[(64, 123, 40), (64, 123, 40), (64, 123, 40)]
[(64, 123, 40), (64, 123, 40), (64, 123, 40)]
[(64, 123, 40), (64, 123, 40), (64, 123, 40)]
Done training -- epoch limit reached
--- 0.16971611976623535 seconds ---


In [109]:
# Scratch arithmetic: true division of 512 by 3 (not a whole number).
# NOTE(review): presumably a check of whether 512 items split evenly into triples -- confirm or remove.
512/3

170.66666666666666