In [None]:
import tensorflow as tf
import numpy as np
from common import *

import logging
import os, time
import tflearn
import tflearn.helpers.summarizer as s
from io import StringIO
import copy
import pickle
from functools import partial
import datetime

logging.basicConfig(level=logging.DEBUG)

# Prepare word vectors

In [62]:
w2v_model = Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))

2017-08-22 17:31:05,986 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-08-22 17:31:09,212 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-08-22 17:31:09,214 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-08-22 17:31:11,186 : INFO : setting ignored attribute syn0norm to None
2017-08-22 17:31:11,189 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-08-22 17:31:13,711 : INFO : setting ignored attribute cum_table to None
2017-08-22 17:31:13,712 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [3]:
word_embeddings = w2v_model.wv.syn0.copy()
index2word = copy.deepcopy(w2v_model.wv.index2word)
del w2v_model

In [5]:
index2word.insert(0, 'PAD')
with open(join(DATA_FOLDER, "dictionary.pickle"), "wb") as output_file:
    pickle.dump(index2word, output_file)

In [8]:
stds = np.apply_along_axis(np.std, 1, word_embeddings)
pd.Series(stds).describe()

count    680760.000000
mean          0.193707
std           0.070564
min           0.000910
25%           0.137368
50%           0.178099
75%           0.244486
max           0.744395
dtype: float64

In [9]:
# 0.34 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
pad_vec = np.random.uniform(-0.34,0.34, word_embeddings.shape[1])
np.std(pad_vec)

0.19733612657600905

In [10]:
word_embeddings = np.insert(word_embeddings, 0, pad_vec, axis=0)

In [11]:
np.save(join(DATA_FOLDER, 'word_embeddings_%s.npy' % word_embeddings.shape[1]), word_embeddings)

# Load input data

In [2]:
word_embeddings = np.load(join(DATA_FOLDER, 'word_embeddings_300.npy'))
with open(join(DATA_FOLDER, "dictionary.pickle"), "rb") as input_file:
    index2word = pickle.load(input_file)

In [3]:
ids = glob('../data/corpus/**.txt')
with open(join(DATA_FOLDER, 'sims.json'), 'r') as f:
    sims = json.load(f)

In [10]:
# random select nagative examples
import random
random.seed(0)

def full_name(_id):
    return join(DATA_FOLDER, 'corpus/%s.txt' % _id)

def random_triples(sims, ids, num_epochs=1):
    """
    Get random triples, select negatives at random in each epoch.
    Output: [anchor, positive, negative]
    """
    ixs = list(range(len(ids)))
    for ep in range(num_epochs):
        random.shuffle(ixs)
        it = iter(ixs)
        for k, v in tqdm(sims.items()):
            exclude = [full_name(i) for i in [k] + v]
            for vi in v:
                ix = next(it)
                _neg = ids[ix]
                while _neg in exclude:
                    ix = next(it)
                    _neg = ids[ix]
                yield [full_name(k), full_name(vi), _neg]

# Input pipline

In [9]:
def parse_csv(text):
    with tf.name_scope('parse_csv'):
        strings = tf.string_split([text], delimiter='\n')
        raw_nums = tf.string_split(strings.values)
        nums = tf.string_to_number(raw_nums.values, tf.int32)
        dense = tf.sparse_to_dense(
            raw_nums.indices, raw_nums.dense_shape, nums, default_value=0)
        dense.set_shape(raw_nums.get_shape())
    return dense

def read_input_tuple(filename_queue):
    with tf.name_scope('read_input_tuple'):
        fnames = filename_queue.dequeue()
        example = []
        for fn in tf.unstack(fnames):
            record_string = tf.read_file(fn)
            arr = parse_csv(record_string)
            example.append(arr)
    return example

def input_pipeline(triples, batch_size, num_epochs=1):
    filename_queue = tf.train.input_producer(
        triples, num_epochs=num_epochs, capacity=32, shuffle=True, seed=0)
    example = read_input_tuple(filename_queue)

    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    anchor, positive, negative = tf.train.batch(
        example,
        batch_size=batch_size,
        capacity=capacity,
        dynamic_pad=True,
        #         allow_smaller_final_batch=True,
        num_threads=cpu_count)
    return anchor, positive, negative

# Model definition

In [64]:
class TextCNN(object):
    def __init__(self,
                 n_sents,
                 n_words,
                 vocab_size,
                 embedding_size,
                 sent_filter_sizes=[2,3,4,5],
                 sent_nb_filter=15,
                 doc_filter_sizes=[1,2,3],
                 doc_nb_filter=10,
                 sent_kmax=10,
                 doc_kmax=10):
        self.n_sents = n_sents
        self.n_words = n_words
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.sent_filter_sizes = sent_filter_sizes
        self.sent_nb_filter = sent_nb_filter
        self.doc_filter_sizes = doc_filter_sizes
        self.doc_nb_filter = doc_nb_filter
        self.sent_kmax = sent_kmax
        self.doc_kmax = doc_kmax

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.dropout_prob = tf.placeholder(tf.float32, name="dropout_prob")
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.constant(0.0, shape=[self.vocab_size, self.embedding_size]),
                trainable=False,
                name="W")

            self.embedding_placeholder = tf.placeholder(
                tf.float32, [self.vocab_size, self.embedding_size])
            self.embedding_init = W.assign(self.embedding_placeholder)

            embedded_words = tf.nn.embedding_lookup(W, X)
            self.embedded_words_expanded = tf.expand_dims(embedded_words, -1)

        with tf.variable_scope('sent'):
            self._create_sharable_weights(sent_filter_sizes, embedding_size,
                                          sent_nb_filter)
            self.sent_embedding_size = tf.convert_to_tensor(
                sent_kmax * sent_nb_filter * len(sent_filter_sizes))

        with tf.variable_scope('doc'):
            self._create_sharable_weights(
                doc_filter_sizes, self.sent_embedding_size, doc_nb_filter)
            self.doc_embedding_size = tf.convert_to_tensor(
                doc_kmax * doc_nb_filter * len(doc_filter_sizes))

    def inference(self, X):
        """ This is the forward calculation from batch X to doc embeddins """
        with tf.variable_scope('sent'):
            def convolv_on_sents(embeds):
                return self._convolv_on_embeddings(
                    embeds, self.sent_filter_sizes, self.sent_nb_filter,
                    self.sent_kmax)
            # iter over each document
            sent_embed = tf.map_fn(
                convolv_on_sents,
                self.embedded_words_expanded,
                parallel_iterations=10,
                name='iter_over_docs')
            # sent_embed shape is [batch, n_sent, sent_sent_kmax*sent_nb_filter*len(sent_filter_sizes), 1]

        with tf.variable_scope('doc'):
            # finally, convolv on documents
            doc_embed = self._convolv_on_embeddings(
                sent_embed, self.doc_filter_sizes, self.doc_nb_filter,
                self.doc_kmax)
            # doc_embed shape is [batch, doc_kmax*doc_nb_filter*len(doc_filter_sizes), 1]

        doc_embed_normalized = tf.nn.l2_normalize(
            doc_embed, dim=1, name='L2_nomalization')

        anchor, positive, negative = tf.unstack(
            tf.reshape(doc_embed_normalized, [-1, 3, self.doc_embedding_size]),
            3, 1)
        return anchor, positive, negative

    def loss(self, X):
        with tf.name_scope("loss"):
            anchor_embed, positive_embed, negative_embed = self.inference(X)
            _loss = self.triplet_loss(anchor_embed, positive_embed,
                                      negative_embed)
        return _loss

    def optimize(self, X):
        with tf.name_scope("optimize"):
            self.loss_op = self.loss(X)
            self.gradients = self.optimizer.compute_gradients(self.loss_op)
            apply_gradient_op = self.optimizer.apply_gradients(
                self.gradients, global_step=self.global_step)
        return apply_gradient_op

    def triplet_loss(self,
                     anchor_embed,
                     positive_embed,
                     negative_embed,
                     margin=0.2):
        """
        input: Three L2 normalized tensors of shape [None, dim], compute on a batch
        output: float
        """
        with tf.variable_scope('triplet_loss'):
            d_pos = tf.reduce_sum(tf.square(anchor_embed - positive_embed), 1)
            d_neg = tf.reduce_sum(tf.square(anchor_embed - negative_embed), 1)

            loss = tf.maximum(0., margin + d_pos - d_neg)
            loss = tf.reduce_mean(loss)

        return loss

    def _convolv_on_embeddings(self, embeds, filter_sizes, nb_filter, kmax):
        """
        Create a convolution + k-max pool layer for each filter size, then concat and vectorize.
        embeds shape is [batch, (n_words or n_sents), embedding_size, 1]
        """
        pooled_outputs = []
        for fsize in filter_sizes:
            with tf.name_scope("conv-%s" % fsize):
                with tf.variable_scope(
                        "conv_weights_fsize-%s" % fsize, reuse=True):
                    weights_init = tf.get_variable('W')
                    bias_init = tf.get_variable('b')
                conv = tf.nn.conv2d(
                    embeds,
                    weights_init,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")

                h = tf.nn.relu(tf.nn.bias_add(conv, bias_init), name="relu")
                tf.summary.histogram("relu", h)
                # h shape is [batch, n_words - fsize + 1, 1, nb_filter]
            with tf.name_scope('%s-maxpool-fsize-%s' % (kmax, fsize)):
                # k-maxpooling over the outputs
                trans = tf.transpose(h, perm=[0, 2, 3, 1])
                values, indices = tf.nn.top_k(trans, k=kmax, sorted=False)
                pooled = tf.transpose(values, perm=[0, 3, 1, 2])
                # pooled shape is [batch, kmax, 1, nb_filter]
                pooled_outputs.append(pooled)

        with tf.name_scope('concat_and_vectorize'):
            # Combine all the pooled features
            h_pool = tf.concat(pooled_outputs, 3)
            # h_pool shape is [batch, kmax, 1, nb_filter*len(filter_sizes)]

            # Vectorize filters for each sent to get sent embeddings
            trans = tf.transpose(h_pool, perm=[0, 2, 3, 1])
            batch = tf.shape(embeds)[0]
            sent_embed = tf.reshape(trans, [batch, -1, 1])
            # sent_embed shape is [batch, kmax*nb_filter*len(filter_sizes), 1]

        return sent_embed

    def _create_sharable_weights(self, filter_sizes, embedding_size,
                                 nb_filter):
        """ Create sharable weights for each type of convolution """
        with tf.name_scope('sharable_weights'):
            for fsize in filter_sizes:
                with tf.variable_scope("conv_weights_fsize-%s" % fsize):
                    filter_shape = [fsize, embedding_size, 1, nb_filter]
                    weights_init = tf.get_variable(
                        'W',
                        initializer=tf.truncated_normal(
                            filter_shape, stddev=0.1))
                    bias_init = tf.get_variable(
                        'b', initializer=tf.constant(0.1, shape=[nb_filter]))

In [65]:
ids1000 = random.sample(list(sims), 64*2)
triples = list(random_triples({k:sims[k] for k in ids1000}, ids, num_epochs=1))
triples[10:15]

100%|██████████| 128/128 [00:00<00:00, 32918.69it/s]


[['../data/corpus/5984d139b6b113440d63850d.txt',
  '../data/corpus/5984c783b6b11367c5638508.txt',
  '../data/corpus/5984cb09b6b1130b98638529.txt'],
 ['../data/corpus/5984d139b6b113440d63850d.txt',
  '../data/corpus/5984cc18b6b11318d1638546.txt',
  '../data/corpus/5984b6abb6b113168c63850f.txt'],
 ['../data/corpus/5984d977b6b11315fc63853a.txt',
  '../data/corpus/5984c753b6b1136591638533.txt',
  '../data/corpus/5984d428b6b113639c63852e.txt'],
 ['../data/corpus/5984d977b6b11315fc63853a.txt',
  '../data/corpus/5984c233b6b1132dbd638519.txt',
  '../data/corpus/5984b75ab6b113230c638547.txt'],
 ['../data/corpus/5984bb8fb6b1135afd638512.txt',
  '../data/corpus/5984b8dcb6b113393f63850d.txt',
  '../data/corpus/5984bac2b6b1134fb5638537.txt']]

In [66]:
len(triples)

262

In [70]:
start_time = time.time()

vocab_size, embedding_size = word_embeddings.shape
# X = tf.placeholder(tf.int32, [None, n_sents, n_words], name="X")

g = tf.Graph()
with g.as_default():
    tf.set_random_seed(0)

    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        anchor_batch, positive_batch, negative_batch = input_pipeline(
            triples, batch_size=64, num_epochs=1)
        X = tf.concat(
            [anchor_batch, positive_batch, negative_batch],
            axis=0,
            name='concat_tupples')

        with tf.name_scope('init_model'):
            with tf.name_scope('batch_shape'):
                n_sents = tf.shape(X)[1]
                n_words = tf.shape(X)[2]
            model = TextCNN(
                n_sents,
                n_words,
                vocab_size,
                embedding_size,
                sent_filter_sizes=[2, 3, 4, 5],
                sent_nb_filter=15,
                doc_filter_sizes=[1, 2, 3],
                doc_nb_filter=10,
                sent_kmax=10,
                doc_kmax=10)
        train_op = model.optimize(X)

        init_local = tf.local_variables_initializer()
        init_global = tf.global_variables_initializer()
        sess.run([init_global, init_local])

        tf.summary.scalar("loss", model.loss_op)
        # Create summaries to visualize weights
        for var in tf.trainable_variables():
            tf.summary.histogram(var.name.replace(':', '_'), var)
        # Summarize all gradients
        for grad, var in model.gradients:
            tf.summary.histogram(var.name.replace(':', '_') + '/gradient', grad)
        merged_summary_op = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('../data/summary', sess.graph)

        # Assign word embeddings to variable W
        #!!!! index is shifted by 1
        sess.run(
            model.embedding_init,
            feed_dict={model.embedding_placeholder: word_embeddings})

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                _, step, loss, summary = sess.run([
                    train_op, model.global_step, model.loss_op,
                    merged_summary_op
                ])
                current_step = tf.train.global_step(sess, model.global_step)
                train_writer.add_summary(summary, current_step)
                print(current_step, loss)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

print("--- %s seconds ---" % (time.time() - start_time))

InvalidArgumentError: The node 'Merge/MergeSummary' has inputs from different frames. The input 'optimize/loss/sent/iter_over_docs/while/conv-5/relu_1' is in frame 'optimize/loss/sent/iter_over_docs/while/optimize/loss/sent/iter_over_docs/while/'. The input 'doc/conv_weights_fsize-3/b_0/gradient' is in frame ''.

In [72]:
1200000*1000/(93*60*60)

3584.2293906810037

In [32]:
g = tf.Graph()
with g.as_default():
    tf.set_random_seed(0)
    sess = tf.Session()
    with sess.as_default():

        with tf.variable_scope('var'):
            t = tf.get_variable('w', initializer=tf.constant(0), dtype=tf.int32)
        
        r = t*2
        
        with tf.variable_scope('var', reuse=True):
            tt = tf.get_variable('w', initializer=tf.constant(0), dtype=tf.int32)
        
        s = tt+44
        
        sess.run(tf.global_variables_initializer())
        
        res = sess.run([t, r, s])
        print(res)

        train_writer = tf.summary.FileWriter('../data/summary', sess.graph)

[0, 0, 44]
