In [1]:
import tensorflow as tf
import numpy as np
from common import *

import logging
import os, time
import tflearn
import tflearn.helpers.summarizer as s
from io import StringIO
import copy
import pickle
from functools import partial
import datetime

from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.DEBUG)

# Prepare word vectors

In [3]:
w2v_model = Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))

2017-08-23 10:20:03,354 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-08-23 10:20:06,546 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-08-23 10:20:06,548 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-08-23 10:20:06,849 : INFO : setting ignored attribute syn0norm to None
2017-08-23 10:20:06,851 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-08-23 10:20:07,159 : INFO : setting ignored attribute cum_table to None
2017-08-23 10:20:07,160 : INFO : loaded ../data/vectors/w2v_model_300_w10


In [4]:
word_embeddings = w2v_model.wv.syn0.copy()
index2word = copy.deepcopy(w2v_model.wv.index2word)
del w2v_model

In [5]:
index2word.insert(0, 'PAD')
with open(join(DATA_FOLDER, "dictionary.pickle"), "wb") as output_file:
    pickle.dump(index2word, output_file)

In [6]:
stds = np.apply_along_axis(np.std, 1, word_embeddings)
pd.Series(stds).describe()

count    680760.000000
mean          0.193705
std           0.070574
min           0.000910
25%           0.137368
50%           0.178099
75%           0.244486
max           0.744395
dtype: float64

In [7]:
# 0.34 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
pad_vec = np.random.uniform(-0.34,0.34, word_embeddings.shape[1])
np.std(pad_vec)

0.18944918087888768

In [8]:
word_embeddings = np.insert(word_embeddings, 0, pad_vec, axis=0)

In [9]:
np.save(join(DATA_FOLDER, 'word_embeddings_%s.npy' % word_embeddings.shape[1]), word_embeddings)

# Load input data

In [2]:
word_embeddings = np.load(join(DATA_FOLDER, 'word_embeddings_300.npy'))
with open(join(DATA_FOLDER, "dictionary.pickle"), "rb") as input_file:
    index2word = pickle.load(input_file)

In [3]:
ids = glob('../data/corpus/**.txt')
with open(join(DATA_FOLDER, 'sims.json'), 'r') as f:
    sims = json.load(f)

In [4]:
# random select nagative examples
import random

def full_name(_id):
    return join(DATA_FOLDER, 'corpus/%s.txt' % _id)

def random_triples(sims, ids, num_epochs=1, seed=0):
    """
    Get random triples, select negatives at random in each epoch.
    Output: [anchor, positive, negative]
    """
    random.seed(0)
    ixs = list(range(len(ids)))
    for ep in range(num_epochs):
        random.shuffle(ixs)
        it = iter(ixs)
        for k, v in tqdm(sims.items()):
            exclude = [full_name(i) for i in [k] + v]
            for vi in v:
                ix = next(it)
                _neg = ids[ix]
                while _neg in exclude:
                    ix = next(it)
                    _neg = ids[ix]
                yield [full_name(k), full_name(vi), _neg]

# Input pipline

In [106]:
def parse_csv(text):
    with tf.name_scope('parse_csv'):
        strings = tf.string_split([text], delimiter='\n')
        raw_nums = tf.string_split(strings.values)
        nums = tf.string_to_number(raw_nums.values, tf.int32)
        dense = tf.sparse_to_dense(
            raw_nums.indices, raw_nums.dense_shape, nums, default_value=0)
        dense.set_shape(raw_nums.get_shape())
    return dense

def read_input_tuple(filename_queue):
    with tf.name_scope('read_input_tuple'):
        fnames = filename_queue.dequeue()
        example = []
        for fn in tf.unstack(fnames):
            record_string = tf.read_file(fn)
            arr = parse_csv(record_string)
            example.append(arr)
        example.append(fnames)
    return example

def input_pipeline(triples, batch_size, num_epochs=1, num_threads=cpu_count, shuffle=True):
    filename_queue = tf.train.input_producer(
        triples, num_epochs=num_epochs, capacity=32, shuffle=shuffle, seed=0)
    example = read_input_tuple(filename_queue)

    min_after_dequeue = 10000
    capacity = min_after_dequeue + 3 * batch_size
    anchor, positive, negative, fnames = tf.train.batch(
        example,
        batch_size=batch_size,
        capacity=capacity,
        dynamic_pad=True,
        #         allow_smaller_final_batch=True,
        num_threads=num_threads)
    return anchor, positive, negative, fnames

# Model definition

In [28]:
class TextCNN(object):
    def __init__(self,
                 n_sents,
                 n_words,
                 vocab_size,
                 embedding_size,
                 sent_filter_sizes=[2, 3, 4, 5],
                 sent_nb_filter=15,
                 doc_filter_sizes=[1, 2, 3],
                 doc_nb_filter=10,
                 sent_kmax=10,
                 doc_kmax=10,
                 learning_rate=0.001):
        self.n_sents = n_sents
        self.n_words = n_words
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.sent_filter_sizes = sent_filter_sizes
        self.sent_nb_filter = sent_nb_filter
        self.doc_filter_sizes = doc_filter_sizes
        self.doc_nb_filter = doc_nb_filter
        self.sent_kmax = sent_kmax
        self.doc_kmax = doc_kmax
        self.learning_rate = learning_rate

        self.global_step = tf.get_variable("global_step", initializer=tf.constant(0), trainable=False)
        self.dropout_prob = tf.placeholder(tf.float32, name="dropout_prob")
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.LT = tf.get_variable('LT',
                initializer=tf.constant(0.0, shape=[vocab_size, embedding_size]),
                trainable=False)

            self.embedding_placeholder = tf.placeholder(
                tf.float32, [self.vocab_size, self.embedding_size])
            self.embedding_init = self.LT.assign(self.embedding_placeholder)

        with tf.variable_scope('sent'):
            self._create_sharable_weights(sent_filter_sizes, embedding_size,
                                          sent_nb_filter)
            self.sent_embedding_size = tf.convert_to_tensor(
                sent_kmax * sent_nb_filter * len(sent_filter_sizes))

        with tf.variable_scope('doc'):
            self._create_sharable_weights(doc_filter_sizes,
                                          self.sent_embedding_size.eval(),
                                          doc_nb_filter)
            self.doc_embedding_size = tf.convert_to_tensor(
                doc_kmax * doc_nb_filter * len(doc_filter_sizes))

    def inference(self, X):
        """ This is the forward calculation from batch X to doc embeddins """
        
        embedded_words = tf.nn.embedding_lookup(self.LT, X)
        embedded_words_expanded = tf.expand_dims(embedded_words, -1)
        
        with tf.variable_scope('sent'):

            def convolv_on_sents(embeds):
                return self._convolv_on_embeddings(
                    embeds, self.sent_filter_sizes, self.sent_nb_filter,
                    self.sent_kmax)

            # iter over each document
            self.sent_embed = tf.map_fn(
                convolv_on_sents,
                embedded_words_expanded,
                parallel_iterations=10,
                name='iter_over_docs')
            # sent_embed shape is [batch, n_sent, sent_sent_kmax*sent_nb_filter*len(sent_filter_sizes), 1]

        with tf.variable_scope('doc'):
            # finally, convolv on documents
            self.doc_embed = self._convolv_on_embeddings(
                self.sent_embed, self.doc_filter_sizes, self.doc_nb_filter,
                self.doc_kmax)
            # doc_embed shape is [batch, doc_kmax*doc_nb_filter*len(doc_filter_sizes), 1]

        doc_embed_normalized = tf.nn.l2_normalize(
            self.doc_embed, dim=1, name='doc_embed_normalized')

        return doc_embed_normalized

    def loss(self, X):
        with tf.name_scope("loss"):
            doc_embed_normalized = self.inference(X)
            self.anchor, self.positive, self.negative = tf.unstack(
                tf.reshape(doc_embed_normalized, [-1, 3, self.doc_embedding_size]),
                3, 1)
            _loss = triplet_loss(self.anchor, self.positive, self.negative)
        return _loss

    def optimize(self, X):
        with tf.name_scope("optimize"):
            self.loss_op = self.loss(X)
            self.gradients = self.optimizer.compute_gradients(self.loss_op)
            apply_gradient_op = self.optimizer.apply_gradients(
                self.gradients, global_step=self.global_step)
        return apply_gradient_op

    def _convolv_on_embeddings(self, embeds, filter_sizes, nb_filter, kmax):
        """
        Create a convolution + k-max pool layer for each filter size, then concat and vectorize.
        embeds shape is [batch, (n_words or n_sents), embedding_size, 1]
        """
        pooled_outputs = []
        for fsize in filter_sizes:
            with tf.name_scope("conv-%s" % fsize):
                with tf.variable_scope(
                        "conv_weights_fsize-%s" % fsize, reuse=True):
                    weights_init = tf.get_variable('W')
                    bias_init = tf.get_variable('b')
                conv = tf.nn.conv2d(
                    embeds,
                    weights_init,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")

                h = tf.nn.relu(tf.nn.bias_add(conv, bias_init), name="relu")
#                 h shape is [batch, n_words - fsize + 1, 1, nb_filter]
#             tf.summary.histogram("relu", h)

            with tf.name_scope('%s-maxpool-fsize-%s' % (kmax, fsize)):
                # k-maxpooling over the outputs
                trans = tf.transpose(h, perm=[0, 2, 3, 1])
                values, indices = tf.nn.top_k(trans, k=kmax, sorted=False)
                pooled = tf.transpose(values, perm=[0, 3, 1, 2])
                # pooled shape is [batch, kmax, 1, nb_filter]
                pooled_outputs.append(pooled)

        with tf.name_scope('concat_and_vectorize'):
            # Combine all the pooled features
            h_pool = tf.concat(pooled_outputs, 3)
            # h_pool shape is [batch, kmax, 1, nb_filter*len(filter_sizes)]

            # Vectorize filters for each sent to get sent embeddings
            trans = tf.transpose(h_pool, perm=[0, 2, 3, 1])
            batch = tf.shape(embeds)[0]
            sent_embed = tf.reshape(trans, [batch, -1, 1])
            # sent_embed shape is [batch, kmax*nb_filter*len(filter_sizes), 1]

        return sent_embed

    def _create_sharable_weights(self, filter_sizes, embedding_size,
                                 nb_filter):
        """ Create sharable weights for each type of convolution """
        with tf.name_scope('sharable_weights'):
            for fsize in filter_sizes:
                with tf.variable_scope("conv_weights_fsize-%s" % fsize):
                    filter_shape = [fsize, embedding_size, 1, nb_filter]
                    initializer = tf.contrib.layers.xavier_initializer_conv2d(
                        uniform=True)
#                     initializer=tf.truncated_normal(stddev=0.1))
                    weights_init = tf.get_variable(
                        'W', filter_shape, initializer=initializer)
                    bias_init = tf.get_variable(
                        'b', initializer=tf.constant(0.1, shape=[nb_filter]))
        
        
def triplet_loss(anchor_embed,
                 positive_embed,
                 negative_embed,
                 margin=0.2):
    """
    input: Three L2 normalized tensors of shape [None, dim], compute on a batch
    output: float
    """
    with tf.variable_scope('triplet_loss'):
        d_pos = tf.reduce_sum(tf.square(anchor_embed - positive_embed), 1)
        d_neg = tf.reduce_sum(tf.square(anchor_embed - negative_embed), 1)

        loss = tf.maximum(0., margin + d_pos - d_neg)
        loss = tf.reduce_mean(loss)

    return loss
    

In [7]:
triples_all = list(random_triples(sims, ids, num_epochs=1, seed=1))

100%|██████████| 368458/368458 [00:10<00:00, 33884.90it/s]


In [8]:
_triples, triples_test = train_test_split(triples_all, test_size=0.2, random_state=0)
triples_train, triples_val = train_test_split(_triples, test_size=0.3, random_state=0)

In [9]:
print(len(triples_train), len(triples_val), len(triples_test))

(428414, 183606, 153005)


In [107]:
start_time = time.time()

vocab_size, embedding_size = word_embeddings.shape
n_sents, n_words = 123, 40

with tf.Graph().as_default():
    tf.set_random_seed(0)

    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        anchor_batch, positive_batch, negative_batch, fnames = input_pipeline(
            triples_train[:64*100], batch_size=64, num_epochs=1)
        X = tf.reshape(
            tf.transpose([anchor_batch, positive_batch, negative_batch],
                         [1, 0, 2, 3]), [-1, n_sents, n_words],
            name='X')

        with tf.name_scope('init_model'):
            model = TextCNN(
                n_sents,
                n_words,
                vocab_size,
                embedding_size,
                sent_filter_sizes=[3, 4, 5],
                sent_nb_filter=50,
                doc_filter_sizes=[1, 2, 3],
                doc_nb_filter=10,
                sent_kmax=4,
                doc_kmax=10,
                learning_rate=0.001)
        train_op = model.optimize(X)

        init_local = tf.local_variables_initializer()
        init_global = tf.global_variables_initializer()
        sess.run([init_global, init_local])

        print('sent_embedding_size %s' % model.sent_embedding_size.eval())
        print('doc_embedding_size %s' % model.doc_embedding_size.eval())

        saver = tf.train.Saver()

        # ===================summary====================
        tf.summary.scalar("loss", model.loss_op)
        tf.summary.histogram("anchor", model.anchor)
        tf.summary.histogram("positive", model.positive)
        tf.summary.histogram("negative", model.negative)
        # Create summaries to visualize weights
        for var in tf.trainable_variables():
            tf.summary.histogram(var.name.replace(':', '_'), var)
        # Summarize all gradients
        for grad, var in model.gradients:
            tf.summary.histogram(
                var.name.replace(':', '_') + '/gradient', grad)
        merged_summary_op = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(
            join(DATA_FOLDER, 'summary', 'train',
                 str(datetime.datetime.now())), sess.graph)
        # ===================summary====================

        # Assign word embeddings to variable W
        #!!!! index is shifted by 1
        sess.run(
            model.embedding_init,
            feed_dict={model.embedding_placeholder: word_embeddings})

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                _, step, loss, summary, anchor = sess.run([
                    train_op, model.global_step, model.loss_op,
                    merged_summary_op, model.anchor
                ])
                current_step = tf.train.global_step(sess, model.global_step)
                train_writer.add_summary(summary, current_step)
                print(current_step, loss)
        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

    saver.save(
        sess,
        join(DATA_FOLDER, 'models', '%s' % str(datetime.datetime.now())),
        global_step=current_step)

print("--- %s seconds ---" % (time.time() - start_time))

sent_embedding_size 600
doc_embedding_size 300
(1, 0.19604474)


KeyboardInterrupt: 

In [55]:
model_dir = join(DATA_FOLDER, 'models')
!ls {model_dir}

2017-08-25 11:36:11.921470-100.data-00000-of-00001
2017-08-25 11:36:11.921470-100.index
2017-08-25 11:36:11.921470-100.meta
checkpoint


# Testing

https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/4_Utils/save_restore_model.py

In [None]:
vocab_size, embedding_size = word_embeddings.shape
n_sents, n_words = 123, 40

doc_filter_sizes=[1, 2, 3]
doc_nb_filter=10
doc_kmax=10

with tf.Graph().as_default():
    tf.set_random_seed(0)

    session_conf = tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        anchor_batch, positive_batch, negative_batch, fnames_batch = input_pipeline(
            triples_test[:64*20], batch_size=64, num_epochs=1, num_threads=1, shuffle=False)
        X = tf.reshape(
            tf.transpose([anchor_batch, positive_batch, negative_batch],
                         [1, 0, 2, 3]), [-1, n_sents, n_words],
            name='X')
        
        init_local = tf.local_variables_initializer()
        init_global = tf.global_variables_initializer()
        sess.run([init_global, init_local])
        
        # do not restore before global initialization, otherwise all weights are set to default !!!
        saver = tf.train.import_meta_graph(
            join(model_dir, '2017-08-25 11:36:11.921470-100.meta'), input_map={'X':X})
        saver.restore(sess, tf.train.latest_checkpoint(model_dir))
        graph = tf.get_default_graph()
    
        doc_embed_normalized = graph.get_operation_by_name('optimize/loss/doc_embed_normalized').outputs[0]
        
        anchor, positive, negative = tf.unstack(
            tf.reshape(doc_embed_normalized, [-1, 3, doc_kmax * doc_nb_filter * len(doc_filter_sizes)]),
            3, 1)
        _loss = triplet_loss(anchor, positive, negative)
                             
#         pprint([n.name for n in tf.get_default_graph().as_graph_def().node])

        doc_embeds = []

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                [batch_embeds, loss, fnames] = sess.run([doc_embed_normalized, _loss, fnames_batch])
                doc_embeds.append(batch_embeds)
                print(batch_embeds.shape, loss, fnames)
        except tf.errors.OutOfRangeError:
            print('Done testing -- epoch limit reached')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

INFO:tensorflow:Restoring parameters from ../data/models/2017-08-25 11:36:11.921470-100


2017-08-25 13:12:20,761 : INFO : Restoring parameters from ../data/models/2017-08-25 11:36:11.921470-100


((192, 300, 1), 0.03564262, array([['../data/corpus/5984d324b6b1135cdf63851f.txt',
        '../data/corpus/5984c93ab6b11379ac638509.txt',
        '../data/corpus/5984b87ab6b113329b638539.txt'],
       ['../data/corpus/5984d1d0b6b1134725638543.txt',
        '../data/corpus/5984b744b6b113214a638508.txt',
        '../data/corpus/5984def7b6b1133d29638515.txt'],
       ['../data/corpus/5984dd94b6b11335596384f7.txt',
        '../data/corpus/5984d2c1b6b11357c6638526.txt',
        '../data/corpus/5984ce40b6b11328fb63852f.txt'],
       ['../data/corpus/5984ddaeb6b113356f638533.txt',
        '../data/corpus/5984d261b6b113503b63854c.txt',
        '../data/corpus/5984c4f6b6b1134b8663853b.txt'],
       ['../data/corpus/5984c98db6b1137cd6638532.txt',
        '../data/corpus/5984bef6b6b11303f4638511.txt',
        '../data/corpus/5984d55bb6b11371c7638551.txt'],
       ['../data/corpus/5984c113b6b1131e74638520.txt',
        '../data/corpus/5984b83ab6b1132dcc638545.txt',
        '../data/corpus/5984bf20

((192, 300, 1), 0.017135276, array([['../data/corpus/5984bdebb6b113759c638534.txt',
        '../data/corpus/5984b673b6b113137a63853c.txt',
        '../data/corpus/5984d824b6b1130b046384ee.txt'],
       ['../data/corpus/5984d5a7b6b11375a8638535.txt',
        '../data/corpus/5984cbdab6b113178d6384fb.txt',
        '../data/corpus/5984bc20b6b1135fd6638511.txt'],
       ['../data/corpus/5984d068b6b1133d2e638543.txt',
        '../data/corpus/5984bdacb6b1137440638516.txt',
        '../data/corpus/5984cf7ab6b11334a063851c.txt'],
       ['../data/corpus/5984d570b6b11373a7638520.txt',
        '../data/corpus/5984c570b6b11351c2638505.txt',
        '../data/corpus/5984bd1fb6b1136c63638547.txt'],
       ['../data/corpus/5984d66eb6b1137b436384fc.txt',
        '../data/corpus/5984c7f0b6b1136a86638515.txt',
        '../data/corpus/5984d2d2b6b113599f6384fd.txt'],
       ['../data/corpus/5984dcf8b6b113314c638522.txt',
        '../data/corpus/5984c6fbb6b113623d63851f.txt',
        '../data/corpus/5984cd6

((192, 300, 1), 0.013230741, array([['../data/corpus/5984ce62b6b1132a526384f9.txt',
        '../data/corpus/5984c32cb6b113364263851f.txt',
        '../data/corpus/5984caa3b6b1130665638526.txt'],
       ['../data/corpus/5984de7cb6b1133923638547.txt',
        '../data/corpus/5984d647b6b113799b638533.txt',
        '../data/corpus/5984c073b6b11318bf6384f7.txt'],
       ['../data/corpus/5984d585b6b113748e638535.txt',
        '../data/corpus/5984c54cb6b11350886384fe.txt',
        '../data/corpus/5984ca22b6b113032d638509.txt'],
       ['../data/corpus/5984c951b6b1137aaa6384f6.txt',
        '../data/corpus/5984b5cfb6b113074e63853c.txt',
        '../data/corpus/5984b895b6b113354e638515.txt'],
       ['../data/corpus/5984d33fb6b1135e1c6384fc.txt',
        '../data/corpus/5984c64eb6b11358486384fe.txt',
        '../data/corpus/5984c3c1b6b11340196384fa.txt'],
       ['../data/corpus/5984d5b1b6b11376c56384f3.txt',
        '../data/corpus/5984cff9b6b1133add638515.txt',
        '../data/corpus/5984b5a

((192, 300, 1), 0.027117617, array([['../data/corpus/5984db85b6b11321aa63851b.txt',
        '../data/corpus/5984ca1bb6b11303276384f7.txt',
        '../data/corpus/5984c4efb6b1134ab1638537.txt'],
       ['../data/corpus/5984c2eab6b11332b4638550.txt',
        '../data/corpus/5984b8e5b6b113395863852f.txt',
        '../data/corpus/5984c48db6b113465f6384f3.txt'],
       ['../data/corpus/5984bf73b6b11308e66384fb.txt',
        '../data/corpus/5984b654b6b11311b963854a.txt',
        '../data/corpus/5984d0fab6b11341d863851b.txt'],
       ['../data/corpus/5984c3d9b6b11341236384ee.txt',
        '../data/corpus/5984bd6bb6b11371e8638521.txt',
        '../data/corpus/5984b872b6b11331d26384f6.txt'],
       ['../data/corpus/5984c7f6b6b1136a8c638530.txt',
        '../data/corpus/5984b7f4b6b1132a7c638543.txt',
        '../data/corpus/5984d4efb6b113693463854d.txt'],
       ['../data/corpus/5984d6d1b6b1137de763853c.txt',
        '../data/corpus/5984d170b6b113459d638507.txt',
        '../data/corpus/5984d2e

((192, 300, 1), 0.020871785, array([['../data/corpus/5984de69b6b113391563853a.txt',
        '../data/corpus/5984bc20b6b1135fdd638518.txt',
        '../data/corpus/5984cb1eb6b1130d6b638501.txt'],
       ['../data/corpus/5984c283b6b113301e638545.txt',
        '../data/corpus/5984b97bb6b11340306384fd.txt',
        '../data/corpus/5984d88cb6b11311cf6384f7.txt'],
       ['../data/corpus/5984cd68b6b1132354638513.txt',
        '../data/corpus/5984c536b6b1134f61638520.txt',
        '../data/corpus/5984bfb6b6b1130b846384f0.txt'],
       ['../data/corpus/5984cfffb6b1133ac8638539.txt',
        '../data/corpus/5984cba6b6b113156063850e.txt',
        '../data/corpus/5984b882b6b113338263850c.txt'],
       ['../data/corpus/5984bce0b6b113690863853f.txt',
        '../data/corpus/5984b7dcb6b1132964638547.txt',
        '../data/corpus/5984b643b6b1130f256384f1.txt'],
       ['../data/corpus/5984c72fb6b113646c638512.txt',
        '../data/corpus/5984c081b6b11318cc63853f.txt',
        '../data/corpus/5984d01

((192, 300, 1), 0.020573003, array([['../data/corpus/5984d58cb6b113748d638545.txt',
        '../data/corpus/5984c095b6b11319de638518.txt',
        '../data/corpus/5984deb9b6b1133b59638511.txt'],
       ['../data/corpus/5984c710b6b1136349638505.txt',
        '../data/corpus/5984c251b6b1132ec0638533.txt',
        '../data/corpus/5984d0fab6b11341c3638517.txt'],
       ['../data/corpus/5984d016b6b1133c0063852f.txt',
        '../data/corpus/5984c757b6b1136582638544.txt',
        '../data/corpus/5984cad0b6b1130847638538.txt'],
       ['../data/corpus/5984d66cb6b1137b4a6384f3.txt',
        '../data/corpus/5984b8ceb6b113382f63853d.txt',
        '../data/corpus/5984c3e7b6b113412a638530.txt'],
       ['../data/corpus/5984d5a3b6b11375a8638522.txt',
        '../data/corpus/5984ba01b6b11348b8638529.txt',
        '../data/corpus/5984c761b6b11366996384fc.txt'],
       ['../data/corpus/5984c617b6b1135588638534.txt',
        '../data/corpus/5984c075b6b11318b7638510.txt',
        '../data/corpus/5984ddd

((192, 300, 1), 0.02336137, array([['../data/corpus/5984d876b6b11310a163854a.txt',
        '../data/corpus/5984bbd9b6b1135d6f63850f.txt',
        '../data/corpus/5984c08bb6b11319c66384ff.txt'],
       ['../data/corpus/5984cb7ab6b1131334638545.txt',
        '../data/corpus/5984bbb5b6b1135c35638524.txt',
        '../data/corpus/5984baa7b6b1134e8e63854b.txt'],
       ['../data/corpus/5984dcbeb6b1132f13638525.txt',
        '../data/corpus/5984bcd7b6b11368ff63851f.txt',
        '../data/corpus/5984d01cb6b1133bf163850c.txt'],
       ['../data/corpus/5984c3abb6b1133ef4638514.txt',
        '../data/corpus/5984c10db6b1131e5c63850e.txt',
        '../data/corpus/5984cdbfb6b113259f63853c.txt'],
       ['../data/corpus/5984d97bb6b11315fc63854b.txt',
        '../data/corpus/5984d114b6b11342ea6384fa.txt',
        '../data/corpus/5984bda7b6b113743363851b.txt'],
       ['../data/corpus/5984be78b6b1137dae638549.txt',
        '../data/corpus/5984b675b6b113138263854e.txt',
        '../data/corpus/5984c782

((192, 300, 1), 0.023403047, array([['../data/corpus/5984db71b6b11321a26384f5.txt',
        '../data/corpus/5984c51bb6b1134e5f638522.txt',
        '../data/corpus/5984bf31b6b1130671638508.txt'],
       ['../data/corpus/5984c1f9b6b1132b7e63851d.txt',
        '../data/corpus/5984bfc1b6b1130b76638524.txt',
        '../data/corpus/5984d53db6b1136f09638545.txt'],
       ['../data/corpus/5984cff7b6b1133ac0638523.txt',
        '../data/corpus/5984b97bb6b11340306384fd.txt',
        '../data/corpus/5984bd0fb6b1136b646384f6.txt'],
       ['../data/corpus/5984c53eb6b1134f7463853e.txt',
        '../data/corpus/5984bf2ab6b1130663638513.txt',
        '../data/corpus/5984c31fb6b113354263852b.txt'],
       ['../data/corpus/5984c13eb6b1131fc5638509.txt',
        '../data/corpus/5984bb66b6b11359cc6384f3.txt',
        '../data/corpus/5984ca9ab6b113065b638516.txt'],
       ['../data/corpus/5984bd49b6b11370cc6384fa.txt',
        '../data/corpus/5984ba98b6b1134e8a63850f.txt',
        '../data/corpus/5984d43

In [89]:
for i in range(20):
    !cat {triples_test[64*i][0]}|head -n 1


67806 1723 8168 601 318 2252 137 275
3437 4709 45 361 10637 1048 4 60 49
156 1473 59
20 77 2789 2381 87 345
4 8030 2187 2728 2222 2060 988 988 13127 20 8030 2187 91
4 226 13169 9849
2 1296 22 5 5418 222
1094 2032 961 222 20601 109
4761 23844 5457 320
121 712
8061 11896 3570 10160
4 179 5225 4191 1125 44
4 60 5538 3804 5049
610 1004 5 77611 3133
4 60 850 12710
792 4428
1864 225 538 7187 3352
20 380 186 70 573 22124 39
4 2896 118 186 70 573
4 226 850 2537 1769


In [72]:
1200000*1000/(93*60*60)

3584.2293906810037

In [213]:
g = tf.Graph()
with g.as_default():
    tf.set_random_seed(0)
    sess = tf.Session()
    with sess.as_default():

        a = tf.convert_to_tensor([[[1,2],[3,4]],[[5,6],[7,8]],[[9,10],[11,12]]])
        b = tf.convert_to_tensor([[[13,14],[15,16]],[[17,18],[19,20]],[[21,22],[23,24]]])
        c = tf.convert_to_tensor([[[25,26],[27,28]],[[29,30],[31,32]],[[33,34],[35,36]]])
        
        d = tf.reshape(tf.transpose([a,b,c], [1,0,2,3]), [-1,2,2])
        
        sess.run(tf.global_variables_initializer())
        
        [res] = sess.run([d])
        print(res)

#         train_writer = tf.summary.FileWriter('../data/summary', sess.graph)

[[[ 1  2]
  [ 3  4]]

 [[13 14]
  [15 16]]

 [[25 26]
  [27 28]]

 [[ 5  6]
  [ 7  8]]

 [[17 18]
  [19 20]]

 [[29 30]
  [31 32]]

 [[ 9 10]
  [11 12]]

 [[21 22]
  [23 24]]

 [[33 34]
  [35 36]]]
