In [1]:
from common import *

Using TensorFlow backend.


In [2]:
# Document listing produced by get_all_docs for the data folder
# (helper presumably provided by `common` — TODO confirm what it returns).
all_docs = get_all_docs(DATA_FOLDER)
# Validation text files, sorted by path.
val_docs = sorted(glob(join(DATA_FOLDER, 'validate/*.txt')))
# Gzipped document archives, ordered with the natural_keys sort key.
# Built with join() like the pattern above instead of manual '/' concatenation.
ziped_files = sorted(glob(join(DATA_FOLDER, 'documents/*.gz')), key=natural_keys)

In [3]:
# Load the saved Word2Vec model from the data folder
# (the file name suggests 300-dimensional vectors and window 10 — TODO confirm).
w2v_model = gensim.models.Word2Vec.load(join(DATA_FOLDER, 'vectors/w2v_model_300_w10'))
# Keep the model's `wv` attribute; later cells use it for vocabulary lookups and `syn0`.
wv = w2v_model.wv

2017-08-03 00:07:10,200 : INFO : loading Word2Vec object from ../data/vectors/w2v_model_300_w10
2017-08-03 00:07:12,369 : INFO : loading wv recursively from ../data/vectors/w2v_model_300_w10.wv.* with mmap=None
2017-08-03 00:07:12,370 : INFO : loading syn0 from ../data/vectors/w2v_model_300_w10.wv.syn0.npy with mmap=None
2017-08-03 00:07:12,532 : INFO : setting ignored attribute syn0norm to None
2017-08-03 00:07:12,533 : INFO : loading syn1neg from ../data/vectors/w2v_model_300_w10.syn1neg.npy with mmap=None
2017-08-03 00:07:12,685 : INFO : setting ignored attribute cum_table to None
2017-08-03 00:07:12,686 : INFO : loaded ../data/vectors/w2v_model_300_w10


# Save the corpus as a single file

In [4]:
def save_corpus(fname, ziped_files, wv):
    """
    Write every document found in the gzipped JSON archives to one text file.

    fname - path of the plain-text output file (overwritten)
    ziped_files - paths of gzip archives; each holds a JSON list of documents,
                  a document being a list of sentences and a sentence a list of words
    wv - word vectors; words are kept only if `w in wv` and are written as
         `wv.vocab[w].index`

    One line is written per document: sentences become space-separated indices,
    sentences left with no known word are dropped, and the rest are joined with ' . '.
    """
    with open(fname, 'w') as out:
        for archive_path in tqdm(ziped_files):
            with GzipFile(archive_path, 'r') as archive:
                documents = json.loads(archive.read())
            for document in documents:
                encoded_sentences = []
                for sentence in document:
                    indices = ' '.join(
                        str(wv.vocab[word].index) for word in sentence if word in wv)
                    # Skip sentences in which no word is part of the vocabulary.
                    if indices != '':
                        encoded_sentences.append(indices)
                out.write(' . '.join(encoded_sentences) + '\n')

In [5]:
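# NOTE: save_corpus writes plain text, while the reader cells below open
# "corpus.txt.gz" with GzipFile, so the output has to be gzip-compressed before it is read.
# corpus_file = join(DATA_FOLDER, "corpus.txt")
# save_corpus(corpus_file, ziped_files, wv)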

# Iterate over corpus efficiently

In [9]:
def iterate_docs(corpus_file, wv, batch_size=100, chunk_size=10):
    """
    Lazily yield the documents of a gzip-compressed corpus file, one per line.

    Every line is one document: sentences are separated by '.', and each
    sentence is a whitespace-separated list of integer row indices into `wv.syn0`.

    corpus_file - path to the gzip-compressed corpus
    wv - object whose `syn0` attribute is the embedding matrix
    batch_size, chunk_size - the file is read `batch_size * chunk_size` lines at a time

    Yields, for each line, a list holding one `wv.syn0[indices]` array per sentence
    (a sentence without indices gives an array with zero rows).
    """
    lines_per_read = batch_size * chunk_size
    with GzipFile(corpus_file, 'r') as f:
        while True:
            chunk_lines = list(islice(f, lines_per_read))
            if not chunk_lines:
                return

            # Walk the raw byte lines directly. Routing them through np.array_split
            # copied the whole chunk into a fixed-width bytes array (every line padded
            # to the longest one) without changing what is yielded.
            for line in chunk_lines:
                doc = line.decode('utf-8').split('.')
                yield [wv.syn0[[int(ix) for ix in sent.split()]] for sent in doc]

In [7]:
def gen_batches(corpus_file, wv, batch_size):
    """
    Group the documents produced by iterate_docs into tuples of `batch_size`.

    The last tuple may be shorter; iteration stops once no documents remain.
    """
    docs = iterate_docs(corpus_file, wv, batch_size)
    # Two-argument iter(): keep slicing until the empty tuple signals exhaustion.
    for batch in iter(lambda: tuple(islice(docs, batch_size)), ()):
        yield batch

In [10]:
# Number of documents per tuple produced by gen_batches.
batch_size = 100

# Gzip-compressed corpus file (iterate_docs opens it with GzipFile).
corpus_file = join(DATA_FOLDER, "corpus.txt.gz")

gen = gen_batches(corpus_file, wv, batch_size)

# Smoke test: drain the generator once, counting the documents it yields.
# islice(gen, len(all_docs) - 10, None)
tot = 0
for batch in gen:
    tot += len(batch)
    # Report the running document count at every multiple of 3000.
    if tot % 3000 == 0:
        print(tot)

3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000
36000
39000
42000
45000
48000
51000
54000
57000
60000
63000
66000
69000
72000
75000
78000
81000
84000
87000
90000
93000
96000
99000
102000
105000
108000
111000
114000
117000
120000
123000
126000
129000
132000
135000
138000
141000
144000
147000
150000
153000
156000
159000
162000
165000
168000
171000
174000
177000
180000
183000
186000
189000
192000
195000
198000
201000
204000
207000
210000
213000
216000
219000
222000
225000
228000
231000
234000
237000
240000
243000
246000
249000
252000
255000
258000
261000
264000
267000
270000
273000
276000
279000
282000
285000
288000
291000
294000
297000
300000
303000
306000
309000
312000
315000
318000
321000
324000
327000
330000
333000
336000
339000
342000
345000
348000
351000
354000
357000
360000
363000
366000
369000
372000
375000
378000
381000
384000
387000
390000
393000
396000
399000
402000
405000
408000
411000
414000
417000
420000
423000
426000
429000
432000
435000
438000
441000
444000


In [30]:
from common import *

from pymongo import MongoClient
import pymongo
from bson.objectid import ObjectId

In [32]:
# Connect using MongoClient's defaults (no host or port is given here).
client = MongoClient()
# Handle to the `fips` database.
db = client.fips

In [41]:
# Query the patents that have a `similar` field, projecting only that field.
cursor = db.patents.find({'similar': {'$exists': True}}, {'similar': 1})
# Pair each patent id (as a string) with its stored `similar` value.
similar = [(str(doc['_id']), doc['similar']) for doc in cursor]

In [42]:
# Number of patents returned by the `similar`-exists query above.
len(similar)

368458

In [43]:
# Peek at the first ten (patent id, similar) pairs.
similar[:10]

[('5984b7c2b6b1132856638528', ['5984b65cb6b1131291638512']),
 ('5984b980b6b113401d638515', ['5984b65cb6b113129b638521']),
 ('5984b9f9b6b11348ca63850f', ['5984b71fb6b1131bef638550']),
 ('5984b9fab6b11348c163851e', ['5984b65ab6b113129b638508']),
 ('5984ba05b6b11348c1638540', ['5984b680b6b1131490638515']),
 ('5984ba11b6b11349ec6384f0',
  ['5984b688b6b113147e638533',
   '5984b5dbb6b1130839638502',
   '5984b832b6b1132dc8638509']),
 ('5984ba15b6b11349ee638507', ['5984b78bb6b113254c63852e']),
 ('5984ba15b6b11349ee638508', ['5984b584b6b11303c1638551']),
 ('5984ba17b6b11349f563850c', ['5984b787b6b113256063850b']),
 ('5984ba17b6b11349f563850d', ['5984b5ccb6b113073663853e'])]

# CNN

In [45]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses a frozen embedding layer that is filled from outside the graph, followed by
    parallel convolutions, max-over-time pooling, dropout and a softmax output layer.

    Parameters
    ----------
    vocab_size, embedding_size : shape of the embedding matrix `W`.
    filter_sizes : iterable of convolution window heights (in tokens).
    num_filters : number of filters per window height.
    l2_reg_lambda : weight of the L2 penalty on the output layer.
    num_classes : width of the label rows and of the score vector (default 2).

    Graph handles
    -------------
    input_x : int32 placeholder [batch, sequence] of word indices. The sequence
        length is dynamic; it must be at least max(filter_sizes) because the
        convolutions use VALID padding.
    input_y : float32 placeholder [batch, num_classes]; its argmax is the true class.
    dropout_keep_prob : float32 placeholder passed to tf.nn.dropout as keep probability.
    embedding_placeholder, embedding_init : run `embedding_init` once, feeding the
        embedding matrix through `embedding_placeholder`, to load `W`.
    scores, predictions, loss, accuracy : model outputs.

    `X`, `y` and `dropout_prob` are kept as aliases of the three input placeholders.
    """
    def __init__(self, vocab_size, embedding_size,
                 filter_sizes, num_filters, l2_reg_lambda=0.0, num_classes=2):

        # Embedding matrix: non-trainable and zero until `embedding_init` is run.
        self.W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_size]),
                             trainable=False, name="W")

        self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
        self.embedding_init = self.W.assign(self.embedding_placeholder)

        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, None], name="X")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_prob")

        # Aliases under the attribute names this class exposed before.
        self.X = self.input_x
        self.y = self.input_y
        self.dropout_prob = self.dropout_keep_prob

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        # Embedding layer: look up in the variable that `embedding_init` fills,
        # not in a second, randomly initialised one.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max over the whole time axis. Unlike a max_pool window of
                # sequence_length - filter_size + 1, this needs no static sequence
                # length; the result keeps the [batch, 1, 1, num_filters] layout.
                pooled = tf.expand_dims(tf.reduce_max(h, axis=1, name="pool"), 1)
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [51]:
# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")

# Model Hyperparameters
# The embedding width is the number of columns of the word-vector matrix;
# `wv.syn0[1]` would pass a whole embedding row as the integer default.
tf.flags.DEFINE_integer("embedding_dim", wv.syn0.shape[1], "Dimensionality of word embedding")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 10, "Number of training epochs (default: 10)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# NOTE(review): `_parse_flags` and `__flags` are private members of the flags object;
# they presumably depend on the installed TensorFlow version — confirm before upgrading.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================


# train/dev split here


# Training
# ==================================================

# NOTE(review): x_train, y_train, x_dev, y_dev, vocab_processor and data_helpers are not
# defined in the visible part of this notebook; they presumably come from the pending
# train/dev split or from `common` — confirm before running this cell.
with tf.Graph().as_default():
    # If you would like TensorFlow to automatically choose an existing and supported device to
    # run the operations in case the specified one doesn't exist, you can set allow_soft_placement to True

    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            num_classes=y_train.shape[1],
            # `wv.syn0` is fed into the [vocab_size, embedding_size] placeholder below,
            # so the vocabulary size has to be its number of rows.
            vocab_size=wv.syn0.shape[0],
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Assign word embeddings to variable
        sess.run(cnn.embedding_init, feed_dict={cnn.embedding_placeholder: wv.syn0})

        def train_step(x_batch, y_batch):
            """
            A single training step: runs the optimizer on one batch with dropout
            enabled, prints loss/accuracy and records the train summaries.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set: no optimizer step, keep probability 1.0;
            summaries are written only when a writer is given.
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = data_helpers.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

<tf.Tensor 'input_x_5:0' shape=(?, 100, 300) dtype=int32>

In [52]:
# Lists the variables of the current default graph.
# NOTE(review): the model above is built inside its own `tf.Graph().as_default()` context,
# which presumably explains an empty result here — confirm.
tf.global_variables()

[]