In [2]:
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
import pandas as pd
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np

## Sequence labelling model

In [1]:
MAX_SEQ_LEN = 64
t2i_dict = {'c': 1, 'o': 2, 'b': 3, 'a': 4}

In [15]:
def word2ind_with_w2v(fname, word_set):
  sentences = []
  tags = []
  with open(fname, 'r', encoding='utf8') as fin:
    content = fin.read().replace('\n\n', '')
    content_list = content.split('\n')
    for line in content_list:
      line_word_seg = []
      line_tag_seg = []
      for phrase in line.split(' '):
        if len(phrase) < 3: continue
        words, tag = phrase.split('/')
        words = words.split('_')
        tag = [t2i_dict[tag]] * len(words)
        words = [word_set.index(word) for word in words]
        line_word_seg.extend(words)
        line_tag_seg.extend(tag)
      if len(line_word_seg) < MAX_SEQ_LEN:
        line_word_seg.extend([0] * (MAX_SEQ_LEN - len(line_word_seg)))
        line_tag_seg.extend([0] * (MAX_SEQ_LEN - len(line_tag_seg)))
      elif len(line_word_seg) > MAX_SEQ_LEN:
        line_word_seg = line_word_seg[: MAX_SEQ_LEN]
        line_tag_seg = line_tag_seg[: MAX_SEQ_LEN]
      sentences.append(line_word_seg)
      tags.append(line_tag_seg)
  return sentences, tags

In [16]:
def get_t2i_map(fname, preserve_zero=True):
  word_list = []

  with open(fname, 'r', encoding='utf8') as fin:
    content = fin.read().replace('\n\n', '')
    content_list = content.split('\n')
    for line in content_list:
      for phrase in line.split(' '):
        if len(phrase) < 3: continue
        words, tag = phrase.split('/')
        words = words.split('_')
        word_list.extend(words)
  word_set = set(word_list)
  word_set = sorted(word_set)

  if preserve_zero:
    word_set.insert(0, '<UNK>')

  return word_set

In [30]:
# word_set = get_t2i_map('/home/lian/data/nlp/datagrand_info_extra/train.txt')
len(word_set)

4549

In [18]:
sentences, tags = word2ind_with_w2v('/home/lian/data/nlp/datagrand_info_extra/train.txt', word_set)

In [20]:
import numpy as np
sentences = np.array(sentences)
tags = np.array(tags)

In [21]:
sentences.shape, tags.shape

((17000, 64), (17000, 64))

In [89]:
tf.reset_default_graph()

In [90]:
def data_iterate(X, y, batch_size):
  """

  Args:
    X:
    y:
    batch_size:

  Returns:

  """
  # Split dataset to train and test.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
  train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
  train_dataset = train_dataset.batch(batch_size)

  dev_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
  dev_dataset = dev_dataset.batch(batch_size)

  test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
  test_dataset = test_dataset.batch(batch_size)

  # A reinitializable iterator
  iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)

  train_initializer = iterator.make_initializer(train_dataset)
  dev_initializer = iterator.make_initializer(dev_dataset)
  test_initializer = iterator.make_initializer(test_dataset)

  return train_initializer, dev_initializer, test_initializer, iterator

In [91]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

train_initializer, dev_initializer, test_initializer, iterator = data_iterate(sentences, tags, 10)

In [92]:
x, y = iterator.get_next()

In [93]:
embedding = tf.Variable(tf.random_normal([4550, 20]), dtype=tf.float32)
inputs = tf.nn.embedding_lookup(embedding, x + 1)

In [94]:
from tensorflow.contrib import rnn
def lstm_cell(rnn_units, keep_prob, reuse=False):
  if reuse:
    tf.get_variable_scope.reuse_variables()
  cell = rnn.LSTMCell(rnn_units)
  return rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

In [95]:
cell_fw = [lstm_cell(128, 1) for _ in range(1)]
cell_bw = [lstm_cell(128, 1) for _ in range(1)]
inputs = tf.unstack(inputs, 64, axis=1)
output, _, _ = tf.contrib.rnn.stack_bidirectional_rnn(cell_fw, cell_bw, inputs=inputs, dtype=tf.float32)

In [96]:
output = tf.stack(output, axis=1)
print(output.shape)
output = tf.reshape(output, [-1, 128 * 2])

(?, 64, 256)


In [97]:
tf.nn.embedding_lookup(embedding, x + 1).shape, output.shape

(TensorShape([Dimension(None), Dimension(64), Dimension(20)]),
 TensorShape([Dimension(None), Dimension(256)]))

In [98]:
len(tf.unstack(tf.nn.embedding_lookup(embedding, x + 1), axis=1)), output.shape

(64, TensorShape([Dimension(None), Dimension(256)]))

In [99]:
logits = tf.keras.layers.Dense(5)(output)
y_predict = tf.cast(tf.argmax(logits, axis=1), tf.int32)
print('Output Y', y_predict)

Output Y Tensor("Cast:0", shape=(?,), dtype=int32)


In [100]:
# Reshape y_label
y_label_reshape = tf.cast(tf.reshape(y, [-1]), tf.int32)
# Prediction
correct_prediction = tf.equal(y_predict, y_label_reshape)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Loss
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y_label_reshape, logits=tf.cast(logits, tf.float32)))

# Train
train = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

In [105]:
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for epoch in range(100):
    #     tf.train.global_step(sess, global_step_tensor=global_step)
        # Train
        sess.run(train_initializer)
        for step in range(200):
            loss, acc, _ = sess.run([cross_entropy, accuracy, train])
            # Print log
            if step % 10 == 0:
                print('Step', step, 'Train Loss', loss, 'Accuracy', acc)

        if epoch % 2 == 0:
            # Dev
            sess.run(dev_initializer)
            for step in range(5):
                if step % 20 == 0:
                    print('Dev Accuracy', sess.run(accuracy), 'Step', step)

Step 0 Train Loss 1.6643097 Accuracy 0.125
Step 10 Train Loss 0.65001404 Accuracy 0.8859375
Step 20 Train Loss 0.29254133 Accuracy 0.94375
Step 30 Train Loss 0.5746679 Accuracy 0.8515625
Step 40 Train Loss 0.41754866 Accuracy 0.8921875
Step 50 Train Loss 0.24996157 Accuracy 0.94375
Step 60 Train Loss 0.2578134 Accuracy 0.9375
Step 70 Train Loss 0.4252585 Accuracy 0.8859375
Step 80 Train Loss 0.48681563 Accuracy 0.884375
Step 90 Train Loss 0.35601166 Accuracy 0.9078125
Step 100 Train Loss 0.3305379 Accuracy 0.921875
Step 110 Train Loss 0.16293706 Accuracy 0.96875
Step 120 Train Loss 0.39690804 Accuracy 0.88125
Step 130 Train Loss 0.3899981 Accuracy 0.9078125
Step 140 Train Loss 0.37138325 Accuracy 0.9203125
Step 150 Train Loss 0.22349629 Accuracy 0.9578125
Step 160 Train Loss 0.26988006 Accuracy 0.9296875
Step 170 Train Loss 0.3218057 Accuracy 0.9125
Step 180 Train Loss 0.3206797 Accuracy 0.925
Step 190 Train Loss 0.3426709 Accuracy 0.8953125
Dev Accuracy 0.9375 Step 0
Step 0 Train Loss

Step 80 Train Loss 0.12594815 Accuracy 0.95
Step 90 Train Loss 0.1506597 Accuracy 0.9484375
Step 100 Train Loss 0.09384526 Accuracy 0.9640625
Step 110 Train Loss 0.029721165 Accuracy 0.990625
Step 120 Train Loss 0.18913771 Accuracy 0.940625
Step 130 Train Loss 0.12572488 Accuracy 0.959375
Step 140 Train Loss 0.12509416 Accuracy 0.9703125
Step 150 Train Loss 0.10286264 Accuracy 0.9671875
Step 160 Train Loss 0.08271749 Accuracy 0.98125
Step 170 Train Loss 0.20891432 Accuracy 0.9328125
Step 180 Train Loss 0.08236475 Accuracy 0.9734375
Step 190 Train Loss 0.14337416 Accuracy 0.94375
Dev Accuracy 0.946875 Step 0
Step 0 Train Loss 0.23589773 Accuracy 0.9171875
Step 10 Train Loss 0.118662655 Accuracy 0.9546875
Step 20 Train Loss 0.10347664 Accuracy 0.965625
Step 30 Train Loss 0.096301824 Accuracy 0.975
Step 40 Train Loss 0.09333758 Accuracy 0.9703125
Step 50 Train Loss 0.06422241 Accuracy 0.9765625
Step 60 Train Loss 0.08218221 Accuracy 0.96875
Step 70 Train Loss 0.07707981 Accuracy 0.9703125

Step 140 Train Loss 0.040335074 Accuracy 0.9859375
Step 150 Train Loss 0.022399431 Accuracy 0.9890625
Step 160 Train Loss 0.021328378 Accuracy 0.99375
Step 170 Train Loss 0.045651875 Accuracy 0.990625
Step 180 Train Loss 0.01245164 Accuracy 0.996875
Step 190 Train Loss 0.030835137 Accuracy 0.99375
Dev Accuracy 0.9484375 Step 0
Step 0 Train Loss 0.027434051 Accuracy 0.9921875
Step 10 Train Loss 0.019144727 Accuracy 0.9953125
Step 20 Train Loss 0.011485824 Accuracy 0.996875
Step 30 Train Loss 0.012901898 Accuracy 0.9984375
Step 40 Train Loss 0.018540863 Accuracy 0.99375
Step 50 Train Loss 0.013585779 Accuracy 0.9984375
Step 60 Train Loss 0.016502064 Accuracy 0.9953125
Step 70 Train Loss 0.014553184 Accuracy 0.996875
Step 80 Train Loss 0.012259731 Accuracy 0.996875
Step 90 Train Loss 0.012450235 Accuracy 0.996875
Step 100 Train Loss 0.009749418 Accuracy 1.0
Step 110 Train Loss 0.004197526 Accuracy 1.0
Step 120 Train Loss 0.030114124 Accuracy 0.990625
Step 130 Train Loss 0.014885353 Accura

Step 10 Train Loss 0.0021415534 Accuracy 1.0
Step 20 Train Loss 0.0033090226 Accuracy 1.0
Step 30 Train Loss 0.002021383 Accuracy 1.0
Step 40 Train Loss 0.002545141 Accuracy 1.0
Step 50 Train Loss 0.0027055652 Accuracy 1.0
Step 60 Train Loss 0.0039074626 Accuracy 0.9984375
Step 70 Train Loss 0.0046749664 Accuracy 0.9984375
Step 80 Train Loss 0.001438461 Accuracy 1.0
Step 90 Train Loss 0.0032866125 Accuracy 0.9984375
Step 100 Train Loss 0.0017843582 Accuracy 1.0
Step 110 Train Loss 0.0015407784 Accuracy 1.0
Step 120 Train Loss 0.017012872 Accuracy 0.996875
Step 130 Train Loss 0.0059601655 Accuracy 0.996875
Step 140 Train Loss 0.0040011858 Accuracy 1.0
Step 150 Train Loss 0.002903359 Accuracy 1.0
Step 160 Train Loss 0.005173605 Accuracy 0.9984375
Step 170 Train Loss 0.0053299004 Accuracy 1.0
Step 180 Train Loss 0.0018728409 Accuracy 1.0
Step 190 Train Loss 0.0028413183 Accuracy 1.0
Step 0 Train Loss 0.0022313078 Accuracy 1.0
Step 10 Train Loss 0.0030688911 Accuracy 1.0
Step 20 Train Loss

Step 150 Train Loss 0.0003761794 Accuracy 1.0
Step 160 Train Loss 0.0009279965 Accuracy 1.0
Step 170 Train Loss 0.0020385007 Accuracy 1.0
Step 180 Train Loss 0.00087944773 Accuracy 1.0
Step 190 Train Loss 0.002641727 Accuracy 1.0
Step 0 Train Loss 0.0014524559 Accuracy 1.0
Step 10 Train Loss 0.0015452623 Accuracy 1.0
Step 20 Train Loss 0.00061481656 Accuracy 1.0
Step 30 Train Loss 0.0020265107 Accuracy 0.9984375
Step 40 Train Loss 0.0016148525 Accuracy 1.0
Step 50 Train Loss 0.016218962 Accuracy 0.996875
Step 60 Train Loss 0.0007403589 Accuracy 1.0
Step 70 Train Loss 0.005004911 Accuracy 0.9984375
Step 80 Train Loss 0.0014584771 Accuracy 1.0
Step 90 Train Loss 0.00064769865 Accuracy 1.0
Step 100 Train Loss 0.001729558 Accuracy 1.0
Step 110 Train Loss 0.0006771806 Accuracy 1.0
Step 120 Train Loss 0.0018077346 Accuracy 1.0
Step 130 Train Loss 0.0006931002 Accuracy 1.0
Step 140 Train Loss 0.0013322764 Accuracy 1.0
Step 150 Train Loss 0.0007405207 Accuracy 1.0
Step 160 Train Loss 0.0021254

Step 100 Train Loss 0.00018579871 Accuracy 1.0
Step 110 Train Loss 0.000109593544 Accuracy 1.0
Step 120 Train Loss 0.00033383397 Accuracy 1.0
Step 130 Train Loss 0.00025307358 Accuracy 1.0
Step 140 Train Loss 0.00030588172 Accuracy 1.0
Step 150 Train Loss 0.00012097899 Accuracy 1.0
Step 160 Train Loss 0.00029430483 Accuracy 1.0
Step 170 Train Loss 0.0005548973 Accuracy 1.0
Step 180 Train Loss 0.0002089742 Accuracy 1.0
Step 190 Train Loss 0.0005178702 Accuracy 1.0
Dev Accuracy 0.95 Step 0
Step 0 Train Loss 0.0003285006 Accuracy 1.0
Step 10 Train Loss 0.00021159454 Accuracy 1.0
Step 20 Train Loss 9.137754e-05 Accuracy 1.0
Step 30 Train Loss 0.000527433 Accuracy 1.0
Step 40 Train Loss 0.00039825082 Accuracy 1.0
Step 50 Train Loss 0.00029538907 Accuracy 1.0
Step 60 Train Loss 0.00018618119 Accuracy 1.0
Step 70 Train Loss 0.00023587728 Accuracy 1.0
Step 80 Train Loss 0.00022647553 Accuracy 1.0
Step 90 Train Loss 0.00017450459 Accuracy 1.0
Step 100 Train Loss 0.00014615896 Accuracy 1.0
Step 

KeyboardInterrupt: 