In [1]:
import codecs
import numpy as np
import tensorflow as tf
from konlpy.tag import Twitter
from gensim.models import doc2vec

  from ._conv import register_converters as _register_converters


In [2]:
# Shared POS tagger instance; `tagger` calls twitter.pos() on every sentence.
# NOTE(review): newer KoNLPy releases appear to deprecate `Twitter` in favour
# of `Okt` — confirm against the installed version before upgrading.
twitter = Twitter()

In [3]:
# Load the pre-trained Doc2Vec model from disk. Only its word vectors
# (model.wv) and model.vector_size are used by the cells in this notebook.
model = doc2vec.Doc2Vec.load('../embed/doc2vec_model/doc2vec_twitter_kowiki_300000_docs.model')

In [4]:
# Every sentence matrix is truncated or zero-padded to this many rows
# (passed to `padding`) and it fixes the time dimension of plc_embed.
max_sent_len = 30
# Number of entries in the model's word-vector vocabulary.
vocab_size = len(model.wv.vocab)
# Width of a single word vector; used as the last dimension of plc_embed.
vector_size = model.vector_size

In [5]:
def tagger(sent):
    """Tokenize `sent` with the module-level `twitter` tagger.

    Args:
        sent: text to analyse.

    Returns:
        list: the word of every (word, tag) pair produced by
        `twitter.pos(sent)`, in order, skipping pairs tagged
        'Punctuation' or 'Unknown'.
    """
    excluded_tags = ('Punctuation', 'Unknown')
    kept = []
    for token, pos_tag in twitter.pos(sent):
        if pos_tag in excluded_tags:
            continue
        kept.append(token)
    return kept

In [6]:
def vectorize(model):
    """Build a sentence -> word-vector-matrix function bound to `model`.

    Args:
        model: object exposing `wv` (indexable by a list of words and
            carrying a `vocab` container) and `vector_size`.

    Returns:
        A function `vec(sent)` that tokenizes `sent` with `tagger`, drops
        tokens that are not in `model.wv.vocab`, and returns one row per
        remaining token. A sentence without any known token yields an
        array of shape (0, model.vector_size) instead of raising.
    """
    def vec(sent):
        # Materialise the tokens: a concrete list can be tested for
        # emptiness and does not depend on the lookup accepting iterators.
        known = [word for word in tagger(sent) if word in model.wv.vocab]
        if not known:
            # An empty lookup leaves the vector stacker with nothing to
            # stack, so short-circuit with an empty matrix that `padding`
            # can still extend. Assumes float32 embeddings (gensim's
            # storage type) — TODO confirm.
            return np.zeros((0, model.vector_size), dtype=np.float32)
        return model.wv[known]

    return vec

vec = vectorize(model)

In [7]:
def padding(max_len):
    """Build a function that forces a 2-D array to exactly `max_len` rows.

    Args:
        max_len: number of rows every returned array must have.

    Returns:
        A function `pad(sent)` taking a (rows, dim) NumPy array. Longer
        inputs are truncated to their first `max_len` rows; shorter (or
        equal-length) inputs are extended with zero rows at the bottom.
        The result keeps the dtype of `sent` in both cases.
    """
    def pad(sent):
        n_rows = len(sent)
        if n_rows > max_len:
            return sent[:max_len]

        # Build the filler with the input's dtype: a default float64 filler
        # would make vstack upcast float32 embeddings, so padded sentences
        # would differ in dtype (and size) from truncated ones.
        filler = np.zeros((max_len - n_rows, sent.shape[1]), dtype=sent.dtype)
        return np.vstack((sent, filler))

    return pad

In [8]:
# Read the training sentences (one per line, UTF-8), embed each line with
# `vec`, then fix its length to max_sent_len rows.
with codecs.open('../sample_data/movie_review/train/train_data', encoding='utf-8') as f:
    fit_length = padding(max_sent_len)
    data = [fit_length(vec(line)) for line in f.readlines()]

In [9]:
# Read the training labels: one integer per line.
with open('../sample_data/movie_review/train/train_label') as f:
    label = [int(line) for line in f.readlines()]

In [10]:
# Graph inputs.
# One padded sentence matrix per example: (batch, max_sent_len, vector_size).
plc_embed = tf.placeholder(tf.float32, (None, max_sent_len, vector_size))
# One scalar target per example. `(None)` is plain None (no tuple), which
# leaves the rank unknown; the 1-tuple pins the placeholder to a 1-D batch.
plc_label = tf.placeholder(tf.float32, (None,))
# Switches batch normalization between training and inference behaviour.
training = tf.placeholder(tf.bool)
# Keep probability for the dropout layers in the prenet.
keep_prob = tf.placeholder(tf.float32)

In [11]:
# Units of the first prenet dense layer; the layers after it use
# embed_size // 2 units/filters.
embed_size = 128
# Largest kernel width in the convolution bank (widths 1..block_num).
block_num = 16
# Number of highway layers stacked after the residual connection.
highway_num = 4

In [12]:
# Prenet: two dense layers, each followed by dropout. The first is a ReLU
# layer of embed_size units, the second a sigmoid layer of half that width.
prenet_hidden = tf.nn.dropout(
    tf.layers.dense(plc_embed, units=embed_size, activation=tf.nn.relu),
    keep_prob=keep_prob,
)

prenet = tf.nn.dropout(
    tf.layers.dense(prenet_hidden, units=embed_size // 2, activation=tf.nn.sigmoid),
    keep_prob=keep_prob,
)

In [13]:
# Convolution bank: one 1-D convolution per kernel width 1..block_num, all
# reading the prenet output, stacked along the channel axis.
# `block_num` comes from the hyperparameter cell; it is deliberately not
# redefined here so that a change there is not silently overridden.
conv_bank = [
    tf.layers.conv1d(prenet, embed_size // 2, width, 1, padding='SAME')
    for width in range(1, block_num + 1)
]
# Concatenate once instead of growing the tensor pairwise inside the loop.
output = tf.concat(conv_bank, axis=-1)

output = tf.layers.batch_normalization(output, training=training)
output = tf.nn.relu(output)

In [14]:
# Max-pool the conv-bank output along time with window 2 and stride 1;
# 'SAME' padding keeps the sequence length unchanged.
pool = tf.layers.max_pooling1d(output, pool_size=2, strides=1, padding='SAME')

In [15]:
# Projection convolutions (kernel width 3) applied after the max-pool stage.
# The first projection reads `pool`: it previously read `output`, which left
# the pooled tensor unused and bypassed the pooling step entirely.
conv1 = tf.layers.conv1d(pool, embed_size // 2, 3, 1, padding='SAME')
conv1 = tf.layers.batch_normalization(conv1, training=training)
conv1 = tf.nn.relu(conv1)

# Second projection stays linear (batch norm, no activation); it is added to
# the prenet output as a residual in the highway cell.
conv2 = tf.layers.conv1d(conv1, embed_size // 2, 3, 1, padding='SAME')
conv2 = tf.layers.batch_normalization(conv2, training=training)

In [16]:
# Residual connection around the conv stack, followed by highway_num
# highway layers: each mixes a ReLU transform with the carried input using
# a sigmoid gate.
highway = conv2 + prenet
for layer_idx in range(highway_num):
    transform = tf.layers.dense(highway, units=embed_size // 2, activation=tf.nn.relu)
    gate = tf.layers.dense(highway, units=embed_size // 2, activation=tf.nn.sigmoid)
    highway = transform * gate + highway * (1.0 - gate)

In [17]:
# Bidirectional GRU over the highway output. The RNN is run time-major, so
# the input is transposed to (time, batch, channels) and the concatenated
# forward/backward outputs are transposed back to batch-major afterwards.
fw_cell = tf.nn.rnn_cell.GRUCell(embed_size // 2)
bw_cell = tf.nn.rnn_cell.GRUCell(embed_size // 2)
time_major_inputs = tf.transpose(highway, (1, 0, 2))
o, s = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=fw_cell,
    cell_bw=bw_cell,
    inputs=time_major_inputs,
    dtype=tf.float32,
    time_major=True,
)
o = tf.transpose(tf.concat(o, axis=-1), (1, 0, 2))

In [18]:
# Attention over time steps: a dense layer maps every step to max_sent_len
# scores, scaled by sqrt(max_sent_len); the softmax of those scores weights
# the GRU outputs.
attn_scale = max_sent_len ** 0.5
attn_scores = tf.layers.dense(o, units=max_sent_len) / attn_scale
attn_weights = tf.nn.softmax(attn_scores)
attn = tf.matmul(attn_weights, o)

In [21]:
# Collapse the channels to one value per time step with a width-1
# convolution, then average over the max_sent_len steps to get one
# prediction per example.
conv = tf.layers.conv1d(attn, filters=1, kernel_size=1, strides=1, padding='SAME')
per_step = tf.reshape(conv, (-1, max_sent_len))
result = tf.reduce_mean(per_step, axis=1)

In [35]:
# Sum of squared errors between the prediction and the label over the batch.
loss = tf.reduce_sum(tf.square(result - plc_label))

# tf.layers.batch_normalization registers its moving mean/variance updates
# in the UPDATE_OPS collection; they only run if the train op depends on
# them. Without this the inference-mode statistics never leave their
# initial values.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    # The first positional argument of AdamOptimizer is the learning rate.
    # 0.9 is the usual beta1 value, not a usable step size, so use the
    # optimizer's default learning rate instead.
    opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

In [36]:
class Batch(object):
    """Sequential mini-batch iterator over paired samples and labels.

    Every call returns the next `batch_size` consecutive items of `x` and
    `y`. After `iter_per_epoch` calls the cursor rewinds to the beginning
    and `epochs_completed` is incremented. Items after the last full batch
    (`len(x) % batch_size` of them) are not served, and nothing is shuffled.
    """

    def __init__(self, x, y, batch_size):
        """Store the data and compute the number of batches per epoch.

        Args:
            x: sliceable sequence of samples.
            y: sliceable sequence of labels, aligned with `x`.
            batch_size: positive number of items per batch.

        Raises:
            ValueError: if `batch_size` is not positive.
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be positive, got %r' % (batch_size,))

        self.total_x = x
        self.total_y = y
        self.batch_size = batch_size

        n_items = len(x)
        if 0 < n_items < batch_size:
            # Fewer items than one batch: serve them all as a single short
            # batch per epoch instead of reporting zero iterations.
            self.iter_per_epoch = 1
        else:
            self.iter_per_epoch = n_items // batch_size
        self.epochs_completed = 0

        self._iter = 0

    def __call__(self):
        """Return the next `(batch_x, batch_y)` pair and advance the cursor."""
        start = self._iter * self.batch_size
        end = start + self.batch_size

        batch_x = self.total_x[start:end]
        batch_y = self.total_y[start:end]

        self._iter += 1
        # `>=` rather than `==` so the cursor always rewinds, even when
        # iter_per_epoch is 0 (empty data); with `==` it would run past the
        # end and never wrap.
        if self._iter >= self.iter_per_epoch:
            self.epochs_completed += 1
            self._iter = 0

        return batch_x, batch_y

In [37]:
# Mini-batch iterator over the training set (32 examples per batch) and the
# merged summary op that reports the loss.
batch = Batch(data, label, batch_size=32)
loss_summary = tf.summary.scalar('loss', loss)
summary = tf.summary.merge([loss_summary])

In [38]:
# Open a session and initialize every global variable of the graph.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [39]:
# Summary writer for ./summary/movie1; it is handed the session graph here
# and receives the loss summaries from the training loop.
writer = tf.summary.FileWriter('./summary/movie1', sess.graph)

In [41]:
# Training loop: one optimizer step and one loss summary per mini-batch.
n_epoch = 1000
for epoch in range(n_epoch):
    for n in range(batch.iter_per_epoch):
        batch_x, batch_y = batch()
        feed = {plc_embed: batch_x, plc_label: batch_y, training: True, keep_prob: 0.5}
        _, s = sess.run([opt, summary], feed_dict=feed)
        # Global step = number of batches processed so far. The per-epoch
        # stride is the batch count, not n_epoch; using n_epoch made steps
        # overlap or leave gaps whenever the two numbers differ.
        writer.add_summary(s, epoch * batch.iter_per_epoch + n)

KeyboardInterrupt: 