* get training data

In [1]:
!wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

--2017-01-30 19:26:43--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34869662 (33M) [application/x-gtar]
Saving to: 'simple-examples.tgz'


2017-01-30 19:27:48 (535 KB/s) - 'simple-examples.tgz' saved [34869662/34869662]



In [5]:
!tar xfz simple-examples.tgz

tar: Failed to set default locale


In [7]:
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.rnn_cell import BasicLSTMCell, MultiRNNCell, DropoutWrapper

In [9]:
from tensorflow.models.rnn.ptb import reader

In [19]:
# 학습 설정용 configuration class
class SmallConfig(object):
    init_scale = 0.1 # 가중치 행렬을 랜덤하게 초기화할 때 생성되는 값의 범위
    learning_rate = 1.0 # 학습 속도 조절. 학습 속도 = learning_rate * lr_decay
    max_grad_norm = 5 # gradient clipping threshold. gradient l2-norm이 threshold보다 큰 경우, threshold / l2-norm 값을 gradient에 곱한다.
    num_layers = 2
    num_steps = 20 # 연속적으로 처리할 데이터 양.
    hidden_size = 200 # 한 layer에 배치할 뉴런의 수.
    max_epoch = 4 # 여기 도달할 때까지는 (최초 5회는) 초기 학습 속도가 유지된다.
    max_max_epoch = 13 # 전체 학습 횟수
    keep_prob = 1.0 # drop out 할 때 유지할 비율. 1.0은 drop out을 하지 않음.
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000
    

config = SmallConfig()
eval_config = SmallConfig()
eval_config.batch_size = 1
eval_config.num_steps = 1

class PTBModel(object):
    def __init__(self, config, is_training=False):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps
        input_size = [config.batch_size, config.num_steps]
        self.input_data = tf.placeholder(tf.int32, input_size)
        self.targets = tf.placeholder(tf.int32, input_size)
        
        lstm_cell = BasicLSTMCell(config.hidden_size, forget_bias=0.0, state_is_tuple=True)
        
        # SmallConfig에서는 Dropout이 적용되지 않음.
        if is_training and config.keep_prob < 1.0:
            lstm_cell = DropoutWrapper(lstm_cell, config.keep_prob)
            
        # 두 개의 계층을 가진 신경망 구조
        cell = MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True)
        
        self.initial_state = cell.zero_state(config.batch_size, tf.float32)
        
        with tf.device("/cpu:0"):
            embedding_size = [config.vocab_size, config.hidden_size]
            embedding = tf.get_variable("embedding", embedding_size)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)
            
        # SmallConfig에서는 dropout이 적용되지 않음.
        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)
            
        # 각 배치마다 순서대로 데이터를 뽑아 셀에 입력.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(config.num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
                
        # output의 크기를 20*20*200에서 400*200으로 변경
        output = tf.reshape(tf.concat(1, outputs), [-1, config.hidden_size])
        softmax_w_size = [config.hidden_size, config.vocab_size]
        softmax_w = tf.get_variable("softmax_w", softmax_w_size)
        softmax_b = tf.get_variable("softmax_b", [config.vocab_size])
        
        # logits의 크기는 400 * 10000
        logits = tf.matmul(output, softmax_w) + softmax_b
        
        loss = tf.nn.seq2seq.sequence_loss_by_example(
                    [logits],
                    [tf.reshape(self.targets, [-1])],
                    [tf.ones([config.batch_size * config.num_steps])])
        self.cost = tf.reduce_sum(loss) / config.batch_size
        self.final_state = state
        
        if not is_training:
            return
        
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        
        # 기울기 클리핑
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
        
    def assign_lr(self, session, lr_value):
        session.run(tf.assign(self.lr, lr_value))
        

def run_epoch(session, m, data, is_training=False):
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0
    
    eval_op = m.train_op if is_training else tf.no_op()
    
    # initial_state는 20*200 텐서를 원소로 갖는 2*2 튜플
    state_list = []
    for c, h in m.initial_state:
        state_list.extend([c.eval(), h.eval()])
        
    ptb_iter = reader.ptb_iterator(data, m.batch_size, m.num_steps)
    for step, (x, y) in enumerate(ptb_iter):
        fetch_list = [m.cost]
        
        # final_state 튜플에 담긴 상태를 꺼내 fetch_list에 담는다.
        for c, h in m.final_state:
            fetch_list.extend([c, h])
        
        fetch_list.append(eval_op)
        
        # 이전 스텝에서 구한 state_list가 feed_dict로 주입
        feed_dict = {m.input_data: x, m.targets: y}
        for i in range(len(m.initial_state)):
            c, h = m.initial_state[i]
            feed_dict[c], feed_dict[h] = state_list[i*2:(i+1)*2]
            
        # fetch_list에 담긴 final_state의 결과를 state_list로 전달받음.
        vals = session.run(fetch_list, feed_dict)
        
        cost = vals[0]
        state = vals[1]

        cost += cost
        iters += m.num_steps
        
        if is_training and step % (epoch_size // 10) == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                     (step * 1.0 / epoch_size, np.exp(costs / iters),
                     iters * m.batch_size / (time.time() - start_time)))
        
    return np.exp(costs / iters)


raw_data = reader.ptb_raw_data('simple-examples/data')
train_data, valid_data, test_data, _ = raw_data

with tf.Graph().as_default(), tf.Session() as session:
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
    
    # 학습, 검증, 테스트를 위한 모델을 만든다.
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        m = PTBModel(config, is_training=True)
        
    with tf.variable_scope("model", reuse=True, initializer=initializer):
        mvalid = PTBModel(config)
        mtest = PTBModel(eval_config)
        
    tf.initialize_all_variables().run()
    
    for i in range(config.max_max_epoch):
        # lr_decay는 반복 속도를 조절해주는 역할
        lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
        m.assign_lr(session, config.learning_rate * lr_decay)
        print("Epoch: %d Learning rate: %.3f" % (i+1, session.run(m.lr)))
        
        perplexity = run_epoch(session, m, train_data, is_training=True)
        print("Epoch: %d Train Perplexity: %.3f" % (i+1, perplexity))
        
        perplexity = run_epoch(session, mvalid, valid_data)
        print("Epoch: %d Valid Perplexity: %.3f" % (i+1, perplexity))
        
    perplexity = run_epoch(session, mtest, test_data)
    print("Test Perplexity: %.3f" % perplexity)

Epoch: 1 Learning rate: 1.000
0.004 perplexity: 1.000 speed: 2071 wps
0.104 perplexity: 1.000 speed: 2194 wps
0.204 perplexity: 1.000 speed: 2152 wps
0.304 perplexity: 1.000 speed: 2157 wps
0.404 perplexity: 1.000 speed: 2158 wps
0.504 perplexity: 1.000 speed: 2160 wps
0.604 perplexity: 1.000 speed: 2160 wps
0.703 perplexity: 1.000 speed: 2159 wps
0.803 perplexity: 1.000 speed: 2158 wps
0.903 perplexity: 1.000 speed: 2158 wps
Epoch: 1 Train Perplexity: 1.000
Epoch: 1 Valid Perplexity: 1.000
Epoch: 2 Learning rate: 1.000
0.004 perplexity: 1.000 speed: 2141 wps
0.104 perplexity: 1.000 speed: 2149 wps
0.204 perplexity: 1.000 speed: 2146 wps
0.304 perplexity: 1.000 speed: 2148 wps
0.404 perplexity: 1.000 speed: 2145 wps
0.504 perplexity: 1.000 speed: 2141 wps
0.604 perplexity: 1.000 speed: 2140 wps
0.703 perplexity: 1.000 speed: 2137 wps
0.803 perplexity: 1.000 speed: 2136 wps
0.903 perplexity: 1.000 speed: 2135 wps
Epoch: 2 Train Perplexity: 1.000
Epoch: 2 Valid Perplexity: 1.000
Epoch: 3