In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import time

import tensorflow as tf
import numpy as np

# rnn/ptb/reader.py

In [2]:
def _read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline character is replaced by the literal text ``<eos>`` before
    the content is split on whitespace.  No padding is inserted around the
    marker, so it only becomes a separate token when the newline is already
    surrounded by whitespace in the file.

    Args:
        filename: path passed to ``tf.gfile.GFile``.

    Returns:
        A list of token strings.
    """
    with tf.gfile.GFile(filename,'r') as f:
        text=f.read()
    # Depending on the Python version, a text-mode read hands back either
    # bytes or an already-decoded str.  A str has no ``decode`` on Python 3,
    # so decode only when bytes actually came back.
    if isinstance(text,bytes):
        text=text.decode('utf-8')
    return text.replace('\n','<eos>').split()

In [3]:
def _build_vocab(filename):
    """Map every distinct word of ``filename`` to an integer id.

    Ids follow the frequency ranking of the words: the most frequent word
    gets id 0.  Words with equal counts are ordered alphabetically.

    Args:
        filename: path of the text file whose words define the vocabulary.

    Returns:
        A dict from word to its rank-based integer id.
    """
    frequencies = collections.Counter(_read_words(filename))

    def rank(entry):
        # Negated count sorts descending; the word itself breaks ties.
        word, count = entry
        return (-count, word)

    ranked = sorted(frequencies.items(), key=rank)
    vocabulary, _ = list(zip(*ranked))
    return {word: index for index, word in enumerate(vocabulary)}

In [4]:
def _file_to_word_ids(filename, word_to_id):
    """Convert the words of ``filename`` into their integer ids.

    Args:
        filename: path of the text file to convert.
        word_to_id: mapping from word to integer id.

    Returns:
        A list of ids in file order.  Words absent from ``word_to_id`` are
        dropped silently.
    """
    ids = []
    for word in _read_words(filename):
        if word in word_to_id:
            ids.append(word_to_id[word])
    return ids

In [5]:
def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test files of a directory as word-id lists.

    The vocabulary is built from the training file only and then applied to
    all three files.

    Args:
        data_path: directory containing ``ptb.train.txt``, ``ptb.valid.txt``
            and ``ptb.test.txt``.  The ``None`` default is not handled here;
            it is passed straight to ``os.path.join``.

    Returns:
        A tuple ``(train_data, valid_data, test_data, vocabulary)`` where the
        first three are lists of word ids and ``vocabulary`` is the number of
        distinct words in the training file.
    """
    train_path, valid_path, test_path = [
        os.path.join(data_path, file_name)
        for file_name in ('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt')
    ]

    word_to_id = _build_vocab(train_path)
    train_data, valid_data, test_data = [
        _file_to_word_ids(path, word_to_id)
        for path in (train_path, valid_path, test_path)
    ]
    return train_data, valid_data, test_data, len(word_to_id)

In [6]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Build graph ops that iterate over ``raw_data`` in fixed-size windows.

    The id sequence is truncated to a multiple of ``batch_size`` and reshaped
    to ``[batch_size, batch_len]``.  A queue-backed index then selects
    consecutive windows of ``num_steps`` columns; the target window is the
    input window shifted one column to the right.

    Args:
        raw_data: sequence of word ids, converted to an int32 tensor.
        batch_size: number of rows the data is split into.
        num_steps: number of columns per window.
        name: optional name for the enclosing name scope.

    Returns:
        A pair ``(x, y)`` of tensors, each sliced to
        ``[batch_size, num_steps]``.  An assertion op with the message
        'epoch_size == 0,decrease batch_size or num_steps' guards the
        computed epoch size.
    """
    with tf.name_scope(name, 'PTBProducer', [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32)
        total_ids = tf.size(raw_data)
        row_len = total_ids // batch_size

        # Drop the tail that does not fill a complete row.
        grid = tf.reshape(raw_data[0:batch_size * row_len], [batch_size, row_len])

        # One column is reserved so the shifted target window always exists.
        epoch_size = (row_len - 1) // num_steps
        positive_check = tf.assert_positive(
            epoch_size,
            message='epoch_size == 0,decrease batch_size or num_steps')
        with tf.control_dependencies([positive_check]):
            epoch_size = tf.identity(epoch_size, name='epoch_size')

        # Unshuffled index producer over range(epoch_size).
        window = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        inputs = tf.slice(grid, [0, window * num_steps], [batch_size, num_steps])
        targets = tf.slice(grid, [0, window * num_steps + 1], [batch_size, num_steps])
        return inputs, targets

In [7]:
# reader_test.py example
# Toy input: 15 ids split into 3 rows of 5, so epoch_size = (5-1)//2 = 2.
raw_data = [4, 3, 2, 1, 0, 5, 6, 1, 1, 1, 1, 0, 3, 4, 1]
batch_size = 3
num_steps = 2

# Inline copy of the ptb_producer body so that the intermediate tensors
# `i` and `epoch_size` stay reachable for inspection below.
with tf.name_scope(None,'PTBProducer',[raw_data,batch_size,num_steps]):
    # Note: rebinds the Python list `raw_data` to a tensor.
    raw_data=tf.convert_to_tensor(raw_data,name='raw_data',dtype=tf.int32)
    data_len=tf.size(raw_data)
    batch_len=data_len//batch_size
    data=tf.reshape(raw_data[0:batch_size*batch_len],[batch_size,batch_len])
        
    epoch_size=(batch_len-1) // num_steps
    assertion=tf.assert_positive(epoch_size,message='epoch_size == 0,decrease batch_size or num_steps')        
    with tf.control_dependencies([assertion]):
        epoch_size=tf.identity(epoch_size,name='epoch_size')
        
    # Queue-backed window index; x is the input window, y the same window
    # shifted one column to the right.
    i=tf.train.range_input_producer(epoch_size,shuffle=False).dequeue()
    x=tf.slice(data,[0,i*num_steps],[batch_size,num_steps])
    y=tf.slice(data,[0,i*num_steps+1],[batch_size,num_steps])
    
with tf.Session() as sess:
    # The index queue is filled by queue-runner threads, which must be
    # started before `i` can be dequeued.
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord=coord)
    try:
        # Each run dequeues the next index; the recorded output shows it
        # cycling 0, 1, 0, 1 with epoch_size == 2.
        print(sess.run([i,epoch_size]))
        print(sess.run(i))
        print(sess.run(i))
        print(sess.run(i))
    finally:
        # Always stop and join the runner threads, even if a run fails.
        coord.request_stop()
        coord.join()

[0, 2]
1
0
1


# ptb_word_lm.py

In [8]:
# Short aliases for the TensorFlow flags and logging modules.
flags=tf.flags
logging=tf.logging

# Command-line style options read by the rest of the notebook through FLAGS.
# get_config() additionally accepts 'test' as a value for --model.
flags.DEFINE_string('model','small','A type of model. Possible options are: small, medium, large')
flags.DEFINE_string('data_path','ptb','Where the training/test data is stored')
flags.DEFINE_string('save_path','ptb','Model output directory')
flags.DEFINE_bool('use_fp16',False,'Train using 16-bit floats instead of 32bit floats')

# Global accessor for the flag values defined above.
FLAGS=flags.FLAGS

In [9]:
class SmallConfig(object):
    """Hyperparameters selected by ``--model=small``."""

    # Initialisation and optimisation.
    init_scale = 0.1       # weights drawn uniformly from [-init_scale, init_scale]
    learning_rate = 1.0    # initial learning rate
    max_grad_norm = 5      # global-norm threshold for gradient clipping
    lr_decay = 0.5         # per-epoch decay factor once max_epoch is exceeded

    # Network shape.
    num_layers = 2         # stacked LSTM layers
    num_steps = 20         # unrolled time steps per batch
    hidden_size = 200      # LSTM units, also the embedding width
    vocab_size = 10000     # rows of the embedding matrix
    keep_prob = 1.0        # dropout keep probability (values below 1 enable dropout)

    # Schedule.
    max_epoch = 4          # epochs trained at the initial learning rate
    max_max_epoch = 13     # total number of training epochs
    batch_size = 20        # sequences per batch

In [10]:
class MediumConfig(object):
    """Hyperparameters selected by ``--model=medium``."""

    # Initialisation and optimisation.
    init_scale = 0.05      # weights drawn uniformly from [-init_scale, init_scale]
    learning_rate = 1.0    # initial learning rate
    max_grad_norm = 5      # global-norm threshold for gradient clipping
    lr_decay = 0.8         # per-epoch decay factor once max_epoch is exceeded

    # Network shape.
    num_layers = 2         # stacked LSTM layers
    num_steps = 35         # unrolled time steps per batch
    hidden_size = 650      # LSTM units, also the embedding width
    vocab_size = 10000     # rows of the embedding matrix
    keep_prob = 0.5        # dropout keep probability (values below 1 enable dropout)

    # Schedule.
    max_epoch = 6          # epochs trained at the initial learning rate
    max_max_epoch = 39     # total number of training epochs
    batch_size = 20        # sequences per batch

In [11]:
class LargeConfig(object):
    """Hyperparameters selected by ``--model=large``."""

    # Initialisation and optimisation.
    init_scale = 0.04      # weights drawn uniformly from [-init_scale, init_scale]
    learning_rate = 1.0    # initial learning rate
    max_grad_norm = 10     # global-norm threshold for gradient clipping
    lr_decay = 1 / 1.15    # per-epoch decay factor once max_epoch is exceeded

    # Network shape.
    num_layers = 2         # stacked LSTM layers
    num_steps = 35         # unrolled time steps per batch
    hidden_size = 1500     # LSTM units, also the embedding width
    vocab_size = 10000     # rows of the embedding matrix
    keep_prob = 0.35       # dropout keep probability (values below 1 enable dropout)

    # Schedule.
    max_epoch = 14         # epochs trained at the initial learning rate
    max_max_epoch = 55     # total number of training epochs
    batch_size = 20        # sequences per batch

In [12]:
class TestConfig(object):
    """Tiny config, for testing (selected by ``--model=test``)."""

    # Initialisation and optimisation.
    init_scale = 0.1       # weights drawn uniformly from [-init_scale, init_scale]
    learning_rate = 1.0    # initial learning rate
    max_grad_norm = 1      # global-norm threshold for gradient clipping
    lr_decay = 0.5         # per-epoch decay factor once max_epoch is exceeded

    # Network shape.
    num_layers = 1         # stacked LSTM layers
    num_steps = 2          # unrolled time steps per batch
    hidden_size = 2        # LSTM units, also the embedding width
    vocab_size = 10000     # rows of the embedding matrix
    keep_prob = 1.0        # dropout keep probability (values below 1 enable dropout)

    # Schedule.
    max_epoch = 1          # epochs trained at the initial learning rate
    max_max_epoch = 1      # total number of training epochs
    batch_size = 20        # sequences per batch

In [13]:
def get_config():
    """Instantiate the configuration class selected by ``FLAGS.model``.

    Returns:
        A new instance of ``SmallConfig``, ``MediumConfig``, ``LargeConfig``
        or ``TestConfig`` for the values 'small', 'medium', 'large' and
        'test' respectively.  Each call returns a fresh instance, so
        attributes set on one result do not affect another.

    Raises:
        ValueError: if ``FLAGS.model`` is none of the four accepted names.
    """
    if FLAGS.model=='small':
        return SmallConfig()
    elif FLAGS.model=='medium':
        return MediumConfig()
    elif FLAGS.model=='large':
        return LargeConfig()
    elif FLAGS.model=='test':
        return TestConfig()
    else:
        # Interpolate explicitly: exceptions do not apply %-formatting to
        # their arguments, so passing the value as a second argument would
        # leave a raw '%s' in the message.
        raise ValueError('Invalid model: %s' % FLAGS.model)

In [14]:
def data_type():
    """Return the float dtype used for model variables and state.

    Returns:
        ``tf.float16`` when ``FLAGS.use_fp16`` is set, otherwise
        ``tf.float32``.
    """
    if FLAGS.use_fp16:
        return tf.float16
    return tf.float32

In [15]:
class PTBInput(object):
    """Input pipeline for one data split.

    Attributes:
        batch_size: taken from ``config.batch_size``.
        num_steps: taken from ``config.num_steps``.
        epoch_size: number of batches per epoch, computed in plain Python as
            ``((len(data) // batch_size) - 1) // num_steps``.
        input_data, targets: the tensor pair returned by ``ptb_producer``.
    """

    def __init__(self, config, data, name=None):
        """Create the producer ops for ``data`` using the sizes in ``config``.

        Args:
            config: object exposing ``batch_size`` and ``num_steps``.
            data: sequence of word ids (must support ``len``).
            name: optional name forwarded to ``ptb_producer``.
        """
        batch_size = config.batch_size
        num_steps = config.num_steps
        row_len = len(data) // batch_size

        self.batch_size = batch_size
        self.num_steps = num_steps
        self.epoch_size = (row_len - 1) // num_steps
        self.input_data, self.targets = ptb_producer(
            data, batch_size, num_steps, name=name)

In [21]:
class PTBModel(object):
    """Multi-layer LSTM language model built on top of a ``PTBInput``.

    The constructor builds the forward graph (embedding lookup, unrolled
    LSTM, softmax loss).  When ``is_training`` is true it additionally builds
    the gradient-clipped SGD update and the learning-rate assignment op.

    The accessors below are plain methods, not properties; callers invoke
    them with parentheses (e.g. ``model.cost()``).  ``lr()`` and
    ``train_op()`` are only usable on a model built with
    ``is_training=True``, since their attributes are not set otherwise.
    """

    def __init__(self,is_training,config,input_):
        """Build the model graph.

        Args:
            is_training: when true, dropout (if ``config.keep_prob<1``) and
                the training ops are added.
            config: object exposing ``hidden_size``, ``vocab_size``,
                ``keep_prob``, ``num_layers`` and ``max_grad_norm``.
            input_: object exposing ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets``.
        """
        self._input=input_
        
        batch_size=input_.batch_size
        num_steps=input_.num_steps
        size=config.hidden_size
        vocab_size=config.vocab_size
        
        lstm_cell=tf.nn.rnn_cell.BasicLSTMCell(size,forget_bias=0.0,state_is_tuple=True)
        if is_training and config.keep_prob<1:
            # Dropout on the outputs of every LSTM layer, training only.
            lstm_cell=tf.nn.rnn_cell.DropoutWrapper(lstm_cell,output_keep_prob=config.keep_prob)
        cell=tf.nn.rnn_cell.MultiRNNCell([lstm_cell]*config.num_layers,state_is_tuple=True)
        
        self._initial_state=cell.zero_state(batch_size,data_type())
        
        # The embedding table and its lookup are pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding=tf.get_variable('embedding',[vocab_size,size],dtype=data_type())
            inputs=tf.nn.embedding_lookup(embedding,input_.input_data)
            
        if is_training and config.keep_prob<1:
            # Dropout on the embedded inputs.  The op is tf.nn.dropout; the
            # previous spelling `droput` does not exist and raised
            # AttributeError for every config with keep_prob<1.
            inputs=tf.nn.dropout(inputs,config.keep_prob)
            
        # Manually unrolled RNN: variables are created on the first step and
        # reused on every later step.
        outputs=[]
        state=self._initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                if time_step>0: tf.get_variable_scope().reuse_variables()
                (cell_output,state)=cell(inputs[:,time_step,:],state)
                outputs.append(cell_output)
                
        # Join the per-step outputs along axis 1 and flatten to rows of
        # `size` features.  The axis-first argument order of tf.concat is
        # that of the TensorFlow release this notebook targets.
        output=tf.reshape(tf.concat(1,outputs),[-1,size])
        softmax_w=tf.get_variable('softmax_w',[size,vocab_size],dtype=data_type())
        softmax_b=tf.get_variable('softmax_b',[vocab_size],dtype=data_type())
        logits=tf.matmul(output,softmax_w)+softmax_b
        # Per-position loss with unit weights for every position.
        loss=tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(input_.targets,[-1])],
            [tf.ones([batch_size*num_steps],dtype=data_type())])
        # Total loss over all positions, divided by batch_size only;
        # run_epoch divides the accumulated cost by the number of steps.
        self._cost=cost=tf.reduce_sum(loss)/batch_size
        self._final_state=state
        
        # Evaluation-only models stop here: no optimizer, no learning rate.
        if not is_training:
            return
        
        self._lr=tf.Variable(0.0,trainable=False)
        tvars=tf.trainable_variables()
        # Clip gradients by their global norm before applying plain SGD.
        grads,_=tf.clip_by_global_norm(tf.gradients(cost,tvars),config.max_grad_norm)
        optimizer=tf.train.GradientDescentOptimizer(self._lr)
        self._train_op=optimizer.apply_gradients(
            zip(grads,tvars),global_step=tf.contrib.framework.get_or_create_global_step())
        # The learning rate is changed by feeding a placeholder into an
        # assign op, see assign_lr().
        self._new_lr=tf.placeholder(tf.float32,shape=[],name='new_learning_rate')
        self._lr_update=tf.assign(self._lr,self._new_lr)
        
    def assign_lr(self,session,lr_value):
        """Set the learning-rate variable to ``lr_value`` in ``session``."""
        session.run(self._lr_update,feed_dict={self._new_lr:lr_value})

    def input(self):
        """Return the input object passed to the constructor."""
        return self._input

    def initial_state(self):
        """Return the zero state of the stacked cell."""
        return self._initial_state

    def cost(self):
        """Return the cost tensor (summed loss divided by batch_size)."""
        return self._cost

    def final_state(self):
        """Return the cell state after the last unrolled step."""
        return self._final_state

    def lr(self):
        """Return the learning-rate variable (training models only)."""
        return self._lr

    def train_op(self):
        """Return the gradient-update op (training models only)."""
        return self._train_op

In [34]:
def run_epoch(session,model,eval_op=None,verbose=False):
    """Run ``model`` over one epoch of its input and return the perplexity.

    The recurrent state returned by each step is fed back in as the initial
    state of the next step.

    Args:
        session: object whose ``run(fetches, feed_dict)`` evaluates the graph.
        model: object exposing ``input()``, ``initial_state()``, ``cost()``
            and ``final_state()`` as methods.
        eval_op: optional extra op fetched on every step (e.g. a train op).
        verbose: when true, print progress lines during the epoch.

    Returns:
        ``np.exp(costs/iters)``, where ``costs`` accumulates the fetched cost
        and ``iters`` accumulates ``num_steps`` per step.

    Raises:
        ZeroDivisionError: if the input's ``epoch_size`` is 0, because no
            step is run and ``iters`` stays 0.
    """
    start_time=time.time()
    costs=0.0
    iters=0
    state=session.run(model.initial_state())
    
    fetches={
        'cost':model.cost(),
        'final_state':model.final_state(),
    }
    if eval_op is not None:
        fetches['eval_op']=eval_op
        
    # Loop-invariant lookups, hoisted out of the step loop.
    epoch_size=model.input().epoch_size
    # epoch_size//10 is 0 for epochs shorter than 10 steps; a zero interval
    # switches progress output off instead of raising on `step % 0`.
    log_interval=epoch_size//10
        
    for step in range(epoch_size):
        # Feed the previous step's final state into the initial-state
        # tensors of every layer.
        feed_dict={}
        for i,(c,h) in enumerate(model.initial_state()):
            feed_dict[c]=state[i].c
            feed_dict[h]=state[i].h
            
        vals=session.run(fetches,feed_dict)
        cost=vals['cost']
        state=vals['final_state']
        
        costs+=cost
        iters+=model.input().num_steps
        
        if verbose and log_interval and step % log_interval==10:
            print('%.3f perplexity: %.3f speed: %.0f wps' % 
                  (step*1.0/epoch_size,np.exp(costs/iters),
                  iters*model.input().batch_size/(time.time()-start_time)))
    return np.exp(costs/iters)

In [18]:
# Fail early when no data directory is configured.
if not FLAGS.data_path:
    raise ValueError('Must set --data_path to PTB data directory')
    
# The fourth return value (vocabulary size) is discarded here.
train_data,valid_data,test_data,_=ptb_raw_data(FLAGS.data_path)

# get_config() returns a new instance on every call, so the overrides below
# are set on the eval_config instance only and leave `config` untouched.
config=get_config()
eval_config=get_config()
eval_config.batch_size=1
eval_config.num_steps=1

In [36]:
# Build the train/valid/test models in a fresh graph, then train.
with tf.Graph().as_default():
    initializer=tf.random_uniform_initializer(-config.init_scale,config.init_scale)
    
    # The training model creates the variables under the 'Model' scope.
    with tf.name_scope('Train'):
        train_input=PTBInput(config=config,data=train_data,name='TrainInput')
        with tf.variable_scope('Model',reuse=None,initializer=initializer):
            m=PTBModel(is_training=True,config=config,input_=train_input)
        tf.scalar_summary('Training Loss',m.cost())
        tf.scalar_summary('Learning Rate',m.lr())
        
    # The validation model reuses those variables (reuse=True) and is built
    # with the training `config`.
    with tf.name_scope('Valid'):
        valid_input=PTBInput(config=config,data=valid_data,name='ValidInput')
        with tf.variable_scope('Model',reuse=True,initializer=initializer):
            mvalid=PTBModel(is_training=False,config=config,input_=valid_input)
        tf.scalar_summary('Validation Loss',mvalid.cost())
        
    # The test model also reuses the variables but uses eval_config
    # (batch_size=1, num_steps=1).
    with tf.name_scope('Test'):
        test_input=PTBInput(config=eval_config,data=test_data,name='TestInput')
        with tf.variable_scope('Model',reuse=True,initializer=initializer):
            mtest=PTBModel(is_training=False,config=eval_config,input_=test_input)
        
    sv=tf.train.Supervisor(logdir=FLAGS.save_path)
    with sv.managed_session() as session:
        for i in range(config.max_max_epoch):
            # The exponent stays 0 for the first max_epoch epochs, so the
            # learning rate is constant there and then decays geometrically.
            lr_decay=config.lr_decay**max(i+1-config.max_epoch,0.0)
            m.assign_lr(session,config.learning_rate*lr_decay)
            
            print('Epoch: %d Learning rate: %.3f' % (i+1,session.run(m.lr())))
            # Only the training pass fetches the update op.
            train_perplexity=run_epoch(session,m,eval_op=m.train_op(),verbose=True)
            
            print('Epoch: %d Train Perplexity: %.3f' % (i+1,train_perplexity))
            valid_perplexity=run_epoch(session,mvalid)
            print('Epoch: %d Valid Perplexity: %.3f' % (i+1,valid_perplexity))
            
        # Test perplexity is measured once, after the last epoch.
        test_perplexity=run_epoch(session,mtest)
        print('Test Perplexity: %.3f' % test_perplexity)
        
        if FLAGS.save_path:
            print('Saving model to %s.' % FLAGS.save_path)
            sv.saver.save(session,FLAGS.save_path,global_step=sv.global_step)

Epoch: 1 Learning rate: 1.000
0.004 perplexity: 5965.121 speed: 5273 wps
0.104 perplexity: 836.646 speed: 18855 wps
0.204 perplexity: 621.997 speed: 21972 wps
0.304 perplexity: 503.289 speed: 23343 wps
0.404 perplexity: 435.003 speed: 24112 wps
0.504 perplexity: 390.037 speed: 24584 wps
0.604 perplexity: 351.578 speed: 24928 wps
0.703 perplexity: 324.803 speed: 25123 wps
0.803 perplexity: 303.517 speed: 25290 wps
0.903 perplexity: 283.973 speed: 25416 wps
Epoch: 1 Train Perplexity: 269.580
Epoch: 1 Valid Perplexity: 182.811
Epoch: 2 Learning rate: 1.000
0.004 perplexity: 202.417 speed: 27499 wps
0.104 perplexity: 150.811 speed: 26576 wps
0.204 perplexity: 158.375 speed: 26514 wps
0.304 perplexity: 152.989 speed: 26607 wps
0.404 perplexity: 150.200 speed: 26525 wps
0.504 perplexity: 147.834 speed: 26521 wps
0.604 perplexity: 143.278 speed: 26581 wps
0.703 perplexity: 141.084 speed: 26590 wps
0.803 perplexity: 139.063 speed: 26621 wps
0.903 perplexity: 135.405 speed: 26635 wps
Epoch: 2 T