# Text Generation

In [1]:
import os
import time
import random
import json
import numpy as np
import tensorflow as tf
import random
import nltk
from tqdm import tnrange, tqdm_notebook
nltk.download("punkt")

  from ._conv import register_converters as _register_converters


[nltk_data] Downloading package punkt to /Users/junwonpk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class ModelNetwork:
    """
    RNN with num_layers LSTM layers and a fully-connected output layer
    The network allows for a dynamic number of iterations, depending on the
    inputs it receives.
       out   (fc layer; out_size)
        ^
       lstm
        ^
       lstm  (lstm size)
        ^
        in   (in_size)
    """
    def __init__(self, in_size, lstm_size, num_layers, out_size, session,
                 learning_rate=0.003, name="rnn"):
        self.scope = name
        self.in_size = in_size
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.out_size = out_size
        self.session = session
        self.learning_rate = tf.constant(learning_rate)
        # Last state of LSTM, used when running the network in TEST mode
        self.lstm_last_state = np.zeros(
            (self.num_layers * 2 * self.lstm_size,)
        )
        with tf.variable_scope(self.scope):
            # (batch_size, timesteps, in_size)
            self.xinput = tf.placeholder(
                tf.float32,
                shape=(None, None, self.in_size),
                name="xinput"
            )
            self.lstm_init_value = tf.placeholder(
                tf.float32,
                shape=(None, self.num_layers * 2 * self.lstm_size),
                name="lstm_init_value"
            )
            # LSTM
            self.lstm_cells = [
                tf.contrib.rnn.BasicLSTMCell(
                    self.lstm_size,
                    forget_bias=1.0,
                    state_is_tuple=False
                ) for i in range(self.num_layers)
            ]
            self.lstm = tf.contrib.rnn.MultiRNNCell(
                self.lstm_cells,
                state_is_tuple=False
            )
            # Iteratively compute output of recurrent network
            outputs, self.lstm_new_state = tf.nn.dynamic_rnn(
                self.lstm,
                self.xinput,
                initial_state=self.lstm_init_value,
                dtype=tf.float32
            )
            # Linear activation (FC layer on top of the LSTM net)
            self.rnn_out_W = tf.Variable(
                tf.random_normal(
                    (self.lstm_size, self.out_size),
                    stddev=0.01
                )
            )
            self.rnn_out_B = tf.Variable(
                tf.random_normal(
                    (self.out_size,), stddev=0.01
                )
            )
            outputs_reshaped = tf.reshape(outputs, [-1, self.lstm_size])
            network_output = tf.matmul(
                outputs_reshaped,
                self.rnn_out_W
            ) + self.rnn_out_B
            batch_time_shape = tf.shape(outputs)
            self.final_outputs = tf.reshape(
                tf.nn.softmax(network_output),
                (batch_time_shape[0], batch_time_shape[1], self.out_size)
            )
            # Training: provide target outputs for supervised training.
            self.y_batch = tf.placeholder(
                tf.float32,
                (None, None, self.out_size)
            )
            y_batch_long = tf.reshape(self.y_batch, [-1, self.out_size])
            self.cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=network_output,
                    labels=y_batch_long
                )
            )
            self.train_op = tf.train.RMSPropOptimizer(
                self.learning_rate,
                0.9
            ).minimize(self.cost)

    # Input: X is a single element, not a list!
    def run_step(self, x, init_zero_state=True):
        # Reset the initial state of the network.
        if init_zero_state:
            init_value = np.zeros((self.num_layers * 2 * self.lstm_size,))
        else:
            init_value = self.lstm_last_state
        out, next_lstm_state = self.session.run(
            [self.final_outputs, self.lstm_new_state],
            feed_dict={
                self.xinput: [x],
                self.lstm_init_value: [init_value]
            }
        )
        self.lstm_last_state = next_lstm_state[0]
        return out[0][0]

    # xbatch must be (batch_size, timesteps, input_size)
    # ybatch must be (batch_size, timesteps, output_size)
    def train_batch(self, xbatch, ybatch):
        init_value = np.zeros(
            (xbatch.shape[0], self.num_layers * 2 * self.lstm_size)
        )
        cost, _ = self.session.run(
            [self.cost, self.train_op],
            feed_dict={
                self.xinput: xbatch,
                self.y_batch: ybatch,
                self.lstm_init_value: init_value
            }
        )
        return cost


def embed_to_vocab(data_, vocab):
    """
    Embed string to character-arrays -- it generates an array len(data)
    x len(vocab).
    Vocab is a list of elements.
    """
    data = np.zeros((len(data_), len(vocab)))
    cnt = 0
    for s in data_:
        v = [0.0] * len(vocab)
        v[vocab.index(s)] = 1.0
        data[cnt, :] = v
        cnt += 1
    return data


def decode_embed(array, vocab):
    return vocab[array.index(1)]


def load_data(input):
    # Load the data
    data_ = ""
    with open(input, 'r') as f:
        data_ += f.read()
    data_ = data_.lower()
    # Convert to 1-hot coding
    vocab = sorted(list(set(data_)))
    data = embed_to_vocab(data_, vocab)
    return data, vocab


def check_restore_parameters(sess, saver):
    """ Restore the previously trained parameters if there are any. """
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('saved/checkpoint'))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)

In [3]:
def train(input_file):
    ckpt_file = "saved/model.ckpt"

    data, vocab = load_data(input_file)

    in_size = out_size = len(vocab)
    lstm_size = 256  # 128
    num_layers = 2
    batch_size = 64  # 128
    time_steps = 100  # 50

    NUM_TRAIN_BATCHES = 20000

    # Number of test characters of text to generate after training the network
    LEN_TEST_TEXT = 500

    # Initialize the network
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    net = ModelNetwork(
        in_size=in_size,
        lstm_size=lstm_size,
        num_layers=num_layers,
        out_size=out_size,
        session=sess,
        learning_rate=0.003,
        name="char_rnn_network"
    )
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    # TRAIN THE NETWORK
#     check_restore_parameters(sess, saver)
    last_time = time.time()
    batch = np.zeros((batch_size, time_steps, in_size))
    batch_y = np.zeros((batch_size, time_steps, in_size))
    possible_batch_ids = range(data.shape[0] - time_steps - 1)

    for i in tnrange(NUM_TRAIN_BATCHES):
        # Sample time_steps consecutive samples from the dataset text file
        batch_id = random.sample(possible_batch_ids, batch_size)

        for j in range(time_steps):
            ind1 = [k + j for k in batch_id]
            ind2 = [k + j + 1 for k in batch_id]

            batch[:, j, :] = data[ind1, :]
            batch_y[:, j, :] = data[ind2, :]

        cst = net.train_batch(batch, batch_y)

        if (i % 100) == 0:
            new_time = time.time()
            diff = new_time - last_time
            last_time = new_time
            print("batch: {}  loss: {}  speed: {} batches / s".format(
                i, cst, 100 / diff
            ))
            saver.save(sess, ckpt_file)

In [4]:
def generate(input_file, prefix):
    ckpt_file = "saved/model.ckpt"

    data, vocab = load_data(input_file)

    in_size = out_size = len(vocab)
    lstm_size = 256  # 128
    num_layers = 2
    batch_size = 64  # 128
    time_steps = 100  # 50

    NUM_TRAIN_BATCHES = 20000

    # Number of test characters of text to generate after training the network
    LEN_TEST_TEXT = 500

    # Initialize the network
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    net = ModelNetwork(
        in_size=in_size,
        lstm_size=lstm_size,
        num_layers=num_layers,
        out_size=out_size,
        session=sess,
        learning_rate=0.003,
        name="char_rnn_network"
    )
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    # 2) GENERATE LEN_TEST_TEXT CHARACTERS USING THE TRAINED NETWORK
    saver.restore(sess, ckpt_file)

    TEST_PREFIX = prefix
    TEST_PREFIX = TEST_PREFIX.lower()
    for i in range(len(TEST_PREFIX)):
        out = net.run_step(embed_to_vocab(TEST_PREFIX[i], vocab), i == 0)

    print("Sentence:")
    gen_str = TEST_PREFIX
    for i in range(LEN_TEST_TEXT):
        # Sample character from the network according to the generated
        # output probabilities.
        element = np.random.choice(range(len(vocab)), p=out)
        gen_str += vocab[element]
        out = net.run_step(embed_to_vocab(vocab[element], vocab), False)

    print(gen_str)

In [5]:
# train("data/shakespeare.txt")

In [6]:
# generate("data/shakespeare.txt", "The")

In [None]:
def writeTXT(threshold):
    startwords = set()
    senlens = list()
    numsens = list()
    with open("data/ProcessedTrain", "r") as inFile:
        with open("data/inspireComments.txt", "w") as outFile:
            for i, line in enumerate(inFile, 1):
                comment = json.loads(line)
                if comment["num_child_comments"] > threshold:
                    sentences = nltk.sent_tokenize(comment["body"])
                    numsens.append(len(sentences))
                    for sentence in sentences:
                        words = nltk.word_tokenize(sentence)
                        startwords.add(words[0])
                        senlens.append(len(words))
                        outFile.write(sentence + " ")
                if i % 1000000 == 0:
                    print ("Processed {} lines".format(i))
    startwords = list(startwords)
    return startwords, senlens, numsens
# writeTXT(30)

In [None]:
train("data/inspireComments.txt")

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



batch: 0  loss: 4.940942287445068  speed: 137.92833113554076 batches / s
batch: 100  loss: 3.2343590259552  speed: 1.7801784460071794 batches / s
batch: 200  loss: 3.052891731262207  speed: 1.7472711916230277 batches / s
batch: 300  loss: 2.5932068824768066  speed: 1.4706789007857004 batches / s
batch: 400  loss: 2.2671058177948  speed: 1.6185652791417167 batches / s
batch: 500  loss: 1.9780768156051636  speed: 1.695493333861213 batches / s
batch: 600  loss: 1.891066551208496  speed: 1.591594518947286 batches / s
batch: 700  loss: 1.6866545677185059  speed: 1.4182211306821173 batches / s
batch: 800  loss: 1.6181329488754272  speed: 1.4162201279193034 batches / s
batch: 900  loss: 1.5571234226226807  speed: 1.5041335104395688 batches / s
batch: 1000  loss: 1.538813829421997  speed: 1.579233956632582 batches / s
batch: 1100  loss: 1.4829394817352295  speed: 1.5186433616196546 batches / s
batch: 1200  loss: 1.4241018295288086  speed: 1.591538208125636 batches / s


In [None]:
prefix = startwords[random.randint(0,len(startwords)-1)]
print(prefix)
generate("data/shakespeare.txt", prefix) 
#TODO: control comment length. numsens * senlens