<a href="https://colab.research.google.com/github/kyrcha/deep-learning-pipelines/blob/master/generating_paper_titles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating plausible paper titles with Recurrent Neural Networks

In this notebook we will generate fictional paper titles using recurrent neural networks, more specifically LSTM networks. We reused some ideas and code from:
- https://adventuresinmachinelearning.com/keras-lstm-tutorial/ (Keras stuff)
- https://github.com/dennybritz/rnn-tutorial-rnnlm/blob/master/RNNLM.ipynb (Preprocessing stuff)

In [0]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
%%capture
# Download the NLTK "book" data collection (this only has to be done once per
# environment). NOTE(review): presumably this collection includes the
# tokenizer models needed by nltk.sent_tokenize / nltk.word_tokenize, which
# are called further down -- confirm before trimming it to a smaller package.
nltk.download("book")

In [0]:
# Special tokens used throughout the notebook:
#   unknown_token      - stands in for every word outside the vocabulary
#   title_start_token  - marker placed in front of every title
#   title_end_token    - marker placed after every title
unknown_token = "UNKNOWN_TOKEN"
title_start_token = "TITLE_START"
title_end_token = "TITLE_END"

In [0]:
# Read the whole titles file into memory; it is later split line by line into
# individual titles. The encoding is spelled out so that decoding does not
# depend on the default locale of the machine running the notebook.
with open('data/ieee-tnnls-titles.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [0]:
# Test what I've read
#print(text)

## Explore the data

In [30]:
# Summary statistics of the raw text: distinct whitespace-separated words,
# number of lines (one title per line) and mean words per title.
print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))
titles = text.splitlines()
print('Number of titles: {}'.format(len(titles)))

word_count_sentence = list(map(len, map(str.split, titles)))
print('Average number of words in each title: {}'.format(np.average(word_count_sentence)))

Dataset Stats
Roughly the number of unique words: 2705
Number of titles: 1207
Average number of words in each title: 10.198011599005799


In [0]:
# Tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
# Lower-case every title and split it into sentences, flattening the
# per-title results into one sequence. The result is materialized as a list
# on purpose: a bare itertools.chain object is a one-shot iterator, so it
# would be empty the second time a later cell is executed and that cell would
# silently process zero sentences.
sentences = list(itertools.chain.from_iterable(nltk.sent_tokenize(x.lower()) for x in titles))

In [32]:
# Surround every sentence with the TITLE_START / TITLE_END marker tokens
tokenized_titles = [" ".join((title_start_token, x, title_end_token)) for x in sentences]
print("Parsed %d sentences." % len(tokenized_titles))


Parsed 1207 sentences.


In [0]:
# Break each marker-wrapped title string up into a list of word tokens
tokenized_titles = list(map(nltk.word_tokenize, tokenized_titles))

In [0]:
# Keep only purely alphabetic tokens. The two marker tokens are whitelisted
# explicitly because they contain an underscore and would fail str.isalpha().
final_title = [
    [token for token in title
     if token in (title_start_token, title_end_token) or token.isalpha()]
    for title in tokenized_titles
]

tokenized_titles = final_title

In [0]:
# Test the titles
#print(tokenized_titles)

In [36]:
# Frequency table over every token of every title
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_titles))
print("Found %d unique words tokens." % len(word_freq))

Found 2073 unique words tokens.


In [37]:
# Keep the (vocabulary_size - 1) most frequent tokens, reserve the last index
# for the unknown-word token, and build the lookups in both directions.
vocabulary_size = 250
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [word for word, _count in vocab] + [unknown_token]
word_to_index = {word: index for index, word in enumerate(index_to_word)}

print("Using vocabulary size %d." % vocabulary_size)
# vocab[-1] is a (word, count) pair, which fills both placeholders at once.
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % vocab[-1])

Using vocabulary size 250.
The least frequent word in our vocabulary is 'stable' and appeared 7 times.


In [38]:
# What does the vocabulary look like? Shows the (token, count) pairs kept in
# `vocab`.
vocab

[('TITLE_START', 1207),
 ('TITLE_END', 1207),
 ('for', 553),
 ('of', 400),
 ('with', 318),
 ('and', 286),
 ('neural', 284),
 ('learning', 267),
 ('networks', 253),
 ('a', 242),
 ('control', 186),
 ('systems', 145),
 ('in', 129),
 ('adaptive', 123),
 ('on', 118),
 ('network', 112),
 ('nonlinear', 104),
 ('the', 102),
 ('using', 98),
 ('via', 87),
 ('to', 82),
 ('data', 78),
 ('analysis', 71),
 ('deep', 70),
 ('based', 68),
 ('classification', 64),
 ('approach', 57),
 ('feature', 54),
 ('by', 52),
 ('synchronization', 51),
 ('an', 51),
 ('clustering', 50),
 ('dynamic', 49),
 ('robust', 49),
 ('image', 45),
 ('method', 42),
 ('stochastic', 42),
 ('algorithm', 42),
 ('model', 42),
 ('tracking', 41),
 ('stability', 41),
 ('online', 41),
 ('multiple', 38),
 ('estimation', 38),
 ('sparse', 37),
 ('selection', 37),
 ('optimization', 36),
 ('optimal', 36),
 ('recurrent', 35),
 ('design', 34),
 ('delays', 34),
 ('regression', 33),
 ('matrix', 32),
 ('distributed', 31),
 ('reinforcement', 29),
 (

In [39]:
# Swap every out-of-vocabulary word for the unknown token; the existing list
# object is updated in place.
tokenized_titles[:] = [
    [w if w in word_to_index else unknown_token for w in sent]
    for sent in tokenized_titles
]

print("\nExample sentence: '%s'" % titles[100])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_titles[100])


Example sentence: 'Plume Tracing via Model-Free Reinforcement Learning Method'

Example sentence after Pre-processing: '['TITLE_START', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'via', 'reinforcement', 'learning', 'method', 'TITLE_END']'


In [40]:
# Just an example of how to make a training dataset without a generator.
# The input of each title is the title without its last token; the target is
# the same title without its first token, i.e. the next word at every position.
# Titles differ in length, so the arrays are built with dtype=object: recent
# NumPy releases raise an error instead of implicitly creating a ragged array.
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_titles], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_titles], dtype=object)
print(X_train[0])
print(y_train[0])
# Print a training data example
x_example, y_example = X_train[0], y_train[0]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

[0, 104, 55, 4, 23, 7, 9, 249]
[104, 55, 4, 23, 7, 9, 249, 1]
x:
TITLE_START object detection with deep learning a UNKNOWN_TOKEN
[0, 104, 55, 4, 23, 7, 9, 249]

y:
object detection with deep learning a UNKNOWN_TOKEN TITLE_END
[104, 55, 4, 23, 7, 9, 249, 1]


In [0]:
class KerasBatchGenerator(object):
  """Endless generator of (input, one-hot target) batches over a token stream.

  `data` is a flat sequence of integer word indices. Every sample is a window
  of `num_steps` consecutive indices; its target is the same window shifted
  one position to the right, one-hot encoded over `vocabulary` classes.

  Args:
    data: sequence of integer word indices (must support len() and slicing).
    num_steps: number of time steps per sample.
    batch_size: number of samples per yielded batch.
    vocabulary: number of classes used for the one-hot targets.
    skip_step: how far the read position advances after each sample.
    end_token_index: index of the end-of-title token. Windows are not started
      on this token. When None, it is looked up at generation time as
      word_to_index[title_end_token] from the notebook's globals.

  Raises:
    ValueError: if `data` is too short to hold one window plus its target.
  """

  def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5,
               end_token_index=None):
    # A window needs num_steps inputs plus one extra token for the last target.
    if len(data) <= num_steps:
      raise ValueError(
          "data must contain more than num_steps (%d) tokens, got %d"
          % (num_steps, len(data)))
    self.data = data
    self.num_steps = num_steps
    self.batch_size = batch_size
    self.vocabulary = vocabulary
    # this will track the progress of the batches sequentially through the
    # data set - once the data reaches the end of the data set it will reset
    # back to zero
    self.current_idx = 0
    # skip_step is the number of words which will be skipped before the next
    # batch is skimmed from the data set
    self.skip_step = skip_step
    self.end_token_index = end_token_index

  def generate(self):
    """Yield (x, y) batches forever.

    x has shape (batch_size, num_steps) and holds word indices; y has shape
    (batch_size, num_steps, vocabulary) and holds the one-hot next words.
    Every yielded batch is a freshly allocated pair of arrays, so batches that
    are queued or kept by the consumer are never overwritten afterwards.
    """
    end_idx = self.end_token_index
    if end_idx is None:
      end_idx = word_to_index[title_end_token]
    step_positions = np.arange(self.num_steps)
    while True:
      # Allocate per batch: reusing one buffer would mutate batches that were
      # already handed out.
      x = np.zeros((self.batch_size, self.num_steps))
      y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
      for i in range(self.batch_size):
        # I don't want to see in x a title end token to predict y
        if self.current_idx < len(self.data) and self.data[self.current_idx] == end_idx:
          self.current_idx += self.skip_step
        if self.current_idx + self.num_steps >= len(self.data):
          # reset the index back to the start of the data set
          self.current_idx = 0
        x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
        temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
        # one-hot encode the targets directly with NumPy indexing
        y[i, step_positions, np.asarray(temp_y, dtype=int)] = 1.0
        self.current_idx += self.skip_step
      yield x, y


In [0]:
num_steps = 1
skip_step = 1
batch_size = 10

# set seeds for reproducibility
# NOTE(review): `set_random_seed` is imported from the top-level tensorflow
# package, which ties this cell to the TensorFlow 1.x API.
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(234)

# Create the training data
# A concatenation of all tokens as integers (indices). It is built as a plain
# flat list; wrapping the ragged per-title lists in np.asarray first is not
# needed and fails on recent NumPy releases.
X = [word_to_index[w] for sent in tokenized_titles for w in sent]
# Create 2 batch generators out of the concatenation. The split is contiguous:
# the first `train_size` tokens are used for training and everything from
# index `train_size` onwards for validation, so no token is dropped in between.
train_size = 10000
train_data_generator = KerasBatchGenerator(X[:train_size], num_steps, batch_size, vocabulary_size, skip_step)
valid_data_generator = KerasBatchGenerator(X[train_size:], num_steps, batch_size, vocabulary_size, skip_step)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

hidden_size = 250

# Embedding -> two stacked LSTMs (each returning the full sequence) -> dropout
# -> a Dense layer applied at every time step -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocabulary_size, hidden_size, input_length=num_steps),
    LSTM(hidden_size, return_sequences=True),
    LSTM(hidden_size, return_sequences=True),
    Dropout(rate=0.5),
    TimeDistributed(Dense(vocabulary_size)),
    Activation('softmax'),
])

In [0]:
# Configure training: Adam optimizer with categorical cross-entropy loss, and
# categorical accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [45]:
num_epochs = 10

# The generators never terminate, so the number of batches per epoch is given
# explicitly. The step counts are derived from the token streams the two
# generators were constructed with.
model.fit_generator(
    generator=train_data_generator.generate(),
    steps_per_epoch=len(train_data_generator.data) // (batch_size * num_steps),
    epochs=num_epochs,
    validation_data=valid_data_generator.generate(),
    validation_steps=len(valid_data_generator.data) // (batch_size * num_steps),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb9ab639470>

In [46]:
def generate_title(model):
    """Sample one title from the model, word by word.

    Generation starts from the title-start token and repeatedly feeds the last
    sampled word back into the model until the title-end token is drawn. The
    unknown-word token is never sampled. Uses the notebook-level
    `word_to_index`, `index_to_word` and the three special token names.

    Args:
        model: object whose `predict(x)`, called with an array of shape (1, 1)
            holding one word index, returns an array where `[0][0]` is the
            vector of next-word probabilities over the vocabulary.

    Returns:
        List of words of the generated title, without the start/end markers.

    Raises:
        ValueError: if the model puts no probability mass on any word other
            than the unknown token, so nothing can be sampled.
    """
    start_index = word_to_index[title_start_token]
    end_index = word_to_index[title_end_token]
    unknown_index = word_to_index[unknown_token]

    # We start the sentence with the start token
    new_title = [start_index]
    # Repeat until we get an end token
    while new_title[-1] != end_index:
        x = np.zeros((1, 1))
        x[0, :] = new_title[-1]
        # Work on a float64 copy: np.random.multinomial validates the sum of
        # the probabilities in double precision, which lower-precision model
        # output can violate through rounding.
        next_word_probs = np.array(model.predict(x)[0][0], dtype=np.float64)
        # We don't want to sample unknown words: remove their mass and
        # renormalize instead of redrawing until another word comes up.
        next_word_probs[unknown_index] = 0.0
        total = next_word_probs.sum()
        if not total > 0.0:
            raise ValueError("model assigns no probability to any known word")
        next_word_probs /= total
        samples = np.random.multinomial(1, next_word_probs)
        new_title.append(int(np.argmax(samples)))
    return [index_to_word[i] for i in new_title[1:-1]]

num_sentences = 30
senten_min_length = 7
senten_max_length = 15

# Print `num_sentences` generated titles, redrawing any title whose word count
# falls outside [senten_min_length, senten_max_length] -- we want long
# sentences, not sentences with one or two words.
for i in range(num_sentences):
    sent = generate_title(model)
    while not (senten_min_length <= len(sent) <= senten_max_length):
        sent = generate_title(model)
    print(" ".join(sent))

a flexible neural networks with deep output regulation for support vector optimization
output regulation of linear systems using approximate dynamic programming data
dynamic regularized constrained on adaptive neural networks
a unified embedding of multiple delays for efficient regularization using a switching of hierarchical problem
a exponential synchronization of the the learning based on bayesian learning
feature selection by an classification with domain selection for linear optimization for the an synchronization
new unified and based on communication constraints for the an deep learning control
a recurrent neural networks with input delay
neural networks approximation data with switching methods
a weighted linear systems through neural data
a novel boosting based on neural networks for networked uncertain classification
nonlinear systems on uncertain systems based on neural networks
face classification using neural networks with uncertain systems
multiview optimal nonlinear syst