## Building the Model

In [30]:
def model_inputs():
    '''Create placeholders for inputs to the model'''
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    #summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    #max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, text_length

In [31]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [32]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    '''Create the encoding layer'''
    
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob = keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
    # Join outputs since we are using a bidirectional RNN
    enc_output = tf.concat(enc_output,2)
    
    return enc_output, enc_state

In [224]:
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_summary_length):
    '''Create the training logits'''
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 

    training_logits, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_logits

In [225]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    '''Create the inference logits'''
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
                
    inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    return inference_logits

In [226]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                     input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(dec_cell,
                                                          attn_mech,
                                                          rnn_size)
            
    initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(enc_state[0],
                                                                    _zero_state_tensors(rnn_size, 
                                                                                        batch_size, 
                                                                                        tf.float32)) 
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)

    return training_logits, inference_logits

In [33]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, vocab_size, rnn_size, num_layers, 
                  vocab_to_int, batch_size):
    '''Use the previous functions to create the training and inference logits'''
    
    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    
#     dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
#     dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
#     training_logits, inference_logits  = decoding_layer(dec_embed_input, 
#                                                         embeddings,
#                                                         enc_output,
#                                                         enc_state, 
#                                                         vocab_size, 
#                                                         text_length,
#                                                         rnn_size, 
#                                                         vocab_to_int, 
#                                                         keep_prob, 
#                                                         batch_size,
#                                                         num_layers)
    
    return enc_output

In [34]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [35]:
def get_batches(texts, batch_size):
    """Batch summaries, texts, and the lengths of their sentences together"""
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        texts_batch = texts[start_i:start_i + batch_size]
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_texts_batch, pad_texts_lengths

In [36]:
# Set the Hyperparameters
epochs = 10
batch_size = 100
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

In [37]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, text_length = model_inputs()

    # Create the training and inference logits
#     training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
#                                                       targets, 
#                                                       keep_prob,   
#                                                       text_length,
#                                                       len(vocab_to_int)+1,
#                                                       rnn_size, 
#                                                       num_layers, 
#                                                       vocab_to_int,
#                                                       batch_size)
    encoding_output = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size)
    
#     # Create tensors for the training logits and inference logits
#     training_logits = tf.identity(training_logits.rnn_output, 'logits')
#     inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
#     # Create the weights for sequence_loss
#     #masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

#     with tf.name_scope("optimization"):
#         # Loss function
#         cost = tf.contrib.seq2seq.sequence_loss(
#             training_logits,
#             targets,
#             masks)

#         # Optimizer
#         optimizer = tf.train.AdamOptimizer(learning_rate)

#         # Gradient Clipping
#         gradients = optimizer.compute_gradients(cost)
#         capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
#         train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.

  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))


Graph is built.


  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))


## Training the Model

Since I am training this model on my MacBook Pro, it would take me days if I used the whole dataset. For this reason, I am only going to use a subset of the data, so that I can train it over night. Normally I use [FloydHub's](https://www.floydhub.com/) services for my GPU needs, but it would take quite a bit of time to upload the dataset and ConceptNet Numberbatch, so I'm not going to bother with that for this project.

I chose not use use the start of the subset because I didn't want to make it too easy for my model. The texts that I am using are closer to the median lengths; I thought this would be more fair.

In [38]:
# Subset the data for training
start = 200000
end = start + 50000
#sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]
print("The shortest text length:", len(sorted_texts_short[0]))
print("The longest text length:",len(sorted_texts_short[-1]))

The shortest text length: 112
The longest text length: 29


In [51]:
with tf.Session() as sess:
    sess.run(input_data.shape)

RuntimeError: The Session graph is empty.  Add operations to the graph before calling run().

In [39]:
outp = np.random.random(64)

In [40]:
outp.reshape((64,1)).shape

(64, 1)

In [43]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

checkpoint = "best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(2):
        for batch_i, (texts_batch, texts_lengths) in enumerate(
                get_batches(sorted_texts_short, 1000)):
            start_time = time.time()
            enc_out = sess.run(
                encoding_output,
                {input_data: texts_batch,
                 targets: np.random.random(len(texts_batch)).reshape((len(texts_batch),1)),
                 lr: learning_rate,
                 #summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            end_time = time.time()
        print("epoch number %d" %(epoch_i))

#             if batch_i % display_step == 0 and batch_i > 0:
#                 print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
#                       .format(epoch_i,
#                               epochs, 
#                               batch_i, 
#                               len(sorted_texts_short) // batch_size, 
#                               batch_loss / display_step, 
#                               batch_time*display_step))
#                 batch_loss = 0

#             if batch_i % update_check == 0 and batch_i > 0:
#                 print("Average loss for this update:", round(update_loss/update_check,3))
#                 summary_update_loss.append(update_loss)
                
#                 # If the update loss is at a new minimum, save the model
#                 if update_loss <= min(summary_update_loss):
#                     print('New Record!') 
#                     stop_early = 0
#                     saver = tf.train.Saver() 
#                     saver.save(sess, checkpoint)

#                 else:
#                     print("No Improvement.")
#                     stop_early += 1
#                     if stop_early == stop:
#                         break
#                 update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
#         learning_rate *= learning_rate_decay
#         if learning_rate < min_learning_rate:
#             learning_rate = min_learning_rate
        
#         if stop_early == stop:
#             print("Stopping Training.")
#             break

epoch number 0
epoch number 1


In [52]:
enc_out.shape

(1000, 148, 512)

In [158]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

checkpoint = "best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    #loader = tf.train.import_meta_graph("./" + checkpoint + '.meta')
    #loader.restore(sess, checkpoint)
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (texts_batch, texts_lengths) in enumerate(
                get_batches(sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: np.random.random(len(texts_batch)),
                 lr: learning_rate,
                 #summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

#             if batch_i % update_check == 0 and batch_i > 0:
#                 print("Average loss for this update:", round(update_loss/update_check,3))
#                 summary_update_loss.append(update_loss)
                
#                 # If the update loss is at a new minimum, save the model
#                 if update_loss <= min(summary_update_loss):
#                     print('New Record!') 
#                     stop_early = 0
#                     saver = tf.train.Saver() 
#                     saver.save(sess, checkpoint)

#                 else:
#                     print("No Improvement.")
#                     stop_early += 1
#                     if stop_early == stop:
#                         break
#                 update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/100 Batch   20/781 - Loss:  4.470, Seconds: 156.00
Epoch   1/100 Batch   40/781 - Loss:  2.863, Seconds: 105.20
Epoch   1/100 Batch   60/781 - Loss:  2.652, Seconds: 151.58
Epoch   1/100 Batch   80/781 - Loss:  2.736, Seconds: 117.19
Epoch   1/100 Batch  100/781 - Loss:  2.686, Seconds: 118.42
Epoch   1/100 Batch  120/781 - Loss:  2.423, Seconds: 140.21
Epoch   1/100 Batch  140/781 - Loss:  2.696, Seconds: 152.89
Epoch   1/100 Batch  160/781 - Loss:  2.606, Seconds: 128.19
Epoch   1/100 Batch  180/781 - Loss:  2.525, Seconds: 151.52
Epoch   1/100 Batch  200/781 - Loss:  2.597, Seconds: 140.84
Epoch   1/100 Batch  220/781 - Loss:  2.515, Seconds: 130.87
Epoch   1/100 Batch  240/781 - Loss:  2.402, Seconds: 131.02
Average loss for this update: 2.734
New Record!
Epoch   1/100 Batch  260/781 - Loss:  2.382, Seconds: 106.18
Epoch   1/100 Batch  280/781 - Loss:  2.354, Seconds: 124.90
Epoch   1/100 Batch  300/781 - Loss:  2.306, Seconds: 148.73
Epoch   1/100 Batch  320/781 - Loss: 

KeyboardInterrupt: 

## Making Our Own Summaries

To see the quality of the summaries that this model can generate, you can either create your own review, or use a review from the dataset. You can set the length of the summary to a fixed value, or use a random value like I have here.

In [114]:
def text_to_seq(text):
    '''Prepare the text for the model'''
    
    text = clean_text(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]

In [167]:
# Create your own review or use one from the dataset
#input_sentence = "I have never eaten an apple before, but this red one was nice. \
                  #I think that I will try a green apple next time."
#text = text_to_seq(input_sentence)
random = np.random.randint(0,len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])

checkpoint = "./best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 

# Remove the padding from the tweet
pad = vocab_to_int["<PAD>"] 

print('Original Text:', input_sentence)

print('\nText')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))

print('\nSummary')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./best_model.ckpt
Original Text: love individual oatmeal cups found years ago sam quit selling sound big lots quit selling found target expensive buy individually trilled get entire case time go anywhere need water microwave spoon know quaker flavor packets

Text
  Word Ids:    [70595, 18808, 668, 45565, 51927, 51759, 32488, 13510, 32036, 59599, 11693, 444, 23335, 32036, 59599, 51927, 67316, 726, 24842, 50494, 48492, 1062, 44749, 38443, 42344, 67973, 14168, 7759, 5347, 29528, 58763, 18927, 17701, 20232, 47328]
  Input Words: love individual oatmeal cups found years ago sam quit selling sound big lots quit selling found target expensive buy individually trilled get entire case time go anywhere need water microwave spoon know quaker flavor packets

Summary
  Word Ids:       [70595, 28738]
  Response Words: love it


Examples of reviews and summaries:
- Review(1): The coffee tasted great and was at such a good price! I highly recommend this to everyone!
- Summary(1): great coffee


- Review(2): This is the worst cheese that I have ever bought! I will never buy it again and I hope you won't either!
- Summary(2): omg gross gross


- Review(3): love individual oatmeal cups found years ago sam quit selling sound big lots quit selling found target expensive buy individually trilled get entire case time go anywhere need water microwave spoon know quaker flavor packets
- Summary(3): love it

## Summary

I hope that you found this project to be rather interesting and informative. One of my main recommendations for working with this dataset and model is either use a GPU, a subset of the dataset, or plenty of time to train your model. As you might be able to expect, the model will not be able to make good predictions just by seeing many reviews, it needs so see the reviews many times to be able to understand the relationship between words and between descriptions & summaries. 

In short, I'm pleased with how well this model performs. After creating numerous reviews and checking those from the dataset, I can happily say that most of the generated summaries are appropriate, some of them are great, and some of them make mistakes. I'll try to improve this model and if it gets better, I'll update my GitHub.

Thanks for reading!