[View in Colaboratory](https://colab.research.google.com/github/hamil168/Chatbots/blob/master/Seq2Seq.ipynb)

In [1]:
# For a fresh Colab instance, clone fresh:
!pip install -q xlrd
!git clone https://github.com/hamil168/Chatbots

fatal: destination path 'Chatbots' already exists and is not an empty directory.


In [1]:
# Change to Colab directory:
cd Chatbots/

/content/Chatbots


In [2]:
# For an existing Colab instance, pull from master, uncomment this:

!git pull https://github.com/hamil168/Chatbots master

From https://github.com/hamil168/Chatbots
 * branch            master     -> FETCH_HEAD
Already up-to-date.


In [3]:
# Files as they appear in the repo clone
ls

Cornell Movie Script Database EDA.ipynb  Preproc.ipynb  README.md
movie_conversations.txt                  preproc.py     Seq2Seq.ipynb
movie_lines.txt                          [0m[01;34m__pycache__[0m/


In [0]:
import numpy as np
import tensorflow as tf
import time
import re

In [0]:
#!python preproc.py
from preproc import *

In [0]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [0]:
id2l, cid, questions, answers, clean_questions, clean_answers, word2count, sorted_clean_questions, sorted_clean_answers = preproc_steps(lines,conversations)

questionswords2int, answerswords2int = map_questions_and_answers_to_integers(w2c)

In [51]:
conversations[0:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [52]:
get_conversations_ids(conversations[0:5])

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206']]

In [53]:
print(questions[0:3])
print(clean_questions[0:3])
print(sorted_clean_questions[0:3])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']
['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again', 'well i thought we would start with pronunciation if that is okay with you', 'not the hacking and gagging and spitting part  please']
[[48], [63], [124]]


In [0]:
# Create placeholder for inputs and the targets
# in TF, all variables are tensors
# need to go from NP --> TF tensors
# need placeholders for every TF variables inputs and targets

def model_inputs():
  #inputs and targets are 2D matrices
  inputs = tf.placeholder(tf.int32, [None, None], name = 'inputs') 
  targets = tf.placeholder(tf.int32, [None, None], name = 'targets')
  keep_prob = tf.placeholder(tf.float32, name = 'dropout_rate') #dropout
  
  lr = tf.placeholder(tf.float32, name = 'learning_rate')
  
  encoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='encoder_seq_len')
  decoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='decoder_seq_len')
  max_sequence_length = tf.reduce_max(decoder_seq_len, name='max_seq_len')
  
  return inputs, targets, keep_prob, encoder_sequence_length, decoder_sequence_length, max_sequence_length

In [0]:
# Create encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
  
  
  # LSTM cell class
  # rnn_size: number of input tensors
  # sequence_length: length of each question in the atch
  
  
  def cell(units, rate):
    layer = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    return tf.contrib.rnn.DropoutWrapper(layer, rate)

  encoder_cell_fw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])
  encoder_cell_bw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])

  
  # bidirection rnn function (creates dynamic bidirectional network)
  # builds independent forward and backward rnn
  # need ot make sure the ends match
  # (first element is encoder_output)
  encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell_fw,
                                                   cell_bw = encoder_cell_bw,
                                                   inputs = rnn_inputs,
                                                   sequence_length = sequence_length,
                                                   dtype = tf.float32)
  
  return encoder_outputs, encoder_states


In [0]:
# preprocessing the targets
# need batches, 
# need each to start with <SOS> token

def preprocess_decoder_inputs(targets, word2int_dict, batch_size):
  """

  Prepares the decoder inputs (i.e. the 'targets') for use
     
     Inputs: 
        targets: the input for the decoder for training.
        word2int_dict: one of the dictionaries used to map a word to its integer
        batch_size: size of each batch for model training
        
     Outputs:
        preprocessed_targets: the processed version of the decoder inputs
  
  """
  
  # Using <SOS> for "start of string", create a tensor with one per batch element
  left_side = tf.fill([batch_size, 1], word2int_dict['<SOS>'])
    
  # Take the targets and remove the last member of each sample (it is blank)
  right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
  
  # Add the <SOS> to the left side of every target phrase
  preprocessed_targets = tf.concat([left_side, right_side], 1)
  
  return preprocessed_targets

### Attention
- (warning for later, when I add Beam Search) **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
`AttentionWrapper`
- will also need to return here with DeviceWrapper for multiple GPUs

In [0]:
def decoder(encoder_state, decoder_cell, decoder_inputs, 
            vocabulary_size, decoder_sequence_length, max_sequence_length,
            word2id_dict, batch_size, keep_prob):  #dropout?
  
  
  embedding_layer = tf.Variable(tf.random_uniform([vocabulary_size, decoder_sequence_length]))
  embeddings = tf.nn.embedding_lookup(embedding_layer, decoder_inputs)
  
  output_layer = Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(0.0, 0.1))
    
  with tf.variable_scope('decoder'):
  
    # Get attention states
    # shaped [batch_size, memory_max_time, memory_depth]
    #attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])

    # linear attention
    #attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units = decoder_cell.output_size, 
                                                               #memory = attention_states

    # 256 look up.
    #attention_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism)

    train_helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs, sequence_length = decoder_sequence_length)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(cell = decoder_cell,
                                             helper = train_helper,
                                             initial_state = encoder_state[0])


    # returns (final_outputs, final_state, final_sequence_lengths)
    train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder = train_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    ###########################
    #decoder_output_dropout is handled in a attention wrapper function outside of this functinon                                                                
                                                   
  with tf.variable_scope('decoder', reuse=True):
  
    starting_id_vector = tf.tile(tf.constant([word_to_id['<SOS>']], dtype=tf.int32), [batch_size], name = 'starting_id_vector')                                               
                                                   
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_layer, 
                                                            starting_id_vector,
                                                           word2id_dict['<EOS>'])                                                   

    infer_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                                    helper = infer_helper,
                                                    initial_state = encoder_state,
                                                    output_layer=output_layer)


    # returns (final_outputs, final_state, final_sequence_lengths)
    infer_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(infer_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    
                                                   
  return train_decoder_output, infer_decoder_output
                                                   
                                                   
                                              
  

def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
  with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as decoding_scope:
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    
    weights = tf.truncated_normal_initializer(stddev = 0.1)
    biases = tf.zeros_initializer()
    
    # create FCL
    output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                 num_words,
                                                                 None,
                                                                 scope = decoding_scope,
                                                                 weights_initializer = weights,
                                                                 baises_initializer = biases)
    
    training_predictions = decode_training_set(encoder_state, decoder_cell,
                                              decoder_embedded_input,
                                              sequence_length,
                                              decoding_scope,
                                              output_function,
                                              keep_prob,
                                              batch_size)
    
    #decoding_scope.reuse_variables()
    test_predictions = decode_test_set(encoder_state,
                                      decoder_cell,
                                      decoder_embeddings_matrix,
                                      word2int['<SOS>'],
                                      word2int['<EOS>'],
                                      sequence_length - 1,  #exclude last token
                                      num_words,
                                      decoding_scope,
                                      sequence_length,
                                      output_function,
                                      keep_prob,
                                      batch_size)
    
    
  return training_predictions, test_predictions    
    

In [0]:

def attention_mechanism(rnn_size, keep_prob, encoder_outputs, encoder_states, encoder_sequence_length, batch_size):
  
  
  def cell(units, probs):
    layer = tf.contrib.rnn.BasicLSTMCell(units)
    return tf.contrib.rnn.DropoutWrapper(layer, probs)
  
  decoder_cell = cell(rnn_size, keep_prob)
  
  attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(rnn_size, 
                                                            encoder_outputs,
                                                            encoder_sequence_length)
  
  decoder_cell_wrapped = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,
                                                            attention_mechanism,
                                                            rnn_size / 2)
  
  attention_ought = decoder_cell_wrapped.zero_state(batch_size = batch_size, dtype = tf.float32)
  
  encoder_state_new = attention_ought.clone(cell_state = encoder_states[-1])
  
  return decoder_cell_wrapped, encoder_state_new
  


In [0]:
# Build the Model

def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words,
                 encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
  
  encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                           answers_num_words + 1,
                                                           encoder_embedding_size,
                                                           initializer = tf.random_uniform_initializer(0,1))
  
  encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
  
  ## check here for nonetype
  preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
  
  decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
  
  decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
  
  ## check here for nonetype
  training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                      decoder_embeddings_matrix,
                                                      encoder_state,
                                                      questions_num_words,
                                                      sequence_length,
                                                      rnn_size,
                                                      num_layers,
                                                      questionswords2int,
                                                      keep_prob,
                                                      batch_size)

In [0]:
# Next up: hyper parameters
epochs = 100
batch_size = 64  # make bigger to make faster
rnn_size = 512 
num_layers = 3
encoding_embedding_size = 512  # 512 col in embedding matrix
decoding_embedding_size = 512
learning_rate = 0.01
learning_rate_decay = 0.9
min_learning_rate = 0.0001
#keep_prob = 0.5
keep_probability = 0.5  # based on hinton paper '14'


In [0]:
# Here for convenience if I need it
session.close()

In [114]:

# define a TF session

# reset graphs


tf.reset_default_graph()
session = tf.InteractiveSession()

# load model inputs

inputs, targets, lr, keep_prob = model_inputs()

# sequence length
#sequence_length = tf.placeholder(tf.int32, shape=(batch_size,), name = 'sequence_length')

sequence_length = tf.placeholder_with_default(batch_size, (20,1), name = 'sequence_length')

print(tf.shape(sequence_length, name = 'sequence_length_shape'))  # error sayings sequence_length is supposed to be a vector.

# get shape of input tensor
input_shape = tf.shape(inputs, name = 'input_shape')
print(input_shape)                                   
print(tf.shape(tf.reverse(inputs,[-1])))



ValueError: ignored

In [113]:
# Getting training and test predictions
#CHECK THE REVERSE SEQUENCE PART LATER
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]), 
                                                       targets, keep_prob, batch_size, sequence_length, 
                                                       len(answerswords2int.items()), len(questionswords2int.items()), 
                                                       encoding_embedding_size, decoding_embedding_size, rnn_size,
                                                       num_layers, questionswords2int)

#reverse_sequence(inputs, sequence_length, seq_axis=-1, batch_axis = 0), 

TypeError: ignored

In [0]:
with tf.name_scope("optimization"):
  loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,
                                               targets,
                                               tf.ones([input_shape[0], sequence_length]))
  #Adam optimizer object
  optimizer = tf.train.AdamOptimizer(learning_rate)
  gradients = optimizer.compute_gradients(loss_error)
  # clip the gradients
  clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
  optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

NameError: ignored

In [0]:
# Apply padding to the sequences so the question and answer sequences are the same

def apply_padding(batch_of_sequences, word2int):
  max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
  return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

                                            

In [0]:
# Generator to split data itno batches for batch gradient descent                                            
def split_into_batches(questions, answers, batch_size):
  for batch_index in range(0,len(questions) // batch_size):
      start_index = batch_index * batch_size
                                            
      questions_in_batch = questions[start_index : start_index + batch_size]
      answers_in_batch = answers[start_index : start_index + batch_size]
                                            
      padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
      padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
      
      yield padded_questions_in_batch, padded_answers_in_batch

In [0]:
# Split data into training and validation sets
# needs more sophistication later.
# scq/sca mean sorted clean questions / answers from the preprocessign step.

training_validation_split = int(len(scq) * 0.2)

train_Q = scq[training_validation_split:]
train_A = sca[training_validation_split:]

val_Q = scq[:training_validation_split]
val_A = sca[:training_validation_split]




In [0]:
#TRAINING STEPS

batch_index_check_training_loss = 100
batch_index_check_validation_loss = ((len(train_Q)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 1000

checkpoint = "./chatbot_weights.ckpt"

In [0]:
session.run(tf.global_variables_initializer())

In [0]:
for epoch in range(1, epochs +1):
  for batch_index, (padded_questions_in_batch, 
                    padded_answers_in_batch) in enumerate (split_into_batches(train_Q, 
                                                                              train_A, batch_size)):
    starting_time = time.time()
    
    # run the current epoch using the parameters thus far
    _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error],
                                              {inputs: padded_questions_in_batch,
                                              targets: padded_answers_in_batch,
                                              lr: learning_rate,
                                              sequence-length: padded_answers_in_batch.shape[1],
                                              keep_prob: keep_probability})
    
    # add the existing batch loss to the totall loss
    total_training_loss_erroor += batch_training_loss_error
    
    # calculate epoch run time
    ending_time = time.time()
    batch_time = ending_time - starting_time
    
    if batch_index % batch_index_check_training_loss == 0:
      print('Epoch: {:>3}/{}, Batch: {:>4}/{}, ' + 
            'Training Loss Error: {:>6.3f}, ' + 
            'Training Time on 100 Batches: {:d} seconds'.format(epoch, epochs,
                                                                batch_index, len(train_Q),
                                                                total_training_loss_error / batch_index_check_training_loss,
                                                                int(batch_time * batch_index_check_training_loss)))
                                                                                                                                 
      total_training_loss_error = 0                                                                                                  
    
    

NameError: ignored