[View in Colaboratory](https://colab.research.google.com/github/hamil168/Chatbots/blob/master/Seq2Seq.ipynb)

In [0]:
# For a fresh Colab instance, clone fresh:
!pip install -q xlrd
!git clone https://github.com/hamil168/Chatbots

Cloning into 'Chatbots'...
remote: Counting objects: 114, done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 114 (delta 62), reused 35 (delta 12), pack-reused 0[K
Receiving objects: 100% (114/114), 9.53 MiB | 13.35 MiB/s, done.
Resolving deltas: 100% (62/62), done.


In [1]:
# Change to Colab directory:
cd Chatbots/

/content/Chatbots


In [2]:
# For an existing Colab instance, pull from master, uncomment this:

!git pull https://github.com/hamil168/Chatbots master

From https://github.com/hamil168/Chatbots
 * branch            master     -> FETCH_HEAD
Already up-to-date.


In [3]:
# Files as they appear in the repo clone
ls


Cornell Movie Script Database EDA.ipynb  Preproc.ipynb  README.md
movie_conversations.txt                  preproc.py     Seq2Seq.ipynb
movie_lines.txt                          [0m[01;34m__pycache__[0m/


In [4]:
!pip install tqdm



In [0]:
import numpy as np
import tensorflow as tf
import time
import re
from tensorflow.python.layers.core import Dense
from tqdm import tqdm

global graph1, model

graph1 = tf.get_default_graph()

In [0]:
#!python preproc.py
from preproc import *

In [0]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [0]:
id2l, cid, questions, answers, clean_questions, clean_answers, word2count, sorted_clean_questions, sorted_clean_answers = preproc_steps(lines,conversations)

questionswords2int, answerswords2int = map_questions_and_answers_to_integers(word2count)

In [33]:
conversations[0:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [10]:
get_conversations_ids(conversations[0:5])

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206']]

In [11]:
print(questions[0:3])
print(clean_questions[0:3])
print(sorted_clean_questions[0:3])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.']
['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again', 'well i thought we would start with pronunciation if that is okay with you', 'not the hacking and gagging and spitting part  please']
[[48], [63], [124]]


In [0]:
# Create placeholder for inputs and the targets
# in TF, all variables are tensors
# need to go from NP --> TF tensors
# need placeholders for every TF variables inputs and targets

def model_inputs():
  #inputs and targets are 2D matrices
  inputs = tf.placeholder(tf.int32, [None, None], name = 'inputs') 
  targets = tf.placeholder(tf.int32, [None, None], name = 'targets')
  keep_prob = tf.placeholder(tf.float32, name = 'dropout_rate') #dropout
  
  #lr = tf.placeholder(tf.float32, name = 'learning_rate')
  
  encoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='encoder_sequence_length')
  decoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='decoder_sequence_length')
  max_sequence_length = tf.reduce_max(decoder_sequence_length, name='max_sequence_length')
  
  return inputs, targets, keep_prob, encoder_sequence_length, decoder_sequence_length, max_sequence_length

In [0]:
# Create encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, 
                encoder_sequence_length, keep_prob, encoder_embedding_size, encoder_word_count):
  
  
  # LSTM cell class
  # rnn_size: number of input tensors
  # sequence_length: length of each question in the atch
  
  
  def cell(units, rate):
    layer = tf.contrib.rnn.BasicLSTMCell(units)
    return tf.contrib.rnn.DropoutWrapper(layer, rate)

  encoder_cell_fw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])
  #encoder_cell_bw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])

  encoder_embeddings = tf.contrib.layers.embed_sequence(rnn_inputs, encoder_word_count, encoder_embedding_size)
  
  # bidirection rnn function (creates dynamic bidirectional network)
  # builds independent forward and backward rnn
  # need ot make sure the ends match
  # (first element is encoder_output)
  #encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(encoder_cell_fw,
  encoder_outputs, encoder_states = tf.nn.dynamic_rnn(encoder_cell_fw,                                                                    
                                                   #cell_bw = encoder_cell_bw,
                                                   inputs = encoder_embeddings,
                                                   sequence_length = encoder_sequence_length,
                                                   dtype = tf.float32)
  
  return encoder_outputs, encoder_states


In [0]:
# preprocessing the targets
# need batches, 
# need each to start with <SOS> token

def preprocess_decoder_inputs(targets, word2int_dict, batch_size):
  """

  Prepares the decoder inputs (i.e. the 'targets') for use
     
     Inputs: 
        targets: the input for the decoder for training.
        word2int_dict: one of the dictionaries used to map a word to its integer
        batch_size: size of each batch for model training
        
     Outputs:
        preprocessed_targets: the processed version of the decoder inputs
  
  """
  
  # Using <SOS> for "start of string", create a tensor with one per batch element
  left_side = tf.fill([batch_size, 1], word2int_dict['<SOS>'])
    
  # Take the targets and remove the last member of each sample (it is blank)
  right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
  
  # Add the <SOS> to the left side of every target phrase
  return tf.concat([left_side, right_side], 1)

### Attention
- (warning for later, when I add Beam Search) **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
`AttentionWrapper`
- will also need to return here with DeviceWrapper for multiple GPUs

In [0]:
def decoder(decoder_inputs, encoder_state, decoder_cell, decoder_embedding_size,
            vocabulary_size, decoder_sequence_length, max_sequence_length,
            word2id_dict, batch_size):
  
  
  embedding_layer = tf.Variable(tf.random_uniform([vocabulary_size, decoder_embedding_size]))
  embeddings = tf.nn.embedding_lookup(embedding_layer, decoder_inputs)
  
  output_layer = Dense(vocabulary_size, kernel_initializer=tf.truncated_normal_initializer(0.0, 0.1))
    
  with tf.variable_scope('decoder'):
  
    train_helper = tf.contrib.seq2seq.TrainingHelper(embeddings, sequence_length = decoder_sequence_length)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(cell = decoder_cell,
                                             helper = train_helper,
                                             initial_state = encoder_state, 
                                                    output_layer = output_layer)


    # returns (final_outputs, final_state, final_sequence_lengths)
    train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder = train_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    ###########################
    #decoder_output_dropout is handled in a attention wrapper function outside of this functinon                                                                
                                                   
  with tf.variable_scope('decoder', reuse=True):
  
    starting_id_vector = tf.tile(tf.constant([word2id_dict['<SOS>']], dtype=tf.int32), [batch_size], name = 'starting_id_vector')                                               
                                                   
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_layer, 
                                                            starting_id_vector,
                                                           word2id_dict['<EOS>'])                                                   

    infer_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                                    helper = infer_helper,
                                                    initial_state = encoder_state,
                                                    output_layer=output_layer)


    # returns (final_outputs, final_state, final_sequence_lengths)
    infer_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(infer_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    
                                                   
  return train_decoder_output, infer_decoder_output
                                                   
                                                   
                                              
  

In [0]:

def attention_mechanism(rnn_size, keep_prob, encoder_outputs, encoder_states, encoder_sequence_length, batch_size):
  
  
  def cell(units, probs):
    layer = tf.contrib.rnn.BasicLSTMCell(units)
    return tf.contrib.rnn.DropoutWrapper(layer, probs)
  
  decoder_cell = cell(rnn_size, keep_prob)
  
  attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(rnn_size, 
                                                            encoder_outputs,
                                                            encoder_sequence_length)
  
  decoder_cell_wrapped = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,
                                                            attention_mechanism,
                                                            rnn_size / 2)
  
  attention_ought = decoder_cell_wrapped.zero_state(batch_size = batch_size, dtype = tf.float32)
  
  encoder_state_new = attention_ought.clone(cell_state = encoder_states[-1])
  
  return decoder_cell_wrapped, encoder_state_new
  


In [0]:
def optimizer_loss(outputs, targets, decoder_sequence_length, max_sequence_length, learning_rate, clip_rate):
    '''
	
		Function used to define optimizer and loss function
		Inputs:
			outputs - outputs got from decoder part of the network
			targets - expected outputs/ labels
			dec_seq_len -
			max_seq_len - 
			learning_rate - small nubmer used to decrease value of gradients used to update our network
			clip_rate - tolerance boundries for clipping gradients
		Outputs:
			loss -
			trained_opt - optimizer with clipped gradients
    '''
    logits = tf.identity(outputs.rnn_output)
    
    mask_weights = tf.sequence_mask(decoder_sequence_length, max_sequence_length, dtype=tf.float32)
    
    with tf.variable_scope('opt_loss'):
        #using sequence_loss to optimize the seq2seq model
        loss = tf.contrib.seq2seq.sequence_loss(logits, 
                                                targets, 
                                                mask_weights)
        
        #Define optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        #Next 3 lines used to clip gradients {Prevent gradient explosion problem}
        gradients = tf.gradients(loss, tf.trainable_variables())
        clipped_grads, _ = tf.clip_by_global_norm(gradients, clip_rate)
        trained_opt = optimizer.apply_gradients(zip(clipped_grads, tf.trainable_variables()))
        
    return loss, trained_opt


In [0]:
class Seq2Seq_Model(object):
    
    def __init__(self, learning_rate, batch_size, encoder_embedded_size, decoder_embedded_size, rnn_size, 
                 number_of_layers, vocab_size, word2id_dict, clip_rate):
        
        #tf.reset_default_graph()
        
        self.inputs, self.targets, self.keep_prob, self.encoder_sequence_length, self.decoder_sequence_length, max_sequence_length = model_inputs()
        
        
        enc_outputs, enc_states = encoder_rnn(self.inputs, 
                                          rnn_size,
                                          number_of_layers, 
                                          self.encoder_sequence_length, 
                                          self.keep_prob, 
                                          encoder_embedded_size, 
                                          vocab_size)
        
        dec_inputs = preprocess_decoder_inputs(self.targets, 
                                                  word2id_dict, 
                                                  batch_size)
        
        
        decoder_cell, encoder_states_new = attention_mechanism(rnn_size, 
                                                          self.keep_prob, 
                                                          enc_outputs, 
                                                          enc_states, 
                                                          self.encoder_sequence_length, 
                                                          batch_size)
        
        train_outputs, inference_output = decoder(dec_inputs, 
                                                  encoder_states_new, 
                                                  decoder_cell,
                                                  decoder_embedded_size, 
                                                  vocab_size, 
                                                  self.decoder_sequence_length, 
                                                  max_sequence_length, 
                                                  word2id_dict, 
                                                  batch_size)
        
        self.predictions  = tf.identity(inference_output.sample_id, name='preds')
        
        self.loss, self.opt = optimizer_loss(train_outputs, 
                                       self.targets, 
                                       self.decoder_sequence_length, 
                                       max_sequence_length, 
                                       learning_rate, 
                                       clip_rate)

In [0]:
# Next up: hyper parameters
epochs = 5 #100
batch_size = 10  #64 make bigger to make faster
rnn_size = 64 # 512
num_layers = 2  #3
encoding_embedding_size = 64 #512  # 512 col in embedding matrix
decoding_embedding_size = 64 #512
learning_rate = 0.1 # 0.01
learning_rate_decay = 0.9
min_learning_rate = 0.01 #0.0001
keep_prob = 0.5
keep_probability = 0.5  # based on hinton paper '14'
clip= 5

In [0]:
tf.reset_default_graph()
graph1 = tf.get_default_graph()

with graph1.as_default():
  
  model = Seq2Seq_Model(learning_rate, batch_size, encoding_embedding_size, decoding_embedding_size,
                       rnn_size, num_layers, len(word2count), questionswords2int, clip)

In [0]:
session.close()

In [0]:
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

In [0]:
# Apply padding to the sequences so the question and answer sequences are the same

def apply_padding(batch_of_sequences, word2int):
  max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
  return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

                                            

In [0]:
# Fxn to split data itno batches for batch gradient descent                                            
def split_into_batches(questions, answers, batch_size):
  padded_questions_in_batch = []
  padded_answers_in_batch = []
  final_question_batches = []
  final_answer_batches = []
  
  for batch_index in range(0,len(questions) // batch_size):
      start_index = batch_index * batch_size
                                            
      questions_in_batch = questions[start_index : start_index + batch_size]
      answers_in_batch = answers[start_index : start_index + batch_size]
      
                         
      final_question_batches.append(questions_in_batch)
      final_answer_batches.append(answers_in_batch)
       
  return final_question_batches, final_answer_batches
   

In [0]:
# Split data into training and validation sets
# needs more sophistication later.
# scq/sca mean sorted clean questions / answers from the preprocessign step.

training_validation_split = int(len(sorted_clean_questions) * 0.2)

train_Q = sorted_clean_questions[training_validation_split:]
train_A = sorted_clean_answers[training_validation_split:]

#val_Q = apply_padding(sorted_clean_questions[:training_validation_split],questionswords2int)
#val_A = apply_padding(sorted_clean_answers[:training_validation_split],questionswords2int)

In [0]:
#PQIB, PAIB = split_into_batches(train_Q, train_A, batch_size) #apply_padding(train_Q,questionswords2int),apply_padding(train_A,questionswords2int), batch_size)

PQIB, PAIB = split_into_batches(apply_padding(train_Q,questionswords2int),apply_padding(train_A,questionswords2int), batch_size)


In [0]:
session.close()

In [0]:
#tf.reset_default_graph()
#graph = tf.get_default_graph()

In [0]:
session = tf.Session(graph = graph1)
#session.run(tf.global_variables_initializer())

In [84]:
session.run(tf.global_variables_initializer())
PRINT_ERROR = True

epochs = 3

epoch_accuracy = []
epoch_loss = []

for i in range(epochs):
  
  bucket_accuracy = []
  bucket_loss = []
  
  # for bucket index 
  for b_idx in tqdm(range(3)):#len(PQIB))):
  
    X_batch = np.asarray(PQIB[b_idx])
    y_batch = np.asarray(PAIB[b_idx])

    feed_dict = {model.inputs:X_batch, 
         model.targets:y_batch, 
         model.keep_prob:keep_prob, 
         model.decoder_sequence_length:[len(y_batch[0])]*batch_size,
         model.encoder_sequence_length:[len(X_batch[0])]*batch_size}
    
    cost, _, preds = session.run([model.loss, model.opt, model.predictions], feed_dict=feed_dict)

    bucket_loss.append(cost)
    bucket_accuracy.append(get_accuracy(y_batch, preds))

    if(PRINT_ERROR == True and b_idx%100 == 0):    
      print(" Bucket {}:".format(b_idx+1), 
          " | Loss: {}".format(np.mean(bucket_loss)), 
          " | Accuracy: {}".format(np.mean(bucket_accuracy)))

      
  epoch_loss.append(np.mean(bucket_loss))
  epoch_accuracy.append(np.mean(bucket_accuracy))
      
  if(PRINT_ERROR == True and i%100 == 0):
      print("EPOCH: {}/{}".format(i, epochs), 
          " | Epoch loss: {}".format(np.mean(epoch_loss)), 
          " | Epoch accuracy: {}".format(np.mean(epoch_accuracy)))

    
session.close()
    




  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:16<00:32, 16.30s/it][A[A[A

 Bucket 1:  | Loss: 11.147171974182129  | Accuracy: 0.5942492012779552





 67%|██████▋   | 2/3 [00:31<00:15, 15.59s/it][A[A[A


100%|██████████| 3/3 [00:46<00:00, 15.36s/it][A[A[A


[A[A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A

EPOCH: 0/3  | Epoch loss: 5.577220439910889  | Epoch accuracy: 0.8315228966986156





 33%|███▎      | 1/3 [00:14<00:29, 14.86s/it][A[A[A

 Bucket 1:  | Loss: 0.6047214865684509  | Accuracy: 0.37731629392971244





 67%|██████▋   | 2/3 [00:22<00:11, 11.38s/it][A[A[A


100%|██████████| 3/3 [00:26<00:00,  8.95s/it][A[A[A


[A[A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:14<00:29, 14.91s/it][A[A[A

 Bucket 1:  | Loss: 1.017672061920166  | Accuracy: 0.9313099041533547





 67%|██████▋   | 2/3 [00:29<00:14, 14.90s/it][A[A[A


100%|██████████| 3/3 [00:44<00:00, 14.90s/it][A[A[A


[A[A[A

epochs = 1

for i in range(epochs):

    epoch_accuracy = []
    epoch_loss = []

    for b in range(len(sorted_clean_questions)):   # count each bucket once
        questions_bucket = []
        answers_bucket = []    
        bucket_accuracy = []
        bucket_loss = []
        
        for questions_bucket, answers_bucket in split_into_batches(sorted_clean_questions, sorted_clean_answers, batch_size):
  

        #for k in range(len(questions_bucket)):
        #    questions_bucket.append(np.array(sorted_clean_questions[k]))
        #    answers_bucket.append(np.array(sorted_clean_answers[k]))
            
          # tqdm is a progress bar
          # 
          for ii in tqdm(range(len(questions_bucket) //  batch_size)):

              starting_id = ii * batch_size

              X_batch = questions_bucket[starting_id:starting_id+batch_size]
              y_batch = answers_bucket[starting_id:starting_id+batch_size]

              feed_dict = {model.inputs:X_batch, 
                           model.targets:y_batch, 
                           model.keep_prob:keep_prob, 
                           model.decoder_sequence_length:[len(y_batch[0])]*batch_size,
                           model.encoder_sequence_length:[len(X_batch[0])]*batch_size}

              cost, _, preds = session.run([model.loss, model.opt, model.predictions], feed_dict=feed_dict)

              epoch_accuracy.append(get_accuracy(np.array(y_batch), np.array(preds)))
              bucket_accuracy.append(get_accuracy(np.array(y_batch), np.array(preds)))

              bucket_loss.append(cost)
              epoch_loss.append(cost)
        if(b%100 == 0):    
          print("Bucket {}:".format(b+1), 
                " | Loss: {}".format(np.mean(bucket_loss)), 
                " | Accuracy: {}".format(np.mean(bucket_accuracy)))
    if(i%100 == 0):
      print("EPOCH: {}/{}".format(i, epochs), 
            " | Epoch loss: {}".format(np.mean(epoch_loss)), 
            " | Epoch accuracy: {}".format(np.mean(epoch_accuracy)))
    
    saver.save(session, "checkpoint/chatbot_{}.ckpt".format(i))

In [0]:
#TRAINING STEPS

batch_index_check_training_loss = 100
batch_index_check_validation_loss = ((len(train_Q)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 1000

checkpoint = "./chatbot_weights.ckpt"

NameError: ignored

In [0]:
session.run(tf.global_variables_initializer())