[View in Colaboratory](https://colab.research.google.com/github/hamil168/Chatbots/blob/master/Seq2Seq.ipynb)

In [7]:
# For a fresh Colab instance, clone fresh:
!pip install -q xlrd
!git clone https://github.com/hamil168/Chatbots

fatal: destination path 'Chatbots' already exists and is not an empty directory.


In [8]:
# Change to Colab directory:
cd Chatbots/


/content/Chatbots


In [0]:

# For an existing Colab instance, pull from master
#!git pull https://github.com/hamil168/Chatbots master

In [10]:
# Files as they appear in the repo clone
ls

Cornell Movie Script Database EDA.ipynb  movie_lines.txt  preproc.py
movie_conversations.txt                  Preproc.ipynb    [0m[01;34m__pycache__[0m/


In [0]:

import numpy as np
import tensorflow as tf
import time


In [0]:
run preproc

In [0]:
# -*- coding: utf-8 -*-
"""

Data preprocessing steps for Cornell Movie Script
Chatbot 

using movie_conversations.text and 
movie_lines.txt from the Cornell Movie Script Database


Created on Sat Jul 14 14:16:00 2018

@author: Ben Hamilton

#########
"""


# Importing the libraries
import numpy as np
import tensorflow as tf
import re
import time

# - DATA PREPROCESSING ##########

# Importing the dataset
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')


def id_to_line(lines):
    #create a dictionary mapping ids to lines
    # iterate through each line, split into different elements, get key, get value
    id2line = {}
    
    for line in lines:
      _line = line.split(' +++$+++ ')
      
      if len(_line) == 5:
        id2line[_line[0]] = _line[4]
    
    return id2line


def get_conversations_ids(conversations):
 
    # create a list of the conversations
    conversations_ids = []
    
    # The last row of data set is empty, so skip it
    for conversation in conversations[:-1]:
      
      # Split and remove brackets
      # Remove single quote
      # Remove spaces
      _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
      
      # Append as list by spliting on commas
      conversations_ids.append(_conversation.split(","))
      
    return conversations_ids
  
def get_questions_and_answers(conversations_ids,id2line):
    # return unclean questions and answers
    # using the rule that every line that is responded to is a 'question'
    # and every line that is a response is the corresponding 'answer'
    # So it is expected that some lines appear on both lists, but not in 
    # parallel to themselves.
    # Getting separately the questions and the answers
    questions = []
    answers = []
    
    for conversation in conversations_ids:
      
      for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])
        
    return questions, answers
    

# first cleaning of the texts
def clean_text(text):
  text = text.lower()
  text = re.sub(r"i'm", "i am", text)
  text = re.sub(r"he's", "he is", text)
  text = re.sub(r"she's", "she is", text)
  text = re.sub(r"that's", "that is", text)
  text = re.sub(r"what's", "what is", text)
  text = re.sub(r"where's", "where is", text)
  
  text = re.sub(r"\'ll", " will", text)
  text = re.sub(r"\'ve", " have", text)
  text = re.sub(r"\'d", " would", text)
  text = re.sub(r"won't", "will not", text)
  text = re.sub(r"can't", "can not", text)
  text = re.sub(r"don't", "do not", text)
  
  text = re.sub(r"\'re", " are", text)
  
  text = re.sub(r"[-()\'#/@;:<>{}'\+\=\-\|.?,\!]", "", text) 

  return text

# Apply on all "questions" and "answers"





#############################################################


def word_to_counts(clean_questions, clean_answers):
# Creating a dictionary that maps each word to its number of occurrences
    word2count = {}
    for question in clean_questions:
      for word in question.split():
        if word not in word2count:
          word2count[word] = 1
        else:
          word2count[word] += 1
      
# This double counts some entries... this may be OK. come back to think on it later.
    for answer in clean_answers:
      for word in answer.split():
        if word not in word2count:
          word2count[word] = 1
        else:
          word2count[word] += 1
          
    return word2count

#############################################################


# Create 2 dictionaries that map questions words and answer words to integers
def map_questions_and_answers_to_integers(word2count):

    # set threshold for significance in word count
    threshold = 20
    
    questionswords2int = {}
    word_number = 0
    
    for word, count in word2count.items():
        if count >= threshold:
          questionswords2int[word] = word_number
          word_number += 1
    
    
    answerswords2int = {}
    word_number = 0
    
    for word, count in word2count.items():
        if count >= threshold:
          answerswords2int[word] = word_number
          word_number += 1    

    # Adding the last tokens to these two dictionaries
    # create entries for tokens for questionswords2int and answerswords2uint
    
    tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
    for token in tokens:
      questionswords2int[token] = len(questionswords2int) + 1
      answerswords2int[token] = len(answerswords2int) + 1

    return questionswords2int, answerswords2int

#############################################################

def map_invert_answers_to_ints(answerswords2int):
    # Create dictionary that maps integers back to the answers
    # invert answersword2int using dict comprehension and .items
    
    answersints2word = {word_int: word for word, word_int in answerswords2int.items()}

#############################################################


def preproc_steps(lines, conversations):
    
    id2line = id_to_line(lines)
    
    conversations_ids = get_conversations_ids(conversations)
    
    questions, answers = get_questions_and_answers(conversations_ids,id2line)
    
    clean_questions = [clean_text(question) for question in questions]
    clean_answers = [clean_text(answer) for answer in answers]
    
    word2count = word_to_counts(clean_questions, clean_answers)
    
    questionswords2int, answerswords2int = map_questions_and_answers_to_integers(word2count)
    
    answersints2words = map_invert_answers_to_ints(answerswords2int)

    # Conccatenate <EOS> to every cleaned answer
    # needed for seq2seq model 
    
    for i in range(len(clean_answers)):
      clean_answers[i] += ' <EOS>'
    
    # Translating cleaned questions into integers using 
    # replace alal words filtered out by token with <OUT>
    
    questions_into_int = []
    
    for question in clean_questions:
      ints = []
      
      # translate question into integers
      for word in question.split():
        
        if word in questionswords2int:
          ints.append(questionswords2int[word])
        else:
          ints.append(questionswords2int['<OUT>'])
    
      
      questions_into_int.append(ints)
      
    answers_into_int = []
    
    for answer in clean_answers:
      ints = []
      
      # translate answer into integers
      for word in answer.split():
        
        if word in answerswords2int:
          ints.append(answerswords2int[word])
        else:
          ints.append(answerswords2int['<OUT>'])
    
      
      answers_into_int.append(ints)
    
    # Sort questions by length of questions to speed up training
    # Reduces amount of padding during training
    
    sorted_clean_questions = []
    sorted_clean_answers = []
    
    # limit input to short sentences
    MAX_SENTENCE_LENGTH = 25
    
    # loop over possible lengths of questions
    for length in range(1, MAX_SENTENCE_LENGTH + 1):
       
      # use enumerate to loop 2 elements: index of question and question as list of ints
      for i in enumerate(questions_into_int):
      
        # if length of current question is equal to length we are checking...
        # append it to the sorted list by catching via the enumerated index
        if len(i[1]) == length:
          sorted_clean_questions.append(questions_into_int[i[0]])
          
          # keeps answer well aligned:
          sorted_clean_answers.append(answers_into_int[i[0]])
    
    return id2line, conversations_ids, questions, answers, clean_questions, clean_answers, word2count, sorted_clean_questions, sorted_clean_answers

In [0]:
id2l, cid, q, a, cq, ca, w2c, scq, sqa = preproc_steps(lines,conversations)

In [45]:
conversations[0:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']"]

In [46]:
get_conversations_ids(conversations[0:5])

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206']]

In [0]:

# Create placeholder for inputs and the targets
# in TF, all variables are tensors
# need to go from NP --> TF tensors
# need placeholders for every TF variables inputs and targets

def model_inputs():
  #inputs and targets are 2D matrices
  inputs = tf.placeholder(tf.int32, [None, None], name = 'input') 
  targets = tf.placeholder(tf.int32, [None, None], name = 'target')
  
  
  lr = tf.placeholder(tf.float32, name = 'learning_rate')
  keep_prob = tf.placeholder(tf.float32, name = 'keep_prop') #dropout
  
  return inputs, targets, lr, keep_prob

In [0]:
# preprocessing the targets
# need batches, 
# need each to start with <SOS> token

def preproc_targets(targets, word2int, batch_size):
  left_side = tf.fill([batch_size, 1], word2int['<SOS>'])
    
  # up to batch size less the last token
  # slide of [1,1]
  right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
  preprocessed_target = tf.concat([left_side,right_side], axis=1)
  return preproc_targets

In [0]:
# Create encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
  # LSTM cell class
  
  # rnn_size: number of input tensors
  # sequence_length: length of each question in the atch
  
  lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
  
  # dropout wrapper class
  lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
  
  #
  encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
  
  # bidirection rnn function (creates dynamic bidirectional network)
  # builds independent forward and backward rnn
  # need ot make sure the ends match
  # (first element is encoder_output)
  _, encoder_state = tf.nn.bdirectional_dynamic_rnn(cell_fw = encouder_cell,
                                                   cell_bw = encoder_cell,
                                                   sequence_length = sequence_length,
                                                   inputs = rnn_inputs,
                                                   dtype = tf.float32)
  return encoder_state


In [0]:
def decode_training_set(encoder_state, decoder_cell, 
                        decoder_embedded_input, 
                        sequence_length, decoding_scope, output_function,
                       keep_prob, batch_size):
  
  # Get attention states
  attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
  
  # preprocess data to prepare it for attention 
  attention_keys, attention_values, attention_score_functions, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attetion_states,
                                         attention_options = 'bahdanau',
                                         num_units = decoder_cell.output_size)
    
  training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                            attention_keys,
                                                                            attention_values,
                                                                            attention_score_function,
                                                                            attention_construct_function,
                                                                            name = 'attn_dec_train')
 
  # second arg is decoder_final_state, 3rd is decoder_final_context_state
  decoder_output, _, _, = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                training_decoder_function,
                                                                decoder_embedded_input,
                                                                sequence_length,
                                                                scope = decoding_scope)
  
  decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
  return output_function(decoder_output_dropout)
  

In [0]:
# decoding the test/validation set

def decode_test_set(encoder_state, decoder_cell, 
                        decoder_embeddings_matrix,
                        sos_id, eos_id, maximum_length, num_words,
                        sequence_length, decoding_scope, output_function,
                       keep_prob, batch_size):
  
  # Get attention states
  attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
  
  # preprocess data to prepare it for attention 
  attention_keys, attention_values, attention_score_functions, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attetion_states,
                                         attention_options = 'bahdanau',
                                         num_units = decoder_cell.output_size)
    
  test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                            encoder_state[0],
                                                                            attention_keys,
                                                                            attention_values,
                                                                            attention_score_function,
                                                                            attention_construct_function,
                                                                            decoder_embeddings_matrix,
                                                                            sos_id, 
                                                                            eos_id, 
                                                                            maximum_length, 
                                                                            num_words,
                                                                            name = 'attn_dec_inf')
 
  # second arg is decoder_final_state, 3rd is decoder_final_context_state
  test_predictions, _, _, = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                test_decoder_function,
                                                                scope = decoding_scope)
  
  # no dropout for test part
  return test_predictions
  

In [0]:
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
  with tf.variable_scope('decoding') as decoding_scope:
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    
    weights = tf.truncated_normal_initializer(stddev = 0.1)
    biases = tf.zeros_initializer()
    
    # create FCL
    output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                 num_words,
                                                                 None,
                                                                 scope = decoding_scope,
                                                                 weights_initializer = weights,
                                                                 baises_initializer = biases)
    
    training_predictions = decode_training_set(encoder_state, decoder_cell,
                                              decoder_embedded_input,
                                              sequence_length,
                                              decoding_scope,
                                              output_function,
                                              keep_prob,
                                              batch_size)
    
    decoding_scope.reuse_variables()
    test_predictions = decode_test_set(encoder_state,
                                      decoder_cell,
                                      decoder_embeddings_matrix,
                                      word2int['<SOS>'],
                                      word2int['<EOS>'],
                                      sequence_length - 1,  #exclude last token
                                      num_words,
                                      decoding_scope,
                                      output_function,
                                      keep_prob,
                                      batch_size)
    
    
  return training_predictions, test_predictions    
    

In [0]:
# Build the Model

def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words,
                 encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
  
  encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                           answers_num_words + 1,
                                                           encoder_embedding_size,
                                                           initializer = tf.random_uniform_initializer(0,1))
  
  encoder_state - encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
  
  preprocessed_targets = preprocessed_targets(targets, questionswords2int, batch_size)
  
  decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
  
  decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
  
  training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                      decoder_embeddings_matrix,
                                                      encoder_state,
                                                      questions_num_words,
                                                      sequence_length,
                                                      rnn_size,
                                                      num_layers,
                                                      questionswords2int,
                                                      keep_prob,
                                                      batch_size)

In [0]:
# Next up: hyper parameters