# Part - 1 ( Data Preprocessing )

## Importing the libraries

In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
tf.__version__

'1.0.0'

## Importing the dataset

In [3]:
lines = open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

## Create a dictionary that maps each lines and its id

In [4]:
id2line = {}

for line in lines:
    _line = line.split(' +++$+++ ') # _line : throwaway variables
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

## Create a list of all the conversation

In [5]:
conversations_ids = []

for conversation in conversations[:-1]: # last row of conversations is empty
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ", "")
    #                                               |     |             |               |______ remove spaces
    #                                               |     |             |______________________ remove '
    #                                               |     |____________________________________ remove the '[' , ']'
    #                                               |__________________________________________ get last element of split (ids)
    conversations_ids.append(_conversation.split(','))

## Get questions and answers

In [6]:
questions = []
answers = []

for conversation in conversations_ids:
    for i in range(len(conversation) -1 ):
        questions.append( id2line[conversation[i]] )
        answers.append( id2line[conversation[i+1]] )

In [7]:
def printQA(questions, answers, startIndex, endIndex):
    for i in range(startIndex, endIndex):
        print(' Q : ',questions[i],'\n', 'A : ' ,answers[i], '\n')

In [8]:
printQA(questions, answers, 0, 10)

 Q :  Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
 A :  Well, I thought we'd start with pronunciation, if that's okay with you. 

 Q :  Well, I thought we'd start with pronunciation, if that's okay with you. 
 A :  Not the hacking and gagging and spitting part.  Please. 

 Q :  Not the hacking and gagging and spitting part.  Please. 
 A :  Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 

 Q :  You're asking me out.  That's so cute. What's your name again? 
 A :  Forget it. 

 Q :  No, no, it's my fault -- we didn't have a proper introduction --- 
 A :  Cameron. 

 Q :  Cameron. 
 A :  The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does. 

 Q :  The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does. 
 A :  Seems like she could ge

## Cleaning the text

In [9]:
def cleanText(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

In [10]:
clean_questions = []
for question in questions:
    clean_questions.append(cleanText(question))

In [11]:
clean_answers = []
for answer in answers:
    clean_answers.append(cleanText(answer))

In [12]:
printQA(clean_questions, clean_answers, 0, 10)

 Q :  can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again 
 A :  well i thought we would start with pronunciation if that is okay with you 

 Q :  well i thought we would start with pronunciation if that is okay with you 
 A :  not the hacking and gagging and spitting part  please 

 Q :  not the hacking and gagging and spitting part  please 
 A :  okay then how 'bout we try out some french cuisine  saturday  night 

 Q :  you're asking me out  that is so cute what is your name again 
 A :  forget it 

 Q :  no no it's my fault  we didn't have a proper introduction  
 A :  cameron 

 Q :  cameron 
 A :  the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i can't date until she does 

 Q :  the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i can't date until she does 
 A :  seems like she could get a date easy enough 

 Q :  wh

## Vectorize words

In [13]:
word2count = {}

for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

### Give uniqu int to word and remove non frequent words

In [14]:
threshold = 20

questionsWords2ints = {}

word_num = 0
for word, count in word2count.items():
    if count >= threshold:
        questionsWords2ints[word] = word_num
        word_num += 1

answersWords2ints = {}

word_num = 0
for word, count in word2count.items():
    if count >= threshold:
        answersWords2ints[word] = word_num
        word_num += 1

### Adding last two tokens to these two dictionaries

In [15]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>'] 
# SOS : start of string
# EOS : end of string
# OUT : lest frequent words
# PAD : padding to make same length

for token in tokens:
    questionsWords2ints[token] = len(questionsWords2ints) + 1

for token in tokens:
    answersWords2ints[token] = len(answersWords2ints) + 1

### Create the inverse dictionary of answersWords2int

In [16]:
answersInts2Words = {w_i : w for w, w_i in answersWords2ints.items()}

In [17]:
i = 0
for w, w_i in answersWords2ints.items():
    print(w, w_i)
    i += 1
    if i == 5:
        break

denying 1188
passenger 7728
venkman 2310
5 1189
noir 7034


In [18]:
i = 0
for w_i, w in answersInts2Words.items():
    print(w_i,w)
    i += 1
    if i == 5:
        break

0 hoping
1 bateman
2 pills
3 charlie!
4 trucks


### Add EOS token to the end of every answer

In [19]:
for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'

In [20]:
printQA(clean_questions, clean_answers, 0, 1)

 Q :  can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again 
 A :  well i thought we would start with pronunciation if that is okay with you <EOS> 



### Translate all the questions and answers into integers
### Replace all the filtered words by <OUT>

In [21]:
questions_into_int = []

for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionsWords2ints:
            ints.append(questionsWords2ints['<OUT>'])
        else:
            ints.append(questionsWords2ints[word])
    questions_into_int.append(ints)

answers_into_int = []

for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in questionsWords2ints:
            ints.append(answersWords2ints['<OUT>'])
        else:
            ints.append(answersWords2ints[word])
    answers_into_int.append(ints)

## Short questions and answers by length of quections

### This speedup training process and helps reduse the loss
### Reduse the amount of padding during the training 

In [27]:
sorted_clean_quections = []
sorted_clean_answers = []

for length in range(1, 25 + 1): # upto 25
    for i in enumerate(questions_into_int): # i[0] : index, i[1] : quection
        if len(i[1]) == length:
            sorted_clean_quections.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

# Part - 2 ( Building Seq-2-Seq model )

## Creating placeholders for the inputs and the targets

In [5]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, shape=[None, None], name='input')
    #                                          |_________ Two dimensional matrix
    targets = tf.placeholder(tf.int32, shape=[None, None], name='traget')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob') # control the dropout rate
    return inputs, targets, lr, keep_prob

## Preprocessing the traget

### Neural network expect targets with batches (answers)

In [4]:
def preprocess_tragets(targets, word2int, batch_size):
    left_side = tf.fill([batch_size, 1], word2int['<SOS>']) # (dims, values, name=None)
    right_side = tf.strided_slice(targets, [0,0], [batch_size,-1], strides=[1,1]) # [batch_size,-1] : all lines except the last element
    #(input_, begin, end, strides=None, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, var=None, name=None)
    preprocessed_tragets = tf.concat([left_side, right_side], 1) # (values, axis, name='concat')
    return preprocessed_tragets

## Creating the encoder RNN layer

In [6]:
def encoder_rnn_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers )
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(
#   |__________________________  encoder output
        cell_fw=encoder_cell,
        cell_bw=encoder_cell,
        sequence_length=sequence_length,
        input=rnn_inputs,
        dtype=tf.float32
        ) # Creates a dynamic version of bidirectional recurrent neural network
    return encoder_state

## Decoding the training set

In [9]:
def decode_training_set(
    encoder_state, 
    decoder_cell, 
    decoder_embedded_input, 
    sequence_length, 
    decoding_scope,
    output_function,
    keep_prob,
    batch_size):

    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option='bahdanau',
        num_units=decoder_cell.output_size
    )
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        attention_keys,
        attention_values,
        attention_score_function,
        attention_construct_function,
        name='attn_dec_train'
    )
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
    #               |  |___ decoder final context state
    #               |______ decoder final state
        decode_cell, 
        training_decoder_function, 
        decoder_embedded_input, 
        sequence_length,
        scope = decoding_scope,
    )
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)