## Importing the libraries

In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
# Display the installed TensorFlow version string as the cell output.
tf.__version__

'1.0.0'

## Importing the dataset

In [3]:
# Read both corpus files fully into memory, one list entry per '\n'-separated row.
# Context managers make sure each file handle is closed as soon as it has been
# read, instead of staying open until the garbage collector reclaims it.
# errors='ignore' silently drops any bytes that are not valid UTF-8.
with open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

with open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

## Create a dictionary that maps each line id to its text

In [4]:
# Map each line id (first field) to the spoken text (fifth field).
id2line = {}

for raw_line in lines:
    fields = raw_line.split(' +++$+++ ')
    # Only rows that split into exactly five fields are kept; anything else
    # (for example the empty string after the final newline) is skipped.
    if len(fields) != 5:
        continue
    line_id, utterance = fields[0], fields[4]
    id2line[line_id] = utterance

## Create a list of all the conversations

In [5]:
# One entry per conversation: the list of line ids that make it up, in order.
conversations_ids = []

# The final element of `conversations` is skipped (the last row is empty).
for row in conversations[:-1]:
    # The ids are stored in the last ' +++$+++ '-separated field.
    ids_field = row.split(' +++$+++ ')[-1]
    # Drop the first and last characters (the surrounding '[' and ']'),
    # then remove every single quote and every space.
    ids_text = ids_field[1:-1].replace("'", "").replace(" ", "")
    # What remains is a comma-separated list of ids.
    conversations_ids.append(ids_text.split(','))

## Get questions and answers

In [10]:
# Build aligned question/answer lists: within a conversation every line is the
# question for the line that follows it, so a conversation of k lines
# contributes k - 1 pairs.
questions = []
answers = []

for utterance_ids in conversations_ids:
    # Pair each id with its successor; a single-id conversation yields nothing.
    for prompt_id, reply_id in zip(utterance_ids, utterance_ids[1:]):
        questions.append(id2line[prompt_id])
        answers.append(id2line[reply_id])

In [31]:
def printQA(questions, answers, startIndex, endIndex):
    """Print question/answer pairs for indices startIndex .. endIndex - 1.

    Args:
        questions: indexable collection of questions.
        answers: indexable collection of answers, aligned with ``questions``.
        startIndex: first index to print (inclusive).
        endIndex: index to stop at (exclusive).

    Each pair is written as a ' Q : ' line followed by an ' A : ' line and a
    blank line. Indexing past the end of either collection raises the
    collection's own error (IndexError for lists).
    """
    for idx in range(startIndex, endIndex):
        prompt, reply = questions[idx], answers[idx]
        print(' Q : ', prompt, '\n', 'A : ', reply, '\n')

In [33]:
# Preview the raw (uncleaned) question/answer pairs at indices 0 through 9.
printQA(questions, answers, 0, 10)

 Q :  Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. 
 A :  Well, I thought we'd start with pronunciation, if that's okay with you. 

 Q :  Well, I thought we'd start with pronunciation, if that's okay with you. 
 A :  Not the hacking and gagging and spitting part.  Please. 

 Q :  Not the hacking and gagging and spitting part.  Please. 
 A :  Okay... then how 'bout we try out some French cuisine.  Saturday?  Night? 

 Q :  You're asking me out.  That's so cute. What's your name again? 
 A :  Forget it. 

 Q :  No, no, it's my fault -- we didn't have a proper introduction --- 
 A :  Cameron. 

 Q :  Cameron. 
 A :  The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does. 

 Q :  The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does. 
 A :  Seems like she could ge

## Cleaning the text

In [34]:
def cleanText(text):
    """Lowercase ``text``, expand a few contractions and strip punctuation.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string. Apostrophes that are not part of a handled
        contraction, and characters outside the punctuation class below
        (for example '!'), are left in place. Removed punctuation is not
        replaced by anything, so runs of spaces may remain.
    """
    # (pattern, replacement) pairs, applied strictly in this order. Patterns
    # are plain substring matches with no word boundaries; in particular
    # "he's" also matches inside "she's", so "she's" is already expanded by
    # the time its own rule runs.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        # Final pass: delete the listed punctuation characters outright.
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [35]:
# Normalise every question with cleanText, preserving order.
clean_questions = [cleanText(raw_question) for raw_question in questions]

In [36]:
# Normalise every answer with cleanText, preserving order.
clean_answers = [cleanText(raw_answer) for raw_answer in answers]

In [40]:
# Preview the cleaned question/answer pairs at indices 0 through 9.
printQA(clean_questions, clean_answers, 0, 10)

 Q :  can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again 
 A :  well i thought we would start with pronunciation if that is okay with you 

 Q :  well i thought we would start with pronunciation if that is okay with you 
 A :  not the hacking and gagging and spitting part  please 

 Q :  not the hacking and gagging and spitting part  please 
 A :  okay then how 'bout we try out some french cuisine  saturday  night 

 Q :  you're asking me out  that is so cute what is your name again 
 A :  forget it 

 Q :  no no it's my fault  we didn't have a proper introduction  
 A :  cameron 

 Q :  cameron 
 A :  the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i can't date until she does 

 Q :  the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i can't date until she does 
 A :  seems like she could get a date easy enough 

 Q :  wh

## Vectorize words

In [41]:
# Count how often each whitespace-separated word occurs across all cleaned
# questions and answers combined.
word2count = {}

# Questions are tallied first, then answers, into the same dictionary.
for sentences in (clean_questions, clean_answers):
    for sentence in sentences:
        for token in sentence.split():
            word2count[token] = word2count.get(token, 0) + 1

### Give a unique integer to each word and remove infrequent words

In [49]:
# Words seen fewer than `threshold` times are left out of both vocabularies.
threshold = 20

# Filter once, in word2count's iteration order.
frequent_words = [word for word, count in word2count.items() if count >= threshold]

# Both vocabularies are built from the same combined counts, so they assign
# the same ids: consecutive integers starting at 0.
questionsWords2int = {word: word_id for word_id, word in enumerate(frequent_words)}

answersWords2int = {word: word_id for word_id, word in enumerate(frequent_words)}

### Adding the four special tokens to these two dictionaries

In [54]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
# SOS : start of string
# EOS : end of string
# OUT : least frequent words
# PAD : padding to make same length

# Word ids already occupy 0 .. len(dict) - 1, so the next free id is exactly
# len(dict). Using len(dict) + 1 would leave one id unused and make the
# largest id equal to the dictionary size, i.e. out of range for any table
# sized by len(dict).
# The membership check keeps the cell idempotent: re-running it does not
# move tokens that were already assigned.
for vocab in (questionsWords2int, answersWords2int):
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab)