In [2]:
#Importing libraries
import numpy as np
import tensorflow as tf
import re 
import time

In [3]:
# Importing the dataset
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [4]:
#Creating python dictionary that maps each lines and its id
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [5]:
# Creating a list of all of the conversation
conversations_ids = []
for conversation in conversations[:-1]: #doesnt include the last row, cause its empty
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")   #get the last column of each row #[1:-1] doesnt include the square bracket as the first and last element of the last column of each row    
    conversations_ids.append(_conversation.split(','))
        

In [6]:
#Getting separately the questions and the answers from the conversations_ids list
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [8]:
questions

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [9]:
answers

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

In [31]:
# Doing a first cleaning of the text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,']", "", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"didn't", "did not", text)
    return text

In [32]:
# Clean the questions
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))

In [33]:
# Clean the answers
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [34]:
clean_questions

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 'no no it is my fault  we didnt have a proper introduction ',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend',
 'cesc ma tete this is my head',
 'right  see  you are ready for the quiz',
 'i dont want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out m

In [30]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'okay then how bout we try out some french cuisine  saturday  night',
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do',
 'right  see  you are ready for the quiz',
 'i dont want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone',
 'that is because it is such a nice one',
 'forget french',
 'well there is someone i think might be ',
 'where',
 'i counted on you to help my cause 

In [35]:
# Creating a dictionary that maps each word to its number of occurance
word2count = {}
for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [36]:
word2count

{'can': 15983,
 'we': 40522,
 'make': 6752,
 'this': 33596,
 'quick': 339,
 'roxanne': 1,
 'korrine': 1,
 'and': 65615,
 'andrew': 56,
 'barrett': 19,
 'are': 54583,
 'having': 1217,
 'an': 10179,
 'incredibly': 60,
 'horrendous': 4,
 'public': 368,
 'break': 895,
 'up': 16062,
 'on': 27248,
 'the': 140736,
 'quad': 2,
 'again': 3197,
 'well': 14119,
 'i': 204632,
 'thought': 4553,
 'would': 20013,
 'start': 1656,
 'with': 24962,
 'pronunciation': 2,
 'if': 18970,
 'that': 67116,
 'is': 105807,
 'okay': 6102,
 'you': 213033,
 'not': 32338,
 'hacking': 18,
 'gagging': 9,
 'spitting': 16,
 'part': 1421,
 'please': 3214,
 'asking': 746,
 'me': 44943,
 'out': 18486,
 'so': 19066,
 'cute': 272,
 'what': 55228,
 'your': 29945,
 'name': 3122,
 'no': 27622,
 'it': 93345,
 'my': 29698,
 'fault': 483,
 'didnt': 8812,
 'have': 46604,
 'a': 102049,
 'proper': 138,
 'introduction': 19,
 'cameron': 35,
 'thing': 5736,
 'am': 37862,
 'at': 15293,
 'mercy': 68,
 'of': 56298,
 'particularly': 111,
 'hi

In [37]:
# Creating two dictionaries that maps the question words ans the answer words to a unique integer
threshold = 20
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count>=threshold:
        questionswords2int[word] = word_number
        word_number += 1
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count>=threshold:
        answerswords2int[word] = word_number
        word_number += 1

In [38]:
questionswords2int

{'can': 0,
 'we': 1,
 'make': 2,
 'this': 3,
 'quick': 4,
 'and': 5,
 'andrew': 6,
 'are': 7,
 'having': 8,
 'an': 9,
 'incredibly': 10,
 'public': 11,
 'break': 12,
 'up': 13,
 'on': 14,
 'the': 15,
 'again': 16,
 'well': 17,
 'i': 18,
 'thought': 19,
 'would': 20,
 'start': 21,
 'with': 22,
 'if': 23,
 'that': 24,
 'is': 25,
 'okay': 26,
 'you': 27,
 'not': 28,
 'part': 29,
 'please': 30,
 'asking': 31,
 'me': 32,
 'out': 33,
 'so': 34,
 'cute': 35,
 'what': 36,
 'your': 37,
 'name': 38,
 'no': 39,
 'it': 40,
 'my': 41,
 'fault': 42,
 'didnt': 43,
 'have': 44,
 'a': 45,
 'proper': 46,
 'cameron': 47,
 'thing': 48,
 'am': 49,
 'at': 50,
 'mercy': 51,
 'of': 52,
 'particularly': 53,
 'breed': 54,
 'loser': 55,
 'sister': 56,
 'cannot': 57,
 'date': 58,
 'until': 59,
 'she': 60,
 'does': 61,
 'why': 62,
 'mystery': 63,
 'used': 64,
 'to': 65,
 'be': 66,
 'really': 67,
 'popular': 68,
 'when': 69,
 'started': 70,
 'high': 71,
 'school': 72,
 'then': 73,
 'was': 74,
 'just': 75,
 'like': 

In [39]:
answerswords2int

{'can': 0,
 'we': 1,
 'make': 2,
 'this': 3,
 'quick': 4,
 'and': 5,
 'andrew': 6,
 'are': 7,
 'having': 8,
 'an': 9,
 'incredibly': 10,
 'public': 11,
 'break': 12,
 'up': 13,
 'on': 14,
 'the': 15,
 'again': 16,
 'well': 17,
 'i': 18,
 'thought': 19,
 'would': 20,
 'start': 21,
 'with': 22,
 'if': 23,
 'that': 24,
 'is': 25,
 'okay': 26,
 'you': 27,
 'not': 28,
 'part': 29,
 'please': 30,
 'asking': 31,
 'me': 32,
 'out': 33,
 'so': 34,
 'cute': 35,
 'what': 36,
 'your': 37,
 'name': 38,
 'no': 39,
 'it': 40,
 'my': 41,
 'fault': 42,
 'didnt': 43,
 'have': 44,
 'a': 45,
 'proper': 46,
 'cameron': 47,
 'thing': 48,
 'am': 49,
 'at': 50,
 'mercy': 51,
 'of': 52,
 'particularly': 53,
 'breed': 54,
 'loser': 55,
 'sister': 56,
 'cannot': 57,
 'date': 58,
 'until': 59,
 'she': 60,
 'does': 61,
 'why': 62,
 'mystery': 63,
 'used': 64,
 'to': 65,
 'be': 66,
 'really': 67,
 'popular': 68,
 'when': 69,
 'started': 70,
 'high': 71,
 'school': 72,
 'then': 73,
 'was': 74,
 'just': 75,
 'like': 

In [40]:
# Adding the last tokens to questionswords2int and answerswords2int
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1

In [44]:
# Creating the inverse dictionary of the answerswords2int dictionary
answersints2word = {w_i: w for w, w_i in answerswords2int.items()}

In [43]:
answersints2word

{0: 'can',
 1: 'we',
 2: 'make',
 3: 'this',
 4: 'quick',
 5: 'and',
 6: 'andrew',
 7: 'are',
 8: 'having',
 9: 'an',
 10: 'incredibly',
 11: 'public',
 12: 'break',
 13: 'up',
 14: 'on',
 15: 'the',
 16: 'again',
 17: 'well',
 18: 'i',
 19: 'thought',
 20: 'would',
 21: 'start',
 22: 'with',
 23: 'if',
 24: 'that',
 25: 'is',
 26: 'okay',
 27: 'you',
 28: 'not',
 29: 'part',
 30: 'please',
 31: 'asking',
 32: 'me',
 33: 'out',
 34: 'so',
 35: 'cute',
 36: 'what',
 37: 'your',
 38: 'name',
 39: 'no',
 40: 'it',
 41: 'my',
 42: 'fault',
 43: 'didnt',
 44: 'have',
 45: 'a',
 46: 'proper',
 47: 'cameron',
 48: 'thing',
 49: 'am',
 50: 'at',
 51: 'mercy',
 52: 'of',
 53: 'particularly',
 54: 'breed',
 55: 'loser',
 56: 'sister',
 57: 'cannot',
 58: 'date',
 59: 'until',
 60: 'she',
 61: 'does',
 62: 'why',
 63: 'mystery',
 64: 'used',
 65: 'to',
 66: 'be',
 67: 'really',
 68: 'popular',
 69: 'when',
 70: 'started',
 71: 'high',
 72: 'school',
 73: 'then',
 74: 'was',
 75: 'just',
 76: 'lik

In [45]:
# Adding the end of string token the end of every answer
for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'

In [46]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you <EOS>',
 'not the hacking and gagging and spitting part  please <EOS>',
 'okay then how bout we try out some french cuisine  saturday  night <EOS>',
 'forget it <EOS>',
 'cameron <EOS>',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does <EOS>',
 'seems like she could get a date easy enough <EOS>',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something <EOS>',
 'that is a shame <EOS>',
 'let me see what i can do <EOS>',
 'right  see  you are ready for the quiz <EOS>',
 'i dont want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone <EOS>',
 'that is because it is such a nice one <EOS>',
 'forget french <EOS>',
 'well

In [47]:
# Translating all the questions and the answers into integers
# and replacing all the words that were filtered out by <OUT>
questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in  questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in  answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints)

In [48]:
questions_into_int

[[0,
  1,
  2,
  3,
  4,
  8806,
  8806,
  5,
  6,
  8806,
  7,
  8,
  9,
  10,
  8806,
  11,
  12,
  13,
  14,
  15,
  8806,
  16],
 [17, 18, 19, 1, 20, 21, 22, 8806, 23, 24, 25, 26, 22, 27],
 [28, 15, 8806, 5, 8806, 5, 8806, 29, 30],
 [27, 7, 31, 32, 33, 24, 25, 34, 35, 36, 25, 37, 38, 16],
 [39, 39, 40, 25, 41, 42, 1, 43, 44, 45, 46, 8806],
 [47],
 [15,
  48,
  25,
  47,
  18,
  49,
  50,
  15,
  51,
  52,
  45,
  53,
  8806,
  54,
  52,
  55,
  41,
  56,
  18,
  57,
  58,
  59,
  60,
  61],
 [62],
 [8806,
  63,
  60,
  64,
  65,
  66,
  67,
  68,
  69,
  60,
  70,
  71,
  72,
  73,
  40,
  74,
  75,
  76,
  60,
  77,
  78,
  52,
  40,
  79,
  80],
 [81, 23, 82, 1, 83, 84, 85, 45, 86],
 [8806, 87, 8806, 3, 25, 41, 88],
 [89, 90, 27, 7, 91, 92, 15, 8806],
 [18,
  93,
  94,
  65,
  95,
  96,
  65,
  97,
  24,
  98,
  18,
  94,
  65,
  95,
  99,
  100,
  76,
  101,
  15,
  102,
  103,
  7,
  96,
  104,
  61,
  105,
  106,
  107,
  76,
  108,
  18,
  44,
  109,
  110,
  41,
  111,
  112

In [49]:
answers_into_int

[[17, 18, 19, 1, 20, 21, 22, 8806, 23, 24, 25, 26, 22, 27, 8805],
 [28, 15, 8806, 5, 8806, 5, 8806, 29, 30, 8805],
 [26, 73, 96, 1529, 1, 858, 33, 481, 385, 8806, 208, 240, 8805],
 [243, 40, 8805],
 [47, 8805],
 [15,
  48,
  25,
  47,
  18,
  49,
  50,
  15,
  51,
  52,
  45,
  53,
  8806,
  54,
  52,
  55,
  41,
  56,
  18,
  57,
  58,
  59,
  60,
  61,
  8805],
 [397, 76, 60, 83, 128, 45, 58, 863, 287, 8805],
 [8806,
  63,
  60,
  64,
  65,
  66,
  67,
  68,
  69,
  60,
  70,
  71,
  72,
  73,
  40,
  74,
  75,
  76,
  60,
  77,
  78,
  52,
  40,
  79,
  80,
  8805],
 [24, 25, 45, 1700, 8805],
 [285, 32, 90, 36, 18, 0, 127, 8805],
 [89, 90, 27, 7, 91, 92, 15, 8806, 8805],
 [18,
  93,
  94,
  65,
  95,
  96,
  65,
  97,
  24,
  98,
  18,
  94,
  65,
  95,
  99,
  100,
  76,
  101,
  15,
  102,
  103,
  7,
  96,
  104,
  61,
  105,
  106,
  107,
  76,
  108,
  18,
  44,
  109,
  110,
  41,
  111,
  112,
  65,
  113,
  33,
  41,
  88,
  65,
  114,
  8805],
 [24, 25, 115, 40, 25, 116, 45

In [50]:
# Sorting questions and answers by the length of questions
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for i in enumerate(questions_into_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

In [51]:
sorted_clean_questions

[[47],
 [62],
 [122],
 [146],
 [134],
 [39],
 [174],
 [39],
 [181],
 [182],
 [219],
 [36],
 [62],
 [134],
 [62],
 [109],
 [373],
 [131],
 [295],
 [209],
 [36],
 [220],
 [181],
 [26],
 [62],
 [181],
 [8806],
 [453],
 [248],
 [181],
 [192],
 [8806],
 [8806],
 [667],
 [96],
 [8806],
 [8806],
 [36],
 [39],
 [36],
 [8806],
 [146],
 [89],
 [36],
 [769],
 [8806],
 [635],
 [39],
 [39],
 [937],
 [8806],
 [1118],
 [39],
 [229],
 [69],
 [209],
 [39],
 [141],
 [1263],
 [209],
 [1110],
 [1110],
 [1110],
 [1110],
 [209],
 [338],
 [148],
 [26],
 [91],
 [667],
 [36],
 [1137],
 [1258],
 [8806],
 [8806],
 [1507],
 [209],
 [1550],
 [36],
 [36],
 [209],
 [1600],
 [1600],
 [1600],
 [1600],
 [1600],
 [1600],
 [26],
 [667],
 [67],
 [667],
 [229],
 [96],
 [1629],
 [1600],
 [1600],
 [1600],
 [8806],
 [1600],
 [1600],
 [67],
 [1670],
 [672],
 [1790],
 [209],
 [38],
 [209],
 [1221],
 [209],
 [1345],
 [209],
 [17],
 [122],
 [36],
 [62],
 [1839],
 [1772],
 [209],
 [1847],
 [209],
 [209],
 [1822],
 [220],
 [1145],


In [58]:
sorted_clean_answers[len(sorted_clean_answers)-900]

[27, 7, 366, 8806, 352, 27, 8805]