In [87]:
import numpy as np
import tensorflow as tf
import re
import time
import ast
from collections import Counter

# Loading the dataset

We have two files for the dataset.

**movie_lines.txt** contains the actual conversations between characters

**movie_conversations.txt** contains the indices for conversations in movie_lines.txt.

In [2]:
# Read both corpus files fully into memory, one list entry per '\n'-separated
# line. The `with` blocks close each file handle as soon as it has been read
# instead of leaving it open for the lifetime of the kernel.
# errors='ignore' drops any bytes that are not valid UTF-8.
with open('dataset/movie_lines.txt', encoding='utf-8', errors='ignore') as lines_file:
    lines = lines_file.read().split('\n')

with open('dataset/movie_conversations.txt', encoding='utf-8', errors='ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

### Create a dictionary that maps each line to its id

In [3]:
# Split every raw record on the ' +++$+++ ' field separator.
split_records = (raw_record.split(' +++$+++ ') for raw_record in lines)

# A valid record has exactly 5 fields: the first is the line id and the last
# is the spoken text. Records with fewer or more fields are malformed entries
# in the dataset and are left out of the mapping.
id2lines = {
    fields[0]: fields[-1]
    for fields in split_records
    if len(fields) == 5
}

### Creating a list of all the conversations

In [4]:
conversations_index = []

for record in conversations:
    # Splitting the file on '\n' leaves an empty string after a trailing
    # newline. Skip blank entries wherever they occur rather than assuming
    # exactly one sits at the end: this keeps the final conversation when the
    # file has no trailing newline and avoids literal_eval failing on ''.
    if not record.strip():
        continue
    # The last ' +++$+++'-separated field is a Python literal holding the
    # line ids of one conversation; literal_eval parses it without
    # executing arbitrary code.
    ids_literal = record.split(' +++$+++')[-1].strip()
    conversations_index.append(ast.literal_eval(ids_literal))

## Get the questions and answers from the dataset

Each **line** in **conversations_index** is a list of line ids that together make up one conversation.

Each item in the **line** is a query, and the succeeding item is its reply.

For example, in a sample line [a, b, c]:

    [a, b] is a query-reply pair, where [a] is the query and [b] the reply
    [b, c] is a query-reply pair, where [b] is the query and [c] the reply

In [11]:
query = []
reply = []
for line_ids in conversations_index:
    # Pair every utterance with the one that directly follows it:
    # [a, b, c] yields (a, b) and (b, c).
    for query_id, reply_id in zip(line_ids, line_ids[1:]):
        # id2lines only contains well-formed records, so a conversation can
        # reference an id that was dropped. Skip such pairs instead of
        # aborting the whole cell with a KeyError.
        if query_id not in id2lines or reply_id not in id2lines:
            continue
        query.append(id2lines[query_id].lower())
        reply.append(id2lines[reply_id].lower())

## Cleaning the text to remove some unnecessary symbols and abbreviations

In [83]:
# Ordered (compiled pattern, replacement) rules used by clean_text.
# A tuple is used because the order is significant: contractions have to be
# expanded before the final rule turns the apostrophe (and every other
# non-word character) into a space.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # The trailing \b makes sure only a complete suffix is expanded, so an
        # opening quote such as 'dog' is not rewritten to " wouldog".
        (r"'m\b", ' am'),
        (r"'s\b", ' is'),
        (r"'re\b", ' are'),
        (r"'ll\b", ' will'),
        (r"'ve\b", ' have'),
        (r"'d\b", ' would'),
        (r"won't", 'will not'),
        (r"can't", 'can not'),
        # Every run of non-word characters (punctuation and whitespace alike)
        # collapses to a single space.
        (r"\W+", ' '),
    )
)


def clean_text(text):
    """Expand common contractions and strip punctuation from ``text``.

    The rules in ``_CLEANING_RULES`` are applied in order: the suffixes
    'm, 's, 're, 'll, 've and 'd are expanded, then "won't" and "can't", and
    finally each run of non-word characters is replaced by one space.
    Other "n't" contractions are not expanded; their apostrophe simply
    becomes a space (e.g. "don't" -> "don t").

    Args:
        text: The string to clean. Matching is case-sensitive, so callers
            are expected to lowercase it first if they want "I'M" handled.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()

In [86]:
# Normalise every query and reply with clean_text; order and pairing between
# the two lists are preserved.
query = list(map(clean_text, query))
reply = list(map(clean_text, reply))

## Construct a count for each word in our corpus
This will be used to remove words that do not appear at least **x** times in the dataset.

In [94]:
# Tally how often each whitespace-separated word occurs across all queries
# and replies. The Counter is converted back to a plain dict so that looking
# up an unseen word still raises KeyError.
word2count = dict(
    Counter(
        word
        for sentence in query + reply
        for word in sentence.split()
    )
)

## Removing the words that do not occur at least x times

## Also, we are creating a mapping for each word to a unique index

In [109]:
THRESHOLD = 20

# Keep only the words seen at least THRESHOLD times, in word2count order.
frequent_words = [
    word for word, count in word2count.items() if count >= THRESHOLD
]

# Number the surviving words 0, 1, 2, ... in that same order.
word2int = {word: position for position, word in enumerate(frequent_words)}

# idx ends up as the number of ids handed out, i.e. the next unused id.
idx = len(word2int)

## Appending special tokens and their unique mappings

In [110]:
TOKENS = ('<PAD>', '<EOS>', '<OUT>', '<SOS>')

for token in TOKENS:
    # Only add tokens that are missing, so re-running this cell does not move
    # an already registered token to a new id.
    if token not in word2int:
        # With ids running from 0 to len(word2int) - 1, the next free id is
        # len(word2int). Using len(word2int) + 1 would skip an id and push
        # the largest id outside a table of size len(word2int).
        word2int[token] = len(word2int)

## Creating an inverse mapping from id to word

In [111]:
# Reverse lookup from integer id back to word. keys() and values() iterate in
# the same order, so zipping them pairs each id with its own word.
int2word = dict(zip(word2int.values(), word2int.keys()))

## Appending EOS token to all replies

In [114]:
# Mark the end of every reply with the <EOS> token. The endswith guard keeps
# this cell idempotent: re-running it does not append a second ' <EOS>'.
reply = [a if a.endswith(' <EOS>') else a + ' <EOS>' for a in reply]