# Data processing

In [1]:
import ast
import json
import os
import re

import numpy as np
import pandas as pd
import tqdm

In [2]:
# Relative path of the directory holding the corpus files; it is joined with
# the individual file names ("movie_lines.txt", "movie_conversations.txt").
DATA_PATH = "cornell movie-dialogs corpus"
# Delimiter that separates the fields of each record in the corpus files.
FIELD_SEP = " +++$+++ "

In this notebook we preprocess the Cornell Movie-Dialogs Corpus.

In [3]:
# Character-level fixes for bad encoding artifacts, applied before any regex
# cleanup. Each entry pairs a group of characters with the one replacement
# string used for all of them; an empty replacement deletes the character.
_CHAR_FIXES = [
    ('_', ""),
    ('ß', "fl"),
    ('ñ', "n"),
    ('ï', "i"),
    ('ç', "c"),
    ('Ç', "C"),
    ('ó', "o"),
    ('²³', '"'),
    ('\x85\t', " "),
    ('ùúûüÜ', "u"),
    ('àáâä', "a"),
    ('èéêÈÉ', "e"),
    ('í¹ÒÓÔÕ', "'"),
]
# Table in the form expected by str.translate: code point -> replacement.
translation = {
    ord(bad_char): replacement
    for bad_chars, replacement in _CHAR_FIXES
    for bad_char in bad_chars
}

# Runs of characters that are neither word characters, whitespace, nor one of
# ? ! . , ; : ' " - (brackets and parentheses are deliberately not kept).
alphanum = re.compile(r"[^\w\s\?!.,;:\'\"\-]+")
# Whitespace that directly precedes one of ? ! . , ; :
blankpunct = re.compile(r"\s+([\?!.,;:])")
# A character that is neither a word character nor a dot, repeated in a row.
repeated = re.compile(r"([^\w.])\1+")
# Four or more consecutive dots.
ellipsis = re.compile(r"\.{4,}")


def _clean_reply(raw_text):
    """Return ``raw_text`` normalised by the cleanup steps of this cell.

    Steps, in order: apply the ``translation`` table, strip surrounding
    whitespace, delete characters matched by ``alphanum``, remove whitespace
    before punctuation, collapse repeated non-word characters (dots
    excepted) to a single one, and shorten runs of 4+ dots to "...".
    """
    cleaned = raw_text.translate(translation)
    cleaned = alphanum.sub("", cleaned.strip())
    cleaned = blankpunct.sub(r"\1", cleaned)
    cleaned = repeated.sub(r"\1", cleaned)
    return ellipsis.sub(r"...", cleaned)


# Cleaned text of every line, keyed by its line id.
replies = {}
lines_path = os.path.join(DATA_PATH, "movie_lines.txt")
with open(lines_path, "r", encoding="latin-1") as fin:
    for record in tqdm.tqdm(fin):
        # Five fields per record; only the first (id) and last (text) are used.
        line_id, _character_id, _movie_id, _character_name, raw_text = record.split(FIELD_SEP, 4)
        replies[line_id] = _clean_reply(raw_text)

304713it [00:04, 68681.86it/s]


Load the dialogues

In [4]:
# Read the conversation index. Only the last field of each record is used; it
# is presumably a Python-style list literal of line ids (e.g. ['L1', 'L2']).
# Each record is parsed on its own with ast.literal_eval, which understands
# single-quoted strings directly. This avoids rewriting quotes and gluing all
# records into one big JSON document, and a malformed record fails at the
# record that caused it rather than at the very end.
dialogues = []
with open(os.path.join(DATA_PATH, "movie_conversations.txt"), "r", encoding="latin-1") as fin:
    for line in tqdm.tqdm(fin):
        # Everything before the final separator is ignored here.
        line_ids = line.rsplit(FIELD_SEP, 1)[1]
        dialogues.append(ast.literal_eval(line_ids.strip()))

83097it [00:00, 570041.30it/s]


Save the cleaned-up dataset: first, the database of lines.

In [5]:
# Write the line-id -> cleaned-text mapping to disk as JSON.
lines_json_path = "processed_lines.json"
with open(lines_json_path, mode="w", encoding="utf-8") as lines_file:
    json.dump(replies, lines_file)

And now the dialogues

In [6]:
# Write the parsed dialogues to disk as JSON.
dialogues_json_path = "processed_dialogues.json"
with open(dialogues_json_path, mode="w", encoding="utf-8") as dialogues_file:
    json.dump(dialogues, dialogues_file)

Transform the dialogues into Q&A pairs (this step is currently disabled: the code in the cell below is commented out).

In [7]:
# NOTE: this step is currently disabled -- the code is kept commented out.
# If enabled, it would pair every element of a dialogue with the element that
# follows it and collect all such (current, next) pairs in `qa_pairs`.
# qa_pairs = []
# for lines in dialogues:
#     qa_pairs.extend(zip(lines[:-1], lines[1:]))

Build the character-level vocabulary: count how often each character occurs in the cleaned lines.

In [8]:
from collections import Counter

# Tally how often each character occurs across all cleaned replies.
vocab = Counter()
for reply_text in replies.values():
    # Updating a Counter with a string counts each of its characters.
    vocab.update(reply_text)