In [3]:
from os import path, walk

from xml.sax.handler import ContentHandler
from xml.sax import SAXException, make_parser


class OpenSubtitlesHandler(ContentHandler):
    """SAX content handler that collects the words of one XML document.

    Every ``<s>`` element opens a new sentence; the text found inside every
    ``<w>`` element becomes one token of the most recently opened sentence.

    Attributes:
        sentences: list of sentences, each sentence being a list of tokens.
            It is replaced by a fresh list at the start of every document.
        within_word: True while the parser is between ``<w>`` and ``</w>``.
    """

    def __init__(self):
        ContentHandler.__init__(self)
        # Make the attributes available even before the first parse.
        self.initialize()

    def initialize(self):
        """Reset all per-document state."""
        self.sentences = []
        self.within_word = False
        # Text chunks received so far for the word currently being read.
        self._word_chunks = []

    def startDocument(self):
        """Start every document with empty state (a new ``sentences`` list)."""
        self.initialize()

    def startElement(self, tag, attrs):
        """Open a new sentence on ``<s>`` and start buffering text on ``<w>``."""
        if tag == 's':
            self.sentences.append([])
        elif tag == 'w':
            self.within_word = True
            self._word_chunks = []

    def endElement(self, tag):
        """On ``</w>``, emit the buffered text as a single token."""
        if tag != 'w':
            return
        self.within_word = False
        word = ''.join(self._word_chunks)
        self._word_chunks = []
        if not word:
            # An empty <w/> produces no token.
            return
        if not self.sentences:
            # A word that appears before any <s>: keep it in an implicit
            # sentence instead of failing on sentences[-1].
            self.sentences.append([])
        self.sentences[-1].append(word)

    def characters(self, content):
        """Buffer character data that belongs to the current word.

        A SAX parser is allowed to deliver one contiguous run of text in
        several calls, so the chunks are only joined into a token once the
        closing ``</w>`` is seen.
        """
        if self.within_word:
            self._word_chunks.append(content)


def parse_corpus(text_root):
    """Parse every XML file found under ``text_root``.

    The directory tree is walked recursively and each file whose name ends
    with ``'xml'`` is fed to a single SAX parser driven by an
    ``OpenSubtitlesHandler``.

    Returns:
        dict mapping the path of each parsed file to the sentence list the
        handler collected for that file.
    """
    sax_handler = OpenSubtitlesHandler()
    xml_parser = make_parser()
    xml_parser.setContentHandler(sax_handler)

    corpus = {}
    for dirpath, _subdirs, filenames in walk(text_root):
        xml_paths = [
            path.join(dirpath, name)
            for name in filenames
            if name.endswith('xml')
        ]
        for xml_path in xml_paths:
            xml_parser.parse(xml_path)
            # The handler builds a new list for every document, so the
            # reference stored here is not touched by later parses.
            corpus[xml_path] = sax_handler.sentences
    return corpus

In [4]:
from collections import defaultdict
from operator import itemgetter


def make_vocabulary(in_parsed_docs, limit=100000):
    """Return the set of the ``limit`` most frequent tokens.

    Tokens are counted case-insensitively (lower-cased), except for the
    exact token ``'I'`` which is kept as is.

    Args:
        in_parsed_docs: iterable of documents; each document is an iterable
            of sentences and each sentence an iterable of string tokens.
        limit: maximum number of distinct tokens to keep.

    Returns:
        set of the normalised tokens with the highest counts.
    """
    frequencies = defaultdict(int)
    for document in in_parsed_docs:
        for sentence in document:
            for token in sentence:
                key = token if token == 'I' else token.lower()
                frequencies[key] += 1

    ranked = list(frequencies.items())
    # Most frequent first; the sort is stable, so ties keep counting order.
    ranked.sort(key=itemgetter(1), reverse=True)
    return set(token for token, _count in ranked[:limit])

In [18]:
UNK = '__UNK__'


def preprocess_text(in_parsed_docs):
    """Normalise all documents and mask out-of-vocabulary tokens.

    Every token is lower-cased (the exact token ``'I'`` is left untouched)
    and then replaced by ``UNK`` unless it belongs to the vocabulary that
    ``make_vocabulary`` builds from these same documents.

    Args:
        in_parsed_docs: dict whose values are documents (lists of sentences,
            each sentence a list of tokens); the keys are ignored.

    Returns:
        list of documents with the same nesting as the input values.
    """
    documents = in_parsed_docs.values()
    known_words = make_vocabulary(documents)

    def normalize(token):
        return token if token == 'I' else token.lower()

    def mask_unknown(token):
        return token if token in known_words else UNK

    return [
        [
            [mask_unknown(normalize(token)) for token in sentence]
            for sentence in document
        ]
        for document in documents
    ]

In [20]:
from random import shuffle
from os import path, makedirs

TESTSET_SIZE_RATIO = 0.2


def _write_parallel_files(enc_path, dec_path, qa_pairs):
    """Write questions to ``enc_path`` and answers to ``dec_path``, one pair per line."""
    import io

    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and does not depend on the platform's default encoding.
    with io.open(enc_path, 'w', encoding='utf-8') as enc_file:
        with io.open(dec_path, 'w', encoding='utf-8') as dec_file:
            for question, answer in qa_pairs:
                enc_file.write(u' '.join(question) + u'\n')
                dec_file.write(u' '.join(answer) + u'\n')


def prepare_seq2seq_files(in_processed_docs, in_result_path, seed=None,
                          testset_size_ratio=None):
    """Write shuffled question/answer pairs as train/test seq2seq files.

    Inside every document, sentences are paired two by two: sentence 0 is the
    question for sentence 1, sentence 2 for sentence 3, and so on (a trailing
    unpaired sentence is dropped). All pairs are shuffled, split into a train
    and a test part, and written to ``train.enc`` / ``train.dec`` and
    ``test.enc`` / ``test.dec`` inside ``in_result_path``. Each line holds the
    tokens of one sentence joined by single spaces, encoded as UTF-8.

    Args:
        in_processed_docs: iterable of documents, each a list of sentences,
            each sentence a list of string tokens.
        in_result_path: output directory; created if it does not exist.
        seed: optional seed for a private random generator so the shuffle is
            reproducible. With ``None`` the module-level ``shuffle`` is used,
            exactly as before.
        testset_size_ratio: fraction of the pairs that goes to the test set.
            With ``None`` the module-level ``TESTSET_SIZE_RATIO`` is used.

    Raises:
        ValueError: if the test set ratio is outside ``[0, 1]``.
    """
    from random import Random

    ratio = TESTSET_SIZE_RATIO if testset_size_ratio is None else testset_size_ratio
    if not 0.0 <= ratio <= 1.0:
        raise ValueError('testset_size_ratio must be within [0, 1], got {}'.format(ratio))

    if not path.exists(in_result_path):
        makedirs(in_result_path)

    qa_data = []
    for doc in in_processed_docs:
        # Even-indexed sentences are questions, the following ones answers.
        qa_data.extend(zip(doc[::2], doc[1::2]))

    if seed is None:
        shuffle(qa_data)
    else:
        Random(seed).shuffle(qa_data)

    trainset_size = int((1 - ratio) * len(qa_data))
    qa_train, qa_test = qa_data[:trainset_size], qa_data[trainset_size:]

    _write_parallel_files(
        path.join(in_result_path, 'train.enc'),
        path.join(in_result_path, 'train.dec'),
        qa_train,
    )
    _write_parallel_files(
        path.join(in_result_path, 'test.enc'),
        path.join(in_result_path, 'test.dec'),
        qa_test,
    )

In [21]:
# Run the whole pipeline end to end.
# ORIGINAL_CORPUS_ROOT is a relative path to the directory tree that holds
# the corpus XML files.
ORIGINAL_CORPUS_ROOT = 'OpenSubtitles'
# Map every parsed XML file path to its list of tokenised sentences.
parsed_texts = parse_corpus(ORIGINAL_CORPUS_ROOT)
# Lower-case the tokens and replace out-of-vocabulary ones with UNK.
# `processed_texts` is reused by the corpus statistics cell further down.
processed_texts = preprocess_text(parsed_texts)
# Write train/test .enc/.dec files into the output directory.
prepare_seq2seq_files(processed_texts, 'opensubtitles_seq2seq_dataset')

Corpus Info
==

In [23]:
# Corpus statistics over the preprocessed documents: each document is a list
# of sentences and each sentence a list of tokens.
sentences_number = sum(len(doc) for doc in processed_texts)
words_number = sum(
    len(sentence)
    for doc in processed_texts
    for sentence in doc
)

# A single parenthesised argument prints the same text with the Python 2
# print statement and the Python 3 print function.
print('Documents number:\t{}'.format(len(processed_texts)))
print('Sentences number:\t{}'.format(sentences_number))
print('Words number:\t{}'.format(words_number))

Documents number:	2317
Sentences number:	2739528
Words number:	19922136
