In [1]:
# Load IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell is executed so that edits to them are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
from nltk import ngrams
from nltk.tokenize import RegexpTokenizer, sent_tokenize

## List of documents
* data/trainW_raw.txt: word list from https://github.com/dwyl/english-words/blob/master/words_alpha.txt (370103 words)
* data/trainT_raw.txt: text from George R. R. Martin's [A Storm of Swords](https://en.wikipedia.org/wiki/A_Storm_of_Swords) (~424k words*)
* data/test1_raw.txt: text from George R. R. Martin's [A Dance with Dragons](https://en.wikipedia.org/wiki/A_Dance_with_Dragons) (~422k words*)
* data/test2_raw.txt: text from Margaret Mitchell's [Gone with the Wind](https://en.wikipedia.org/wiki/Gone_with_the_Wind_(novel)) (~418k words*)

\* Approximate word counts taken from https://blog.nathanbransford.com/2018/04/all-about-novel-word-counts

## Tokenize each document and save it to a separate text file

### trainW

In [3]:
# Turn the raw word list into a single line of comma-separated words and keep
# the number of words, which later serves as the token limit for the other texts.
with open('data/trainW_raw.txt') as read_handle, \
        open('data/trainW_token.txt', 'w') as write_handle:
    words = read_handle.read().splitlines()
    word_list_count = len(words)
    print(','.join(words), end='', file=write_handle)

In [4]:
# Number of words in the word list; used below as the token limit for the other texts
word_list_count

370103

### trainT, test1, test2

These texts are trimmed at the end so that the number of tokens in each text matches that of trainW.

In [5]:
def generate_tokenized_sentences(content):
    """Lazily yield the tokenized sentences of a document, one list of tokens at a time.

    The document is split into paragraphs on new lines, each paragraph into
    sentences with ``sent_tokenize``, and each sentence into tokens with a
    ``RegexpTokenizer``. Sentences that produce no token are skipped.

    Args:
        content: The full text of the document as a single string.

    Yields:
        A non-empty list of token strings for each sentence.
    """
    # A token is a run of word characters (\w), dashes (gray-brown) and apostrophes (that's)
    word_tokenizer = RegexpTokenizer(r'[-\'\w]+')

    for paragraph in content.split('\n'):
        for sentence in sent_tokenize(paragraph):
            tokens = word_tokenizer.tokenize(sentence)
            if not tokens:
                # Nothing but punctuation/whitespace in this sentence
                continue
            yield tokens

In [6]:
def tokenize_raw_text(raw_text_path, token_text_path, word_limit):
    """Tokenize a raw text file and write at most ``word_limit`` tokens to a new file.

    The text is lower-cased, a few troublesome character sequences are replaced,
    and the result is split into tokenized sentences by
    ``generate_tokenized_sentences``. Every sentence is written as one line of
    comma-separated tokens. Writing stops as soon as ``word_limit`` tokens have
    been written; a sentence that does not fit entirely is truncated and written
    without a trailing newline.

    Args:
        raw_text_path: Path of the raw text file to read.
        token_text_path: Path of the tokenized file to write (overwritten).
        word_limit: Maximum number of tokens to write. Zero or a negative value
            yields an empty output file.
    """
    # Replace characters that mess up the tokenizers: typographic quotes become
    # their ASCII counterparts, and '--' becomes a comma because the word
    # tokenizer keeps dashes and would otherwise glue the two neighbours together.
    replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}

    # NOTE(review): both files are opened with the platform default encoding;
    # confirm the encoding of the corpora and pass it explicitly if needed.
    with open(raw_text_path) as read_handle, open(token_text_path, 'w') as write_handle:
        # Nothing may be written: leave the (freshly truncated) output empty.
        # Without this guard a negative limit would be used as a slice bound
        # below and emit tokens counted from the end of the first sentence.
        if word_limit <= 0:
            return

        # Read raw text
        content = read_handle.read().lower()
        for symbol, replacement_symbol in replacement_rules.items():
            content = content.replace(symbol, replacement_symbol)

        # Generate tokenized sentences one at a time and write them to file
        # until the word limit is met
        remaining = word_limit
        for tokenized_sentence in generate_tokenized_sentences(content):
            sentence_length = len(tokenized_sentence)
            if sentence_length > remaining:
                # Only part of the sentence fits: write that part and stop
                write_handle.write(','.join(tokenized_sentence[:remaining]))
                break

            write_handle.write(','.join(tokenized_sentence))
            write_handle.write('\n')
            remaining -= sentence_length
            if remaining == 0:
                # Budget used up exactly: no need to tokenize another sentence
                break

In [7]:
# Tokenize the trainT text, limited to as many tokens as the word list has words
tokenize_raw_text(
    raw_text_path='data/trainT_raw.txt',
    token_text_path='data/trainT_token.txt',
    word_limit=word_list_count,
)

In [8]:
# Tokenize the test1 text, limited to as many tokens as the word list has words
tokenize_raw_text(
    raw_text_path='data/test1_raw.txt',
    token_text_path='data/test1_token.txt',
    word_limit=word_list_count,
)

In [9]:
# Tokenize the test2 text, limited to as many tokens as the word list has words
tokenize_raw_text(
    raw_text_path='data/test2_raw.txt',
    token_text_path='data/test2_token.txt',
    word_limit=word_list_count,
)

## Verify token counts of tokenized texts

In [10]:
def get_tokenized_sentences(tokenized_file_name):
    """Yield the sentences stored in a tokenized text file as lists of tokens.

    The file is expected to hold one sentence per line, with the tokens of a
    sentence separated by commas. Empty lines are skipped.

    Args:
        tokenized_file_name: Path of the tokenized text file to read.

    Yields:
        The list of token strings of each non-empty line, in file order.
    """
    # Read everything and close the file before the first sentence is yielded.
    # Yielding from inside the ``with`` block would keep the handle open for as
    # long as the generator is suspended, and indefinitely if it is abandoned.
    with open(tokenized_file_name) as file_handle:
        sentences = file_handle.read().splitlines()

    for sentence in sentences:
        if sentence:
            yield sentence.split(',')

In [11]:
def count_token(tokenized_file_name):
    """Return the total number of tokens stored in a tokenized text file.

    Args:
        tokenized_file_name: Path of the tokenized text file to read.

    Returns:
        The sum of the token counts of all sentences yielded by
        ``get_tokenized_sentences`` for that file.
    """
    token_total = 0
    for sentence_tokens in get_tokenized_sentences(tokenized_file_name):
        token_total += len(sentence_tokens)
    return token_total

In [12]:
# Token count of the tokenized word list
count_token(tokenized_file_name='data/trainW_token.txt')

370103

In [13]:
# Token count of the tokenized trainT text
count_token(tokenized_file_name='data/trainT_token.txt')

370103

In [14]:
# Token count of the tokenized test1 text
count_token(tokenized_file_name='data/test1_token.txt')

370103

In [15]:
# Token count of the tokenized test2 text
count_token(tokenized_file_name='data/test2_token.txt')

370103