# Preprocessing (run only once)

In [None]:
# PRE-PROCESSING STEP 1
# WARNING: This cell will take a while to run, and uses a significant amount of memory

from nltk.corpus.reader.bnc import BNCCorpusReader
import os
import json

# Point the BNC reader at the corpus texts; the fileids pattern restricts it
# to XML files that sit two levels below the top-level directories A-K.
BNC_root_dir = os.path.join('DATA', 'ota_20.500.12024_2554', 'download', 'Texts')
bcr = BNCCorpusReader(BNC_root_dir, fileids=r'[A-K]/\w*/\w*\.xml')
all_sents = bcr.sents()

# Keep only non-empty sentences whose last token is a period; this drops
# questions, titles, and other fragments that do not end in '.'.
sents = [s for s in all_sents if len(s) > 0 and s[-1] == '.']

# Persist the kept sentences so the later pre-processing steps can reload
# them without re-reading the corpus.
sents_path = os.path.join('DATA', 'sents.json')
with open(sents_path, 'w') as f:
    json.dump(sents, f)

You may reset the kernel to free memory between pre-processing steps 1 and 2.

In [None]:
# PRE-PROCESSING STEP 2
# WARNING: This cell will take a while to run, and uses a significant amount of memory

import os
import json
import itertools

from collections import Counter

# Reload the sentences written by pre-processing step 1 (a list of token lists).
with open(os.path.join('DATA', 'sents.json'), 'r') as f:
    sents = json.load(f)

# Count token frequencies straight from the chained iterator, so the corpus
# is never copied into one flat list of tokens.
token_freqs = Counter(itertools.chain.from_iterable(sents))

# The 10000 most frequent tokens get ids 0..9999 in descending frequency
# order. most_common() keeps tokens with equal counts in first-encountered
# order, so the vocabulary is the same on every run.
top_10000 = token_freqs.most_common(10000)
top_10000_dict = {token: idx for idx, (token, _) in enumerate(top_10000)}

# Every token seen in the corpus maps to an id; anything outside the top
# 10000 collapses to the shared <UNK> id 10000.
numberize = {token: top_10000_dict.get(token, 10000) for token in token_freqs}

with open(os.path.join('DATA', 'numberize.json'), 'w') as f:
    json.dump(numberize, f)

# Inverse mapping (id -> token), plus the two special ids: 10000 for
# out-of-vocabulary tokens and 10001 for end-of-sentence padding.
# NOTE: JSON object keys are always strings, so these integer ids are
# written as "0", "1", ... and must be converted back to int after loading.
reverse_numberize = {10000: '<UNK>', 10001: '<EOS>'}
reverse_numberize.update({idx: token for token, idx in top_10000_dict.items()})

with open(os.path.join('DATA', 'reverse_numberize.json'), 'w') as f:
    json.dump(reverse_numberize, f)

You may reset the kernel to free memory between pre-processing steps 2 and 3.

In [None]:
# PRE-PROCESSING STEP 3
# WARNING: This cell will take a while to run, and uses a significant amount of memory

import os
import json

# Reload the sentences (step 1) and the token -> id mapping (step 2).
with open(os.path.join('DATA', 'sents.json'), 'r') as f:
    sents = json.load(f)

with open(os.path.join('DATA', 'numberize.json'), 'r') as f:
    numberize = json.load(f)

# Sentences of 30 tokens or more are dropped. Each remaining sentence is
# converted to token ids and right-padded with the <EOS> id (10001) up to
# exactly 30 entries, so every row ends with at least one <EOS>.
numberized_sents = []
for sent in sents:
    if len(sent) >= 30:
        continue
    ids = [numberize[token] for token in sent]
    ids.extend([10001] * (30 - len(sent)))
    numberized_sents.append(ids)

with open(os.path.join('DATA', 'numberized_sents.json'), 'w') as f:
    json.dump(numberized_sents, f)

You may reset the kernel to free memory between pre-processing steps 3 and 4.

In [None]:
# PRE-PROCESSING STEP 4
# WARNING: This cell will take a while to run, and uses a significant amount of memory

import os
import json
import numpy as np
import tensorflow as tf

# Reload the fixed-length rows of token ids written by pre-processing step 3.
with open(os.path.join('DATA', 'numberized_sents.json'), 'r') as f:
    numberized_sents = json.load(f)

minibatch_dir = os.path.join('DATA', 'MINIBATCHES')
os.makedirs(minibatch_dir, exist_ok=True)

minibatch_size = 500
vocab_size = 10002  # ids 0..9999 for ranked tokens, 10000 <UNK>, 10001 <EOS>
num_full_minibatches = len(numberized_sents) // minibatch_size

# A trailing partial minibatch (if any) is written with the next index after
# the last full one.
has_partial = len(numberized_sents) % minibatch_size != 0
num_minibatches = num_full_minibatches + (1 if has_partial else 0)

# Each minibatch is stored as the three components of a sparse one-hot tensor
# of shape (sentences, positions, vocab_size): the coordinates of the 1s,
# their values, and the dense shape. The coordinates are built directly with
# NumPy instead of materialising the dense one-hot array first, which would
# need sentences * positions * vocab_size entries per minibatch.
for batch_idx in range(num_minibatches):
    start = batch_idx * minibatch_size
    token_ids = np.asarray(numberized_sents[start:start + minibatch_size], dtype=np.int64)

    # One (sentence, position, token id) coordinate per token, in row-major
    # order, which is the order a dense -> sparse conversion would produce.
    sent_idx, pos_idx = np.indices(token_ids.shape, dtype=np.int64)
    indices = np.stack([sent_idx.ravel(), pos_idx.ravel(), token_ids.ravel()], axis=1)

    # An id outside [0, vocab_size) has no 1 anywhere in its one-hot row, so
    # it contributes no sparse entry.
    in_vocab = (indices[:, 2] >= 0) & (indices[:, 2] < vocab_size)
    indices = indices[in_vocab]

    values = np.ones(len(indices), dtype=np.int32)
    shape = np.array(token_ids.shape + (vocab_size,))

    np.save(os.path.join(minibatch_dir, f'{batch_idx}_indices.npy'), indices)
    np.save(os.path.join(minibatch_dir, f'{batch_idx}_values.npy'), values)
    np.save(os.path.join(minibatch_dir, f'{batch_idx}_shape.npy'), shape)