In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096_simplified')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')[:20000]), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [4]:
articles = Articles('page_revisions_text_simplified')

In [5]:
steps = 13

for i in range(steps):
    batch_size = 4 * 2**i
    batch_item_length = 3 * 2**(steps - i - 1)
    count = articles.steps(batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      4	 batch item length: 12288	steps per epoch:   3445
batch size:      8	 batch item length: 6144	steps per epoch:   3598
batch size:     16	 batch item length: 3072	steps per epoch:   2640
batch size:     32	 batch item length: 1536	steps per epoch:   1923
batch size:     64	 batch item length:  768	steps per epoch:   1526
batch size:    128	 batch item length:  384	steps per epoch:   1370
batch size:    256	 batch item length:  192	steps per epoch:   1364
batch size:    512	 batch item length:   96	steps per epoch:   1573
batch size:   1024	 batch item length:   48	steps per epoch:   2066
batch size:   2048	 batch item length:   24	steps per epoch:   3152
batch size:   4096	 batch item length:   12	steps per epoch:   5362
batch size:   8192	 batch item length:    6	steps per epoch:  10010
batch size:  16384	 batch item length:    3	steps per epoch:  19471


In [6]:
model = Model(articles, './training_checkpoints-18',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [7]:
model.training_model(256, 192).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 192)                0         
_________________________________________________________________
embedding (Embedding)        (256, 192, 512)           2071552   
_________________________________________________________________
lstm (LSTM)                  (256, 192, 1024)          6295552   
_________________________________________________________________
lstm_1 (LSTM)                (256, 192, 1024)          8392704   
_________________________________________________________________
dense (Dense)                (256, 192, 4046)          4147150   
Total params: 20,906,958
Trainable params: 20,906,958
Non-trainable params: 0
_________________________________________________________________


In [8]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.category_count = category_count
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)
    
    def archive_size(self, model, text):
        archived_size = math.ceil(math.log2(self.category_count))
        input_eval = np.array([[text[0]]], dtype=TYPE)

        model.predicting_model.reset_states()

        for byte in text[1:]:
            predictions = model.predict(input_eval)
            predictions = tf.squeeze(predictions, 0) # remove the batch dimension

            weights = tf.nn.softmax(predictions[0]).numpy()
            self.load_weights(weights)
            archived_size += self.get_code_length(byte.item())

            input_eval = tf.expand_dims([byte], 0)

        return archived_size

In [11]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 100:	Length: 19	Avg Compression: 0.203947
Article 200:	Length: 20	Avg Compression: 0.217949
Article 300:	Length: 20	Avg Compression: 0.226695
Article 400:	Length: 21	Avg Compression: 0.217188
Article 500:	Length: 22	Avg Compression: 0.215686
Article 600:	Length: 22	Avg Compression: 0.215726
Article 700:	Length: 23	Avg Compression: 0.209184
Article 800:	Length: 23	Avg Compression: 0.204412
Article 900:	Length: 24	Avg Compression: 0.211340
Article 1000:	Length: 24	Avg Compression: 0.210436
Article 1100:	Length: 25	Avg Compression: 0.208333
Article 1200:	Length: 25	Avg Compression: 0.207556
Article 1300:	Length: 26	Avg Compression: 0.204932
Article 1400:	Length: 26	Avg Compression: 0.203906
Article 1500:	Length: 27	Avg Compression: 0.202450
Article 1600:	Length: 27	Avg Compression: 0.198864
Article 1700:	Length: 26	Avg Compression: 0.198437
Article 1800:	Length: 28	Avg Compression: 0.199182
Article 1900:	Length: 28	Avg Compression: 0.196546
Article 2000:	Length: 29	Avg Compression

Article 15700:	Length: 15458	Avg Compression: 0.162551
Article 15800:	Length: 15863	Avg Compression: 0.162351
Article 15900:	Length: 16251	Avg Compression: 0.162102
Article 16000:	Length: 16659	Avg Compression: 0.162590
Article 16100:	Length: 17160	Avg Compression: 0.162667
Article 16200:	Length: 17803	Avg Compression: 0.162183
Article 16300:	Length: 18164	Avg Compression: 0.161528
Article 16400:	Length: 18721	Avg Compression: 0.161845
Article 16500:	Length: 19210	Avg Compression: 0.161548
Article 16600:	Length: 19953	Avg Compression: 0.161535
Article 16700:	Length: 20377	Avg Compression: 0.161883
Article 16800:	Length: 21243	Avg Compression: 0.162671
Article 16900:	Length: 21827	Avg Compression: 0.163234
Article 17000:	Length: 22641	Avg Compression: 0.162766
Article 17100:	Length: 23417	Avg Compression: 0.162687
Article 17200:	Length: 24276	Avg Compression: 0.163733
Article 17300:	Length: 25151	Avg Compression: 0.164242
Article 17400:	Length: 25969	Avg Compression: 0.164611
Article 17

In [12]:
model.train(256, 192, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 100:	Length: 19	Avg Compression: 0.210526
Article 200:	Length: 20	Avg Compression: 0.221154
Article 300:	Length: 20	Avg Compression: 0.228814
Article 400:	Length: 21	Avg Compression: 0.218750
Article 500:	Length: 22	Avg Compression: 0.216912
Article 600:	Length: 22	Avg Compression: 0.216734
Article 700:	Length: 23	Avg Compression: 0.210034
Article 800:	Length: 23	Avg Compression: 0.208824
Article 900:	Length: 24	Avg Compression: 0.213918
Article 1000:	Length: 24	Avg Compression: 0.211009
Article 1100:	Length: 25	Avg Compression: 0.207819
Article 1200:	Length: 25	Avg Compression: 0.206623
Article 1300:	Length: 26	Avg Compression: 0.204932
Article 1400:	Length: 26	Avg Compression: 0.203125
Article 1500:	Length: 27	Avg Compression: 0.202450
Article 1600:	Length: 27	Avg Compression: 0.198864
Article 1700:	Length: 26	Avg Compression: 0.199375
Article 1800:	Length: 28	Avg Compression: 0.199182
Article 1900:	Length: 28	Avg Compression: 0.196272
Article 2000:	Length: 29	Avg Compression

Article 15700:	Length: 15458	Avg Compression: 0.161927
Article 15800:	Length: 15863	Avg Compression: 0.161715
Article 15900:	Length: 16251	Avg Compression: 0.161492
Article 16000:	Length: 16659	Avg Compression: 0.162004
Article 16100:	Length: 17160	Avg Compression: 0.162144
Article 16200:	Length: 17803	Avg Compression: 0.161739
Article 16300:	Length: 18164	Avg Compression: 0.161107
Article 16400:	Length: 18721	Avg Compression: 0.161421
Article 16500:	Length: 19210	Avg Compression: 0.161144
Article 16600:	Length: 19953	Avg Compression: 0.161127
Article 16700:	Length: 20377	Avg Compression: 0.161508
Article 16800:	Length: 21243	Avg Compression: 0.162296
Article 16900:	Length: 21827	Avg Compression: 0.162880
Article 17000:	Length: 22641	Avg Compression: 0.162426
Article 17100:	Length: 23417	Avg Compression: 0.162375
Article 17200:	Length: 24276	Avg Compression: 0.163411
Article 17300:	Length: 25151	Avg Compression: 0.163897
Article 17400:	Length: 25969	Avg Compression: 0.164279
Article 17

In [8]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.218750
Article 400:	Length: 21	Avg Compression: 0.216463
Article 600:	Length: 22	Avg Compression: 0.214286
Article 800:	Length: 23	Avg Compression: 0.202035
Article 1000:	Length: 24	Avg Compression: 0.203409
Article 1200:	Length: 25	Avg Compression: 0.212963
Article 1400:	Length: 26	Avg Compression: 0.205745
Article 1600:	Length: 27	Avg Compression: 0.196809
Article 1800:	Length: 28	Avg Compression: 0.198495
Article 2000:	Length: 29	Avg Compression: 0.193878
Article 2200:	Length: 30	Avg Compression: 0.189545
Article 2400:	Length: 30	Avg Compression: 0.186066
Article 2600:	Length: 31	Avg Compression: 0.182664
Article 2800:	Length: 32	Avg Compression: 0.177649
Article 3000:	Length: 33	Avg Compression: 0.174875
Article 3200:	Length: 34	Avg Compression: 0.174713
Article 3400:	Length: 35	Avg Compression: 0.171809
Article 3600:	Length: 36	Avg Compression: 0.169960
Article 3800:	Length: 38	Avg Compression: 0.167279
Article 4000:	Length: 40	Avg Compre

In [12]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.206250
Article 400:	Length: 21	Avg Compression: 0.207317
Article 600:	Length: 22	Avg Compression: 0.206349
Article 800:	Length: 23	Avg Compression: 0.196221
Article 1000:	Length: 24	Avg Compression: 0.200000
Article 1200:	Length: 25	Avg Compression: 0.210185
Article 1400:	Length: 26	Avg Compression: 0.204193
Article 1600:	Length: 27	Avg Compression: 0.195479
Article 1800:	Length: 28	Avg Compression: 0.193866
Article 2000:	Length: 29	Avg Compression: 0.190816
Article 2200:	Length: 30	Avg Compression: 0.186818
Article 2400:	Length: 30	Avg Compression: 0.184836
Article 2600:	Length: 31	Avg Compression: 0.181176
Article 2800:	Length: 32	Avg Compression: 0.176291
Article 3000:	Length: 33	Avg Compression: 0.174252
Article 3200:	Length: 34	Avg Compression: 0.173563
Article 3400:	Length: 35	Avg Compression: 0.170479
Article 3600:	Length: 36	Avg Compression: 0.168725
Article 3800:	Length: 38	Avg Compression: 0.165901
Article 4000:	Length: 40	Avg Compre

In [14]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.187500
Article 400:	Length: 21	Avg Compression: 0.198171
Article 600:	Length: 22	Avg Compression: 0.196429
Article 800:	Length: 23	Avg Compression: 0.188953
Article 1000:	Length: 24	Avg Compression: 0.195455
Article 1200:	Length: 25	Avg Compression: 0.209259
Article 1400:	Length: 26	Avg Compression: 0.201863
Article 1600:	Length: 27	Avg Compression: 0.194814
Article 1800:	Length: 28	Avg Compression: 0.192708
Article 2000:	Length: 29	Avg Compression: 0.188265
Article 2200:	Length: 30	Avg Compression: 0.183636
Article 2400:	Length: 30	Avg Compression: 0.180328
Article 2600:	Length: 31	Avg Compression: 0.176339
Article 2800:	Length: 32	Avg Compression: 0.171535
Article 3000:	Length: 33	Avg Compression: 0.168641
Article 3200:	Length: 34	Avg Compression: 0.168966
Article 3400:	Length: 35	Avg Compression: 0.166223
Article 3600:	Length: 36	Avg Compression: 0.164032
Article 3800:	Length: 38	Avg Compression: 0.161075
Article 4000:	Length: 40	Avg Compre

In [16]:
model.train(256, 192, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [17]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.187500
Article 400:	Length: 21	Avg Compression: 0.189024
Article 600:	Length: 22	Avg Compression: 0.188492
Article 800:	Length: 23	Avg Compression: 0.183140
Article 1000:	Length: 24	Avg Compression: 0.188636
Article 1200:	Length: 25	Avg Compression: 0.200926
Article 1400:	Length: 26	Avg Compression: 0.196429
Article 1600:	Length: 27	Avg Compression: 0.188830
Article 1800:	Length: 28	Avg Compression: 0.187500
Article 2000:	Length: 29	Avg Compression: 0.183163
Article 2200:	Length: 30	Avg Compression: 0.178182
Article 2400:	Length: 30	Avg Compression: 0.176230
Article 2600:	Length: 31	Avg Compression: 0.173363
Article 2800:	Length: 32	Avg Compression: 0.169497
Article 3000:	Length: 33	Avg Compression: 0.167082
Article 3200:	Length: 34	Avg Compression: 0.167241
Article 3400:	Length: 35	Avg Compression: 0.164628
Article 3600:	Length: 36	Avg Compression: 0.162796
Article 3800:	Length: 38	Avg Compression: 0.160156
Article 4000:	Length: 40	Avg Compre

In [18]:
model.train(256, 192, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.250000
Article 400:	Length: 21	Avg Compression: 0.253049
Article 600:	Length: 22	Avg Compression: 0.232143
Article 800:	Length: 23	Avg Compression: 0.216570
Article 1000:	Length: 24	Avg Compression: 0.204545
Article 1200:	Length: 25	Avg Compression: 0.200000
Article 1400:	Length: 26	Avg Compression: 0.198758
Article 1600:	Length: 27	Avg Compression: 0.195479
Article 1800:	Length: 28	Avg Compression: 0.192130
Article 2000:	Length: 29	Avg Compression: 0.191837
Article 2200:	Length: 30	Avg Compression: 0.190455
Article 2400:	Length: 30	Avg Compression: 0.185656
Article 2600:	Length: 31	Avg Compression: 0.183036
Article 2800:	Length: 32	Avg Compression: 0.177310
Article 3000:	Length: 45	Avg Compression: 0.172215
Article 3200:	Length: 34	Avg Compression: 0.170582
Article 3400:	Length: 35	Avg Compression: 0.168309
Article 3600:	Length: 36	Avg Compression: 0.164093
Article 3800:	Length: 38	Avg Compression: 0.161421
Article 4000:	Length: 40	Avg Compre

In [11]:
model.train(256, 192, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 20	Avg Compression: 0.237500
Article 400:	Length: 21	Avg Compression: 0.253049
Article 600:	Length: 22	Avg Compression: 0.236111
Article 800:	Length: 23	Avg Compression: 0.218023
Article 1000:	Length: 24	Avg Compression: 0.204545
Article 1200:	Length: 25	Avg Compression: 0.199074
Article 1400:	Length: 26	Avg Compression: 0.196429
Article 1600:	Length: 27	Avg Compression: 0.192819
Article 1800:	Length: 28	Avg Compression: 0.188657
Article 2000:	Length: 29	Avg Compression: 0.189286
Article 2200:	Length: 30	Avg Compression: 0.188636
Article 2400:	Length: 30	Avg Compression: 0.183607
Article 2600:	Length: 31	Avg Compression: 0.181920
Article 2800:	Length: 32	Avg Compression: 0.176630
Article 3000:	Length: 45	Avg Compression: 0.173729
Article 3200:	Length: 34	Avg Compression: 0.172539
Article 3400:	Length: 35	Avg Compression: 0.170902
Article 3600:	Length: 36	Avg Compression: 0.166988
Article 3800:	Length: 38	Avg Compression: 0.164793
Article 4000:	Length: 40	Avg Compre

In [13]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096_simplified')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [14]:
articles = Articles('page_revisions_text_simplified')

In [15]:
model = Model(articles, './training_checkpoints-18',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [16]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.250000
Article 400:	Length: 17	Avg Compression: 0.253676
Article 600:	Length: 18	Avg Compression: 0.262019
Article 800:	Length: 18	Avg Compression: 0.275000
Article 1000:	Length: 18	Avg Compression: 0.274148
Article 1200:	Length: 19	Avg Compression: 0.273364
Article 1400:	Length: 19	Avg Compression: 0.269841
Article 1600:	Length: 19	Avg Compression: 0.268966
Article 1800:	Length: 19	Avg Compression: 0.268293
Article 2000:	Length: 19	Avg Compression: 0.266393
Article 2200:	Length: 20	Avg Compression: 0.261700
Article 2400:	Length: 20	Avg Compression: 0.258408
Article 2600:	Length: 20	Avg Compression: 0.262860
Article 2800:	Length: 19	Avg Compression: 0.263359
Article 3000:	Length: 20	Avg Compression: 0.266401
Article 3200:	Length: 20	Avg Compression: 0.263245
Article 3400:	Length: 20	Avg Compression: 0.261258
Article 3600:	Length: 20	Avg Compression: 0.260234
Article 3800:	Length: 21	Avg Compression: 0.257231
Article 4000:	Length: 20	Avg Compre

Article 32400:	Length: 34	Avg Compression: 0.222865
Article 32600:	Length: 34	Avg Compression: 0.223532
Article 32800:	Length: 34	Avg Compression: 0.223708
Article 33000:	Length: 34	Avg Compression: 0.223403
Article 33200:	Length: 35	Avg Compression: 0.225285
Article 33400:	Length: 35	Avg Compression: 0.224673
Article 33600:	Length: 35	Avg Compression: 0.224731
Article 33800:	Length: 35	Avg Compression: 0.224488
Article 34000:	Length: 35	Avg Compression: 0.224410
Article 34200:	Length: 35	Avg Compression: 0.224119
Article 34400:	Length: 35	Avg Compression: 0.224419
Article 34600:	Length: 35	Avg Compression: 0.223974
Article 34800:	Length: 36	Avg Compression: 0.223331
Article 35000:	Length: 36	Avg Compression: 0.223036
Article 35200:	Length: 48	Avg Compression: 0.222890
Article 35400:	Length: 54	Avg Compression: 0.222219
Article 35600:	Length: 48	Avg Compression: 0.222009
Article 35800:	Length: 48	Avg Compression: 0.222103
Article 36000:	Length: 36	Avg Compression: 0.221557
Article 3620

Article 63800:	Length: 377	Avg Compression: 0.194896
Article 64000:	Length: 381	Avg Compression: 0.194901
Article 64200:	Length: 390	Avg Compression: 0.194466
Article 64400:	Length: 400	Avg Compression: 0.193340
Article 64600:	Length: 408	Avg Compression: 0.192040
Article 64800:	Length: 415	Avg Compression: 0.192138
Article 65000:	Length: 422	Avg Compression: 0.195673
Article 65200:	Length: 426	Avg Compression: 0.194977
Article 65400:	Length: 438	Avg Compression: 0.194834
Article 65600:	Length: 446	Avg Compression: 0.194878
Article 65800:	Length: 453	Avg Compression: 0.196065
Article 66000:	Length: 460	Avg Compression: 0.195409
Article 66200:	Length: 468	Avg Compression: 0.194322
Article 66400:	Length: 475	Avg Compression: 0.193912
Article 66600:	Length: 482	Avg Compression: 0.193135
Article 66800:	Length: 500	Avg Compression: 0.192136
Article 67000:	Length: 497	Avg Compression: 0.192127
Article 67200:	Length: 470	Avg Compression: 0.192955
Article 67400:	Length: 511	Avg Compression: 0.

Article 94600:	Length: 1600	Avg Compression: 0.178225
Article 94800:	Length: 1607	Avg Compression: 0.178110
Article 95000:	Length: 1595	Avg Compression: 0.178422
Article 95200:	Length: 1637	Avg Compression: 0.178366
Article 95400:	Length: 1633	Avg Compression: 0.178410
Article 95600:	Length: 1651	Avg Compression: 0.178221
Article 95800:	Length: 1634	Avg Compression: 0.177780
Article 96000:	Length: 1670	Avg Compression: 0.177774
Article 96200:	Length: 1679	Avg Compression: 0.178013
Article 96400:	Length: 1689	Avg Compression: 0.178002
Article 96600:	Length: 1690	Avg Compression: 0.177891
Article 96800:	Length: 1685	Avg Compression: 0.177981
Article 97000:	Length: 1718	Avg Compression: 0.177918
Article 97200:	Length: 1729	Avg Compression: 0.177751
Article 97400:	Length: 1752	Avg Compression: 0.177448
Article 97600:	Length: 1745	Avg Compression: 0.177519
Article 97800:	Length: 1759	Avg Compression: 0.177194
Article 98000:	Length: 1788	Avg Compression: 0.177177
Article 98200:	Length: 1778	

Article 124600:	Length: 2634	Avg Compression: 0.146594
Article 124800:	Length: 2635	Avg Compression: 0.146164
Article 125000:	Length: 2637	Avg Compression: 0.145735
Article 125200:	Length: 2638	Avg Compression: 0.145309
Article 125400:	Length: 2640	Avg Compression: 0.144909
Article 125600:	Length: 2641	Avg Compression: 0.144507
Article 125800:	Length: 2643	Avg Compression: 0.144088
Article 126000:	Length: 2644	Avg Compression: 0.143680
Article 126200:	Length: 2646	Avg Compression: 0.143301
Article 126400:	Length: 2649	Avg Compression: 0.142899
Article 126600:	Length: 2651	Avg Compression: 0.142511
Article 126800:	Length: 2610	Avg Compression: 0.142685
Article 127000:	Length: 2645	Avg Compression: 0.143522
Article 127200:	Length: 2654	Avg Compression: 0.143412
Article 127400:	Length: 2656	Avg Compression: 0.143020
Article 127600:	Length: 2658	Avg Compression: 0.142640
Article 127800:	Length: 2640	Avg Compression: 0.142888
Article 128000:	Length: 2664	Avg Compression: 0.142476
Article 12

Article 154400:	Length: 3608	Avg Compression: 0.134785
Article 154600:	Length: 3555	Avg Compression: 0.134997
Article 154800:	Length: 3627	Avg Compression: 0.134802
Article 155000:	Length: 3638	Avg Compression: 0.134637
Article 155200:	Length: 3652	Avg Compression: 0.134852
Article 155400:	Length: 3661	Avg Compression: 0.134684
Article 155600:	Length: 3683	Avg Compression: 0.134868
Article 155800:	Length: 3631	Avg Compression: 0.134991
Article 156000:	Length: 3699	Avg Compression: 0.135092
Article 156200:	Length: 3701	Avg Compression: 0.135354
Article 156400:	Length: 3676	Avg Compression: 0.135715
Article 156600:	Length: 3737	Avg Compression: 0.135550
Article 156800:	Length: 3735	Avg Compression: 0.135747
Article 157000:	Length: 3765	Avg Compression: 0.135894
Article 157200:	Length: 3809	Avg Compression: 0.135839
Article 157400:	Length: 3792	Avg Compression: 0.136016
Article 157600:	Length: 3801	Avg Compression: 0.136153
Article 157800:	Length: 3773	Avg Compression: 0.136407
Article 15

Article 184200:	Length: 6968	Avg Compression: 0.147595
Article 184400:	Length: 6972	Avg Compression: 0.147754
Article 184600:	Length: 7022	Avg Compression: 0.147925
Article 184800:	Length: 7089	Avg Compression: 0.147946
Article 185000:	Length: 7122	Avg Compression: 0.148027
Article 185200:	Length: 7132	Avg Compression: 0.148184
Article 185400:	Length: 7203	Avg Compression: 0.148370
Article 185600:	Length: 7206	Avg Compression: 0.148531
Article 185800:	Length: 7241	Avg Compression: 0.148672
Article 186000:	Length: 7349	Avg Compression: 0.148807
Article 186200:	Length: 7365	Avg Compression: 0.149030
Article 186400:	Length: 7418	Avg Compression: 0.148999
Article 186600:	Length: 7465	Avg Compression: 0.149095
Article 186800:	Length: 6993	Avg Compression: 0.149386
Article 187000:	Length: 7519	Avg Compression: 0.149382
Article 187200:	Length: 7600	Avg Compression: 0.149524
Article 187400:	Length: 7648	Avg Compression: 0.149605
Article 187600:	Length: 7678	Avg Compression: 0.149822
Article 18

Article 213800:	Length: 29493	Avg Compression: 0.165482
Article 214000:	Length: 30224	Avg Compression: 0.165585
Article 214200:	Length: 31124	Avg Compression: 0.165541
Article 214400:	Length: 32008	Avg Compression: 0.165545
Article 214600:	Length: 33170	Avg Compression: 0.165428
Article 214800:	Length: 34078	Avg Compression: 0.165346
Article 215000:	Length: 34951	Avg Compression: 0.165276
Article 215200:	Length: 36205	Avg Compression: 0.165104
Article 215400:	Length: 37382	Avg Compression: 0.165446
Article 215600:	Length: 38921	Avg Compression: 0.165460
Article 215800:	Length: 40871	Avg Compression: 0.165646
Article 216000:	Length: 42918	Avg Compression: 0.165725
Article 216200:	Length: 45047	Avg Compression: 0.165848
Article 216400:	Length: 47661	Avg Compression: 0.165842
Article 216600:	Length: 50700	Avg Compression: 0.166249
Article 216800:	Length: 56066	Avg Compression: 0.165780
Article 217000:	Length: 63529	Avg Compression: 0.165694
Article 217200:	Length: 83740	Avg Compression: 0

In [18]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.242647
Article 400:	Length: 17	Avg Compression: 0.253676
Article 600:	Length: 18	Avg Compression: 0.252404
Article 800:	Length: 18	Avg Compression: 0.264286
Article 1000:	Length: 18	Avg Compression: 0.261364
Article 1200:	Length: 19	Avg Compression: 0.262850
Article 1400:	Length: 19	Avg Compression: 0.260913
Article 1600:	Length: 19	Avg Compression: 0.260345
Article 1800:	Length: 19	Avg Compression: 0.259146
Article 2000:	Length: 19	Avg Compression: 0.256831
Article 2200:	Length: 20	Avg Compression: 0.250616
Article 2400:	Length: 20	Avg Compression: 0.246637
Article 2600:	Length: 20	Avg Compression: 0.250514
Article 2800:	Length: 19	Avg Compression: 0.251431
Article 3000:	Length: 20	Avg Compression: 0.253546
Article 3200:	Length: 20	Avg Compression: 0.250414
Article 3400:	Length: 20	Avg Compression: 0.248059
Article 3600:	Length: 20	Avg Compression: 0.247442
Article 3800:	Length: 21	Avg Compression: 0.244490
Article 4000:	Length: 20	Avg Compre

Article 32200:	Length: 34	Avg Compression: 0.210586
Article 32400:	Length: 34	Avg Compression: 0.210117
Article 32600:	Length: 34	Avg Compression: 0.210913
Article 32800:	Length: 34	Avg Compression: 0.211100
Article 33000:	Length: 34	Avg Compression: 0.210892
Article 33200:	Length: 35	Avg Compression: 0.212676
Article 33400:	Length: 35	Avg Compression: 0.212107
Article 33600:	Length: 35	Avg Compression: 0.212151
Article 33800:	Length: 35	Avg Compression: 0.211977
Article 34000:	Length: 35	Avg Compression: 0.211805
Article 34200:	Length: 35	Avg Compression: 0.211501
Article 34400:	Length: 35	Avg Compression: 0.211895
Article 34600:	Length: 35	Avg Compression: 0.211490
Article 34800:	Length: 36	Avg Compression: 0.210915
Article 35000:	Length: 36	Avg Compression: 0.210713
Article 35200:	Length: 48	Avg Compression: 0.210431
Article 35400:	Length: 54	Avg Compression: 0.209847
Article 35600:	Length: 48	Avg Compression: 0.209681
Article 35800:	Length: 48	Avg Compression: 0.209968
Article 3600

Article 63600:	Length: 370	Avg Compression: 0.184649
Article 63800:	Length: 377	Avg Compression: 0.183826
Article 64000:	Length: 381	Avg Compression: 0.183853
Article 64200:	Length: 390	Avg Compression: 0.183404
Article 64400:	Length: 400	Avg Compression: 0.182247
Article 64600:	Length: 408	Avg Compression: 0.180980
Article 64800:	Length: 415	Avg Compression: 0.181029
Article 65000:	Length: 422	Avg Compression: 0.184366
Article 65200:	Length: 426	Avg Compression: 0.183540
Article 65400:	Length: 438	Avg Compression: 0.183402
Article 65600:	Length: 446	Avg Compression: 0.183530
Article 65800:	Length: 453	Avg Compression: 0.184526
Article 66000:	Length: 460	Avg Compression: 0.183993
Article 66200:	Length: 468	Avg Compression: 0.182951
Article 66400:	Length: 475	Avg Compression: 0.182668
Article 66600:	Length: 482	Avg Compression: 0.181961
Article 66800:	Length: 500	Avg Compression: 0.180955
Article 67000:	Length: 497	Avg Compression: 0.180993
Article 67200:	Length: 470	Avg Compression: 0.

Article 94400:	Length: 1595	Avg Compression: 0.168271
Article 94600:	Length: 1600	Avg Compression: 0.168329
Article 94800:	Length: 1607	Avg Compression: 0.168188
Article 95000:	Length: 1595	Avg Compression: 0.168532
Article 95200:	Length: 1637	Avg Compression: 0.168483
Article 95400:	Length: 1633	Avg Compression: 0.168535
Article 95600:	Length: 1651	Avg Compression: 0.168389
Article 95800:	Length: 1634	Avg Compression: 0.167956
Article 96000:	Length: 1670	Avg Compression: 0.167968
Article 96200:	Length: 1679	Avg Compression: 0.168217
Article 96400:	Length: 1689	Avg Compression: 0.168267
Article 96600:	Length: 1690	Avg Compression: 0.168130
Article 96800:	Length: 1685	Avg Compression: 0.168278
Article 97000:	Length: 1718	Avg Compression: 0.168248
Article 97200:	Length: 1729	Avg Compression: 0.168098
Article 97400:	Length: 1752	Avg Compression: 0.167808
Article 97600:	Length: 1745	Avg Compression: 0.167930
Article 97800:	Length: 1759	Avg Compression: 0.167601
Article 98000:	Length: 1788	

Article 124400:	Length: 2610	Avg Compression: 0.139885
Article 124600:	Length: 2634	Avg Compression: 0.139486
Article 124800:	Length: 2635	Avg Compression: 0.139075
Article 125000:	Length: 2637	Avg Compression: 0.138664
Article 125200:	Length: 2638	Avg Compression: 0.138261
Article 125400:	Length: 2640	Avg Compression: 0.137885
Article 125600:	Length: 2641	Avg Compression: 0.137500
Article 125800:	Length: 2643	Avg Compression: 0.137103
Article 126000:	Length: 2644	Avg Compression: 0.136718
Article 126200:	Length: 2646	Avg Compression: 0.136353
Article 126400:	Length: 2649	Avg Compression: 0.135976
Article 126600:	Length: 2651	Avg Compression: 0.135606
Article 126800:	Length: 2610	Avg Compression: 0.135786
Article 127000:	Length: 2645	Avg Compression: 0.136604
Article 127200:	Length: 2654	Avg Compression: 0.136480
Article 127400:	Length: 2656	Avg Compression: 0.136109
Article 127600:	Length: 2658	Avg Compression: 0.135756
Article 127800:	Length: 2640	Avg Compression: 0.135972
Article 12

Article 154200:	Length: 3591	Avg Compression: 0.128646
Article 154400:	Length: 3608	Avg Compression: 0.128446
Article 154600:	Length: 3555	Avg Compression: 0.128630
Article 154800:	Length: 3627	Avg Compression: 0.128416
Article 155000:	Length: 3638	Avg Compression: 0.128257
Article 155200:	Length: 3652	Avg Compression: 0.128466
Article 155400:	Length: 3661	Avg Compression: 0.128312
Article 155600:	Length: 3683	Avg Compression: 0.128488
Article 155800:	Length: 3631	Avg Compression: 0.128616
Article 156000:	Length: 3699	Avg Compression: 0.128698
Article 156200:	Length: 3701	Avg Compression: 0.128952
Article 156400:	Length: 3676	Avg Compression: 0.129295
Article 156600:	Length: 3737	Avg Compression: 0.129133
Article 156800:	Length: 3735	Avg Compression: 0.129327
Article 157000:	Length: 3765	Avg Compression: 0.129474
Article 157200:	Length: 3809	Avg Compression: 0.129404
Article 157400:	Length: 3792	Avg Compression: 0.129586
Article 157600:	Length: 3801	Avg Compression: 0.129711
Article 15

Article 184000:	Length: 6938	Avg Compression: 0.140767
Article 184200:	Length: 6968	Avg Compression: 0.140730
Article 184400:	Length: 6972	Avg Compression: 0.140898
Article 184600:	Length: 7022	Avg Compression: 0.141070
Article 184800:	Length: 7089	Avg Compression: 0.141099
Article 185000:	Length: 7122	Avg Compression: 0.141179
Article 185200:	Length: 7132	Avg Compression: 0.141345
Article 185400:	Length: 7203	Avg Compression: 0.141535
Article 185600:	Length: 7206	Avg Compression: 0.141681
Article 185800:	Length: 7241	Avg Compression: 0.141820
Article 186000:	Length: 7349	Avg Compression: 0.141956
Article 186200:	Length: 7365	Avg Compression: 0.142181
Article 186400:	Length: 7418	Avg Compression: 0.142155
Article 186600:	Length: 7465	Avg Compression: 0.142241
Article 186800:	Length: 6993	Avg Compression: 0.142473
Article 187000:	Length: 7519	Avg Compression: 0.142456
Article 187200:	Length: 7600	Avg Compression: 0.142601
Article 187400:	Length: 7648	Avg Compression: 0.142699
Article 18

Article 213600:	Length: 28857	Avg Compression: 0.158112
Article 213800:	Length: 29493	Avg Compression: 0.158460
Article 214000:	Length: 30224	Avg Compression: 0.158555
Article 214200:	Length: 31124	Avg Compression: 0.158522
Article 214400:	Length: 32008	Avg Compression: 0.158499
Article 214600:	Length: 33170	Avg Compression: 0.158350
Article 214800:	Length: 34078	Avg Compression: 0.158277
Article 215000:	Length: 34951	Avg Compression: 0.158235
Article 215200:	Length: 36205	Avg Compression: 0.158086
Article 215400:	Length: 37382	Avg Compression: 0.158430
Article 215600:	Length: 38921	Avg Compression: 0.158480
Article 215800:	Length: 40871	Avg Compression: 0.158703
Article 216000:	Length: 42918	Avg Compression: 0.158802
Article 216200:	Length: 45047	Avg Compression: 0.159038
Article 216400:	Length: 47661	Avg Compression: 0.159078
Article 216600:	Length: 50700	Avg Compression: 0.159525
Article 216800:	Length: 56066	Avg Compression: 0.159150
Article 217000:	Length: 63529	Avg Compression: 0

In [20]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.227941
Article 400:	Length: 17	Avg Compression: 0.238971
Article 600:	Length: 18	Avg Compression: 0.235577
Article 800:	Length: 18	Avg Compression: 0.244643
Article 1000:	Length: 18	Avg Compression: 0.240057
Article 1200:	Length: 19	Avg Compression: 0.245327
Article 1400:	Length: 19	Avg Compression: 0.244048
Article 1600:	Length: 19	Avg Compression: 0.245690
Article 1800:	Length: 19	Avg Compression: 0.243140
Article 2000:	Length: 19	Avg Compression: 0.242486
Article 2200:	Length: 20	Avg Compression: 0.237685
Article 2400:	Length: 20	Avg Compression: 0.233744
Article 2600:	Length: 20	Avg Compression: 0.236111
Article 2800:	Length: 19	Avg Compression: 0.237118
Article 3000:	Length: 20	Avg Compression: 0.240691
Article 3200:	Length: 20	Avg Compression: 0.239238
Article 3400:	Length: 20	Avg Compression: 0.237966
Article 3600:	Length: 20	Avg Compression: 0.236477
Article 3800:	Length: 21	Avg Compression: 0.234504
Article 4000:	Length: 20	Avg Compre

Article 32200:	Length: 34	Avg Compression: 0.205481
Article 32400:	Length: 34	Avg Compression: 0.204995
Article 32600:	Length: 34	Avg Compression: 0.205887
Article 32800:	Length: 34	Avg Compression: 0.206199
Article 33000:	Length: 34	Avg Compression: 0.206000
Article 33200:	Length: 35	Avg Compression: 0.207599
Article 33400:	Length: 35	Avg Compression: 0.207014
Article 33600:	Length: 35	Avg Compression: 0.207042
Article 33800:	Length: 35	Avg Compression: 0.206771
Article 34000:	Length: 35	Avg Compression: 0.206773
Article 34200:	Length: 35	Avg Compression: 0.206588
Article 34400:	Length: 35	Avg Compression: 0.206939
Article 34600:	Length: 35	Avg Compression: 0.206570
Article 34800:	Length: 36	Avg Compression: 0.206006
Article 35000:	Length: 36	Avg Compression: 0.205737
Article 35200:	Length: 48	Avg Compression: 0.205608
Article 35400:	Length: 54	Avg Compression: 0.204872
Article 35600:	Length: 48	Avg Compression: 0.204628
Article 35800:	Length: 48	Avg Compression: 0.204839
Article 3600

Article 63600:	Length: 370	Avg Compression: 0.179785
Article 63800:	Length: 377	Avg Compression: 0.178977
Article 64000:	Length: 381	Avg Compression: 0.178985
Article 64200:	Length: 390	Avg Compression: 0.178425
Article 64400:	Length: 400	Avg Compression: 0.177378
Article 64600:	Length: 408	Avg Compression: 0.176214
Article 64800:	Length: 415	Avg Compression: 0.176219
Article 65000:	Length: 422	Avg Compression: 0.179637
Article 65200:	Length: 426	Avg Compression: 0.178924
Article 65400:	Length: 438	Avg Compression: 0.178935
Article 65600:	Length: 446	Avg Compression: 0.179092
Article 65800:	Length: 453	Avg Compression: 0.179932
Article 66000:	Length: 460	Avg Compression: 0.179543
Article 66200:	Length: 468	Avg Compression: 0.178495
Article 66400:	Length: 475	Avg Compression: 0.178308
Article 66600:	Length: 482	Avg Compression: 0.177750
Article 66800:	Length: 500	Avg Compression: 0.176829
Article 67000:	Length: 497	Avg Compression: 0.176930
Article 67200:	Length: 470	Avg Compression: 0.

Article 94400:	Length: 1595	Avg Compression: 0.165717
Article 94600:	Length: 1600	Avg Compression: 0.165758
Article 94800:	Length: 1607	Avg Compression: 0.165636
Article 95000:	Length: 1595	Avg Compression: 0.165929
Article 95200:	Length: 1637	Avg Compression: 0.165853
Article 95400:	Length: 1633	Avg Compression: 0.165900
Article 95600:	Length: 1651	Avg Compression: 0.165713
Article 95800:	Length: 1634	Avg Compression: 0.165315
Article 96000:	Length: 1670	Avg Compression: 0.165353
Article 96200:	Length: 1679	Avg Compression: 0.165584
Article 96400:	Length: 1689	Avg Compression: 0.165644
Article 96600:	Length: 1690	Avg Compression: 0.165522
Article 96800:	Length: 1685	Avg Compression: 0.165700
Article 97000:	Length: 1718	Avg Compression: 0.165673
Article 97200:	Length: 1729	Avg Compression: 0.165574
Article 97400:	Length: 1752	Avg Compression: 0.165300
Article 97600:	Length: 1745	Avg Compression: 0.165438
Article 97800:	Length: 1759	Avg Compression: 0.165119
Article 98000:	Length: 1788	

Article 124400:	Length: 2610	Avg Compression: 0.138110
Article 124600:	Length: 2634	Avg Compression: 0.137716
Article 124800:	Length: 2635	Avg Compression: 0.137304
Article 125000:	Length: 2637	Avg Compression: 0.136906
Article 125200:	Length: 2638	Avg Compression: 0.136513
Article 125400:	Length: 2640	Avg Compression: 0.136142
Article 125600:	Length: 2641	Avg Compression: 0.135764
Article 125800:	Length: 2643	Avg Compression: 0.135377
Article 126000:	Length: 2644	Avg Compression: 0.134990
Article 126200:	Length: 2646	Avg Compression: 0.134632
Article 126400:	Length: 2649	Avg Compression: 0.134250
Article 126600:	Length: 2651	Avg Compression: 0.133884
Article 126800:	Length: 2610	Avg Compression: 0.134048
Article 127000:	Length: 2645	Avg Compression: 0.134863
Article 127200:	Length: 2654	Avg Compression: 0.134771
Article 127400:	Length: 2656	Avg Compression: 0.134400
Article 127600:	Length: 2658	Avg Compression: 0.134046
Article 127800:	Length: 2640	Avg Compression: 0.134250
Article 12

Article 154200:	Length: 3591	Avg Compression: 0.127026
Article 154400:	Length: 3608	Avg Compression: 0.126831
Article 154600:	Length: 3555	Avg Compression: 0.127031
Article 154800:	Length: 3627	Avg Compression: 0.126825
Article 155000:	Length: 3638	Avg Compression: 0.126668
Article 155200:	Length: 3652	Avg Compression: 0.126869
Article 155400:	Length: 3661	Avg Compression: 0.126718
Article 155600:	Length: 3683	Avg Compression: 0.126888
Article 155800:	Length: 3631	Avg Compression: 0.127013
Article 156000:	Length: 3699	Avg Compression: 0.127103
Article 156200:	Length: 3701	Avg Compression: 0.127366
Article 156400:	Length: 3676	Avg Compression: 0.127696
Article 156600:	Length: 3737	Avg Compression: 0.127531
Article 156800:	Length: 3735	Avg Compression: 0.127728
Article 157000:	Length: 3765	Avg Compression: 0.127877
Article 157200:	Length: 3809	Avg Compression: 0.127801
Article 157400:	Length: 3792	Avg Compression: 0.127995
Article 157600:	Length: 3801	Avg Compression: 0.128132
Article 15

Article 184000:	Length: 6938	Avg Compression: 0.139200
Article 184200:	Length: 6968	Avg Compression: 0.139148
Article 184400:	Length: 6972	Avg Compression: 0.139317
Article 184600:	Length: 7022	Avg Compression: 0.139500
Article 184800:	Length: 7089	Avg Compression: 0.139539
Article 185000:	Length: 7122	Avg Compression: 0.139621
Article 185200:	Length: 7132	Avg Compression: 0.139777
Article 185400:	Length: 7203	Avg Compression: 0.139960
Article 185600:	Length: 7206	Avg Compression: 0.140119
Article 185800:	Length: 7241	Avg Compression: 0.140267
Article 186000:	Length: 7349	Avg Compression: 0.140411
Article 186200:	Length: 7365	Avg Compression: 0.140629
Article 186400:	Length: 7418	Avg Compression: 0.140593
Article 186600:	Length: 7465	Avg Compression: 0.140686
Article 186800:	Length: 6993	Avg Compression: 0.140925
Article 187000:	Length: 7519	Avg Compression: 0.140902
Article 187200:	Length: 7600	Avg Compression: 0.141048
Article 187400:	Length: 7648	Avg Compression: 0.141153
Article 18

Article 213600:	Length: 28857	Avg Compression: 0.156701
Article 213800:	Length: 29493	Avg Compression: 0.157049
Article 214000:	Length: 30224	Avg Compression: 0.157146
Article 214200:	Length: 31124	Avg Compression: 0.157133
Article 214400:	Length: 32008	Avg Compression: 0.157099
Article 214600:	Length: 33170	Avg Compression: 0.156927
Article 214800:	Length: 34078	Avg Compression: 0.156879
Article 215000:	Length: 34951	Avg Compression: 0.156827
Article 215200:	Length: 36205	Avg Compression: 0.156672
Article 215400:	Length: 37382	Avg Compression: 0.156957
Article 215600:	Length: 38921	Avg Compression: 0.157011
Article 215800:	Length: 40871	Avg Compression: 0.157201
Article 216000:	Length: 42918	Avg Compression: 0.157274
Article 216200:	Length: 45047	Avg Compression: 0.157536
Article 216400:	Length: 47661	Avg Compression: 0.157587
Article 216600:	Length: 50700	Avg Compression: 0.157969
Article 216800:	Length: 56066	Avg Compression: 0.157542
Article 217000:	Length: 63529	Avg Compression: 0

In [43]:
model.train(256, 192)



In [44]:
model.train(256, 192, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [45]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.213235
Article 400:	Length: 17	Avg Compression: 0.238971
Article 600:	Length: 18	Avg Compression: 0.237981
Article 800:	Length: 18	Avg Compression: 0.246429
Article 1000:	Length: 18	Avg Compression: 0.244318
Article 1200:	Length: 19	Avg Compression: 0.247664
Article 1400:	Length: 19	Avg Compression: 0.246032
Article 1600:	Length: 19	Avg Compression: 0.244828
Article 1800:	Length: 19	Avg Compression: 0.242378
Article 2000:	Length: 19	Avg Compression: 0.240437
Article 2200:	Length: 20	Avg Compression: 0.235222
Article 2400:	Length: 20	Avg Compression: 0.230942
Article 2600:	Length: 20	Avg Compression: 0.235597
Article 2800:	Length: 19	Avg Compression: 0.238550
Article 3000:	Length: 20	Avg Compression: 0.242021
Article 3200:	Length: 20	Avg Compression: 0.240066
Article 3400:	Length: 20	Avg Compression: 0.237966
Article 3600:	Length: 20	Avg Compression: 0.235380
Article 3800:	Length: 21	Avg Compression: 0.233471
Article 4000:	Length: 20	Avg Compre

Article 32200:	Length: 34	Avg Compression: 0.202523
Article 32400:	Length: 34	Avg Compression: 0.202003
Article 32600:	Length: 34	Avg Compression: 0.202775
Article 32800:	Length: 34	Avg Compression: 0.202941
Article 33000:	Length: 34	Avg Compression: 0.202682
Article 33200:	Length: 35	Avg Compression: 0.204447
Article 33400:	Length: 35	Avg Compression: 0.203969
Article 33600:	Length: 35	Avg Compression: 0.204021
Article 33800:	Length: 35	Avg Compression: 0.203827
Article 34000:	Length: 35	Avg Compression: 0.203825
Article 34200:	Length: 35	Avg Compression: 0.203555
Article 34400:	Length: 35	Avg Compression: 0.203901
Article 34600:	Length: 35	Avg Compression: 0.203581
Article 34800:	Length: 36	Avg Compression: 0.203040
Article 35000:	Length: 36	Avg Compression: 0.202819
Article 35200:	Length: 48	Avg Compression: 0.202512
Article 35400:	Length: 54	Avg Compression: 0.201913
Article 35600:	Length: 48	Avg Compression: 0.201647
Article 35800:	Length: 48	Avg Compression: 0.201786
Article 3600

Article 63600:	Length: 370	Avg Compression: 0.179416
Article 63800:	Length: 377	Avg Compression: 0.178638
Article 64000:	Length: 381	Avg Compression: 0.178727
Article 64200:	Length: 390	Avg Compression: 0.178160
Article 64400:	Length: 400	Avg Compression: 0.177135
Article 64600:	Length: 408	Avg Compression: 0.175975
Article 64800:	Length: 415	Avg Compression: 0.175990
Article 65000:	Length: 422	Avg Compression: 0.179285
Article 65200:	Length: 426	Avg Compression: 0.178641
Article 65400:	Length: 438	Avg Compression: 0.178575
Article 65600:	Length: 446	Avg Compression: 0.178763
Article 65800:	Length: 453	Avg Compression: 0.179693
Article 66000:	Length: 460	Avg Compression: 0.179413
Article 66200:	Length: 468	Avg Compression: 0.178405
Article 66400:	Length: 475	Avg Compression: 0.178155
Article 66600:	Length: 482	Avg Compression: 0.177618
Article 66800:	Length: 500	Avg Compression: 0.176606
Article 67000:	Length: 497	Avg Compression: 0.176724
Article 67200:	Length: 470	Avg Compression: 0.

Article 94400:	Length: 1595	Avg Compression: 0.167749
Article 94600:	Length: 1600	Avg Compression: 0.167782
Article 94800:	Length: 1607	Avg Compression: 0.167668
Article 95000:	Length: 1595	Avg Compression: 0.168035
Article 95200:	Length: 1637	Avg Compression: 0.167976
Article 95400:	Length: 1633	Avg Compression: 0.168011
Article 95600:	Length: 1651	Avg Compression: 0.167848
Article 95800:	Length: 1634	Avg Compression: 0.167432
Article 96000:	Length: 1670	Avg Compression: 0.167442
Article 96200:	Length: 1679	Avg Compression: 0.167672
Article 96400:	Length: 1689	Avg Compression: 0.167726
Article 96600:	Length: 1690	Avg Compression: 0.167635
Article 96800:	Length: 1685	Avg Compression: 0.167802
Article 97000:	Length: 1718	Avg Compression: 0.167758
Article 97200:	Length: 1729	Avg Compression: 0.167650
Article 97400:	Length: 1752	Avg Compression: 0.167364
Article 97600:	Length: 1745	Avg Compression: 0.167504
Article 97800:	Length: 1759	Avg Compression: 0.167220
Article 98000:	Length: 1788	

Article 124400:	Length: 2610	Avg Compression: 0.140308
Article 124600:	Length: 2634	Avg Compression: 0.139905
Article 124800:	Length: 2635	Avg Compression: 0.139489
Article 125000:	Length: 2637	Avg Compression: 0.139083
Article 125200:	Length: 2638	Avg Compression: 0.138686
Article 125400:	Length: 2640	Avg Compression: 0.138308
Article 125600:	Length: 2641	Avg Compression: 0.137925
Article 125800:	Length: 2643	Avg Compression: 0.137531
Article 126000:	Length: 2644	Avg Compression: 0.137140
Article 126200:	Length: 2646	Avg Compression: 0.136775
Article 126400:	Length: 2649	Avg Compression: 0.136390
Article 126600:	Length: 2651	Avg Compression: 0.136022
Article 126800:	Length: 2610	Avg Compression: 0.136188
Article 127000:	Length: 2645	Avg Compression: 0.137012
Article 127200:	Length: 2654	Avg Compression: 0.136917
Article 127400:	Length: 2656	Avg Compression: 0.136545
Article 127600:	Length: 2658	Avg Compression: 0.136192
Article 127800:	Length: 2640	Avg Compression: 0.136400
Article 12

Article 154200:	Length: 3591	Avg Compression: 0.129205
Article 154400:	Length: 3608	Avg Compression: 0.129012
Article 154600:	Length: 3555	Avg Compression: 0.129202
Article 154800:	Length: 3627	Avg Compression: 0.129003
Article 155000:	Length: 3638	Avg Compression: 0.128851
Article 155200:	Length: 3652	Avg Compression: 0.129047
Article 155400:	Length: 3661	Avg Compression: 0.128892
Article 155600:	Length: 3683	Avg Compression: 0.129065
Article 155800:	Length: 3631	Avg Compression: 0.129185
Article 156000:	Length: 3699	Avg Compression: 0.129269
Article 156200:	Length: 3701	Avg Compression: 0.129531
Article 156400:	Length: 3676	Avg Compression: 0.129872
Article 156600:	Length: 3737	Avg Compression: 0.129711
Article 156800:	Length: 3735	Avg Compression: 0.129919
Article 157000:	Length: 3765	Avg Compression: 0.130066
Article 157200:	Length: 3809	Avg Compression: 0.129993
Article 157400:	Length: 3792	Avg Compression: 0.130183
Article 157600:	Length: 3801	Avg Compression: 0.130316
Article 15



Article 183200:	Length: 6761	Avg Compression: 0.140888
Article 183400:	Length: 6766	Avg Compression: 0.141048
Article 183600:	Length: 6793	Avg Compression: 0.141135
Article 183800:	Length: 6866	Avg Compression: 0.141410
Article 184000:	Length: 6938	Avg Compression: 0.141588
Article 184200:	Length: 6968	Avg Compression: 0.141547
Article 184400:	Length: 6972	Avg Compression: 0.141716
Article 184600:	Length: 7022	Avg Compression: 0.141896
Article 184800:	Length: 7089	Avg Compression: 0.141927
Article 185000:	Length: 7122	Avg Compression: 0.142015
Article 185200:	Length: 7132	Avg Compression: 0.142180
Article 185400:	Length: 7203	Avg Compression: 0.142374
Article 185600:	Length: 7206	Avg Compression: 0.142531
Article 185800:	Length: 7241	Avg Compression: 0.142676
Article 186000:	Length: 7349	Avg Compression: 0.142820
Article 186200:	Length: 7365	Avg Compression: 0.143042
Article 186400:	Length: 7418	Avg Compression: 0.143002
Article 186600:	Length: 7465	Avg Compression: 0.143089
Article 18

Article 209400:	Length: 19846	Avg Compression: 0.156464
Article 209600:	Length: 20175	Avg Compression: 0.156375
Article 209800:	Length: 20407	Avg Compression: 0.156422
Article 210000:	Length: 20722	Avg Compression: 0.156397
Article 210200:	Length: 20969	Avg Compression: 0.156627
Article 210400:	Length: 21391	Avg Compression: 0.156334
Article 210600:	Length: 22018	Avg Compression: 0.156732
Article 210800:	Length: 22122	Avg Compression: 0.156924
Article 211000:	Length: 22425	Avg Compression: 0.157132
Article 211200:	Length: 22793	Avg Compression: 0.157311
Article 211400:	Length: 23092	Avg Compression: 0.157516
Article 211600:	Length: 23557	Avg Compression: 0.157527
Article 211800:	Length: 24141	Avg Compression: 0.157745
Article 212000:	Length: 24511	Avg Compression: 0.157948
Article 212200:	Length: 25100	Avg Compression: 0.157971
Article 212400:	Length: 25468	Avg Compression: 0.158142
Article 212600:	Length: 25959	Avg Compression: 0.158536
Article 212800:	Length: 26396	Avg Compression: 0

In [46]:
model.train(256, 192, epochs=2)

Epoch 1/2
Epoch 2/2


In [47]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.213235
Article 400:	Length: 17	Avg Compression: 0.224265
Article 600:	Length: 18	Avg Compression: 0.218750
Article 800:	Length: 18	Avg Compression: 0.230357
Article 1000:	Length: 18	Avg Compression: 0.228693
Article 1200:	Length: 19	Avg Compression: 0.234813
Article 1400:	Length: 19	Avg Compression: 0.236111
Article 1600:	Length: 19	Avg Compression: 0.236207
Article 1800:	Length: 19	Avg Compression: 0.233232
Article 2000:	Length: 19	Avg Compression: 0.232923
Article 2200:	Length: 20	Avg Compression: 0.227833
Article 2400:	Length: 20	Avg Compression: 0.224215
Article 2600:	Length: 20	Avg Compression: 0.227881
Article 2800:	Length: 19	Avg Compression: 0.230439
Article 3000:	Length: 20	Avg Compression: 0.232270
Article 3200:	Length: 20	Avg Compression: 0.229719
Article 3400:	Length: 20	Avg Compression: 0.228261
Article 3600:	Length: 20	Avg Compression: 0.226974
Article 3800:	Length: 21	Avg Compression: 0.225207
Article 4000:	Length: 20	Avg Compre

Article 32200:	Length: 34	Avg Compression: 0.193590
Article 32400:	Length: 34	Avg Compression: 0.193140
Article 32600:	Length: 34	Avg Compression: 0.193924
Article 32800:	Length: 34	Avg Compression: 0.194101
Article 33000:	Length: 34	Avg Compression: 0.193882
Article 33200:	Length: 35	Avg Compression: 0.195520
Article 33400:	Length: 35	Avg Compression: 0.194946
Article 33600:	Length: 35	Avg Compression: 0.195012
Article 33800:	Length: 35	Avg Compression: 0.194723
Article 34000:	Length: 35	Avg Compression: 0.194682
Article 34200:	Length: 35	Avg Compression: 0.194480
Article 34400:	Length: 35	Avg Compression: 0.194788
Article 34600:	Length: 35	Avg Compression: 0.194456
Article 34800:	Length: 36	Avg Compression: 0.193931
Article 35000:	Length: 36	Avg Compression: 0.193674
Article 35200:	Length: 48	Avg Compression: 0.193562
Article 35400:	Length: 54	Avg Compression: 0.192908
Article 35600:	Length: 48	Avg Compression: 0.192679
Article 35800:	Length: 48	Avg Compression: 0.192954
Article 3600

Article 63600:	Length: 370	Avg Compression: 0.173243
Article 63800:	Length: 377	Avg Compression: 0.172481
Article 64000:	Length: 381	Avg Compression: 0.172517
Article 64200:	Length: 390	Avg Compression: 0.172008
Article 64400:	Length: 400	Avg Compression: 0.171059
Article 64600:	Length: 408	Avg Compression: 0.169996
Article 64800:	Length: 415	Avg Compression: 0.170063
Article 65000:	Length: 422	Avg Compression: 0.173401
Article 65200:	Length: 426	Avg Compression: 0.172783
Article 65400:	Length: 438	Avg Compression: 0.172699
Article 65600:	Length: 446	Avg Compression: 0.173037
Article 65800:	Length: 453	Avg Compression: 0.173980
Article 66000:	Length: 460	Avg Compression: 0.173638
Article 66200:	Length: 468	Avg Compression: 0.172595
Article 66400:	Length: 475	Avg Compression: 0.172475
Article 66600:	Length: 482	Avg Compression: 0.171983
Article 66800:	Length: 500	Avg Compression: 0.171013
Article 67000:	Length: 497	Avg Compression: 0.171172
Article 67200:	Length: 470	Avg Compression: 0.

Article 94400:	Length: 1595	Avg Compression: 0.162965
Article 94600:	Length: 1600	Avg Compression: 0.163029
Article 94800:	Length: 1607	Avg Compression: 0.162871
Article 95000:	Length: 1595	Avg Compression: 0.163204
Article 95200:	Length: 1637	Avg Compression: 0.163157
Article 95400:	Length: 1633	Avg Compression: 0.163185
Article 95600:	Length: 1651	Avg Compression: 0.163036
Article 95800:	Length: 1634	Avg Compression: 0.162643
Article 96000:	Length: 1670	Avg Compression: 0.162667
Article 96200:	Length: 1679	Avg Compression: 0.162916
Article 96400:	Length: 1689	Avg Compression: 0.162982
Article 96600:	Length: 1690	Avg Compression: 0.162863
Article 96800:	Length: 1685	Avg Compression: 0.163038
Article 97000:	Length: 1718	Avg Compression: 0.163001
Article 97200:	Length: 1729	Avg Compression: 0.162934
Article 97400:	Length: 1752	Avg Compression: 0.162644
Article 97600:	Length: 1745	Avg Compression: 0.162792
Article 97800:	Length: 1759	Avg Compression: 0.162494
Article 98000:	Length: 1788	

Article 124400:	Length: 2610	Avg Compression: 0.136411
Article 124600:	Length: 2634	Avg Compression: 0.136017
Article 124800:	Length: 2635	Avg Compression: 0.135620
Article 125000:	Length: 2637	Avg Compression: 0.135228
Article 125200:	Length: 2638	Avg Compression: 0.134837
Article 125400:	Length: 2640	Avg Compression: 0.134470
Article 125600:	Length: 2641	Avg Compression: 0.134092
Article 125800:	Length: 2643	Avg Compression: 0.133705
Article 126000:	Length: 2644	Avg Compression: 0.133322
Article 126200:	Length: 2646	Avg Compression: 0.132973
Article 126400:	Length: 2649	Avg Compression: 0.132599
Article 126600:	Length: 2651	Avg Compression: 0.132234
Article 126800:	Length: 2610	Avg Compression: 0.132382
Article 127000:	Length: 2645	Avg Compression: 0.133168
Article 127200:	Length: 2654	Avg Compression: 0.133060
Article 127400:	Length: 2656	Avg Compression: 0.132691
Article 127600:	Length: 2658	Avg Compression: 0.132340
Article 127800:	Length: 2640	Avg Compression: 0.132550
Article 12

Article 154200:	Length: 3591	Avg Compression: 0.125492
Article 154400:	Length: 3608	Avg Compression: 0.125292
Article 154600:	Length: 3555	Avg Compression: 0.125483
Article 154800:	Length: 3627	Avg Compression: 0.125272
Article 155000:	Length: 3638	Avg Compression: 0.125116
Article 155200:	Length: 3652	Avg Compression: 0.125324
Article 155400:	Length: 3661	Avg Compression: 0.125170
Article 155600:	Length: 3683	Avg Compression: 0.125349
Article 155800:	Length: 3631	Avg Compression: 0.125478
Article 156000:	Length: 3699	Avg Compression: 0.125564
Article 156200:	Length: 3701	Avg Compression: 0.125824
Article 156400:	Length: 3676	Avg Compression: 0.126153
Article 156600:	Length: 3737	Avg Compression: 0.125996
Article 156800:	Length: 3735	Avg Compression: 0.126204
Article 157000:	Length: 3765	Avg Compression: 0.126352
Article 157200:	Length: 3809	Avg Compression: 0.126272
Article 157400:	Length: 3792	Avg Compression: 0.126471
Article 157600:	Length: 3801	Avg Compression: 0.126600
Article 15

Article 184000:	Length: 6938	Avg Compression: 0.137763
Article 184200:	Length: 6968	Avg Compression: 0.137719
Article 184400:	Length: 6972	Avg Compression: 0.137891
Article 184600:	Length: 7022	Avg Compression: 0.138073
Article 184800:	Length: 7089	Avg Compression: 0.138093
Article 185000:	Length: 7122	Avg Compression: 0.138175
Article 185200:	Length: 7132	Avg Compression: 0.138349
Article 185400:	Length: 7203	Avg Compression: 0.138550
Article 185600:	Length: 7206	Avg Compression: 0.138713
Article 185800:	Length: 7241	Avg Compression: 0.138861
Article 186000:	Length: 7349	Avg Compression: 0.139004
Article 186200:	Length: 7365	Avg Compression: 0.139238
Article 186400:	Length: 7418	Avg Compression: 0.139182
Article 186600:	Length: 7465	Avg Compression: 0.139264
Article 186800:	Length: 6993	Avg Compression: 0.139482
Article 187000:	Length: 7519	Avg Compression: 0.139458
Article 187200:	Length: 7600	Avg Compression: 0.139601
Article 187400:	Length: 7648	Avg Compression: 0.139714
Article 18

Article 213600:	Length: 28857	Avg Compression: 0.155412
Article 213800:	Length: 29493	Avg Compression: 0.155770
Article 214000:	Length: 30224	Avg Compression: 0.155871
Article 214200:	Length: 31124	Avg Compression: 0.155850
Article 214400:	Length: 32008	Avg Compression: 0.155852
Article 214600:	Length: 33170	Avg Compression: 0.155640
Article 214800:	Length: 34078	Avg Compression: 0.155602
Article 215000:	Length: 34951	Avg Compression: 0.155577
Article 215200:	Length: 36205	Avg Compression: 0.155443
Article 215400:	Length: 37382	Avg Compression: 0.155799
Article 215600:	Length: 38921	Avg Compression: 0.155853
Article 215800:	Length: 40871	Avg Compression: 0.156082
Article 216000:	Length: 42918	Avg Compression: 0.156175
Article 216200:	Length: 45047	Avg Compression: 0.156449
Article 216400:	Length: 47661	Avg Compression: 0.156501
Article 216600:	Length: 50700	Avg Compression: 0.156950
Article 216800:	Length: 56066	Avg Compression: 0.156537
Article 217000:	Length: 63529	Avg Compression: 0

In [48]:
model.train(256, 192, epochs=2)

Epoch 1/2
Epoch 2/2


In [49]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 200 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 200:	Length: 17	Avg Compression: 0.242647
Article 400:	Length: 17	Avg Compression: 0.279412
Article 600:	Length: 18	Avg Compression: 0.264423
Article 800:	Length: 18	Avg Compression: 0.271429
Article 1000:	Length: 18	Avg Compression: 0.274148
Article 1200:	Length: 19	Avg Compression: 0.272196
Article 1400:	Length: 19	Avg Compression: 0.268849
Article 1600:	Length: 19	Avg Compression: 0.265517
Article 1800:	Length: 19	Avg Compression: 0.265244
Article 2000:	Length: 19	Avg Compression: 0.268443
Article 2200:	Length: 20	Avg Compression: 0.262315
Article 2400:	Length: 20	Avg Compression: 0.256166
Article 2600:	Length: 20	Avg Compression: 0.261317
Article 2800:	Length: 19	Avg Compression: 0.265267
Article 3000:	Length: 20	Avg Compression: 0.266401
Article 3200:	Length: 20	Avg Compression: 0.264073
Article 3400:	Length: 20	Avg Compression: 0.260481
Article 3600:	Length: 20	Avg Compression: 0.258041
Article 3800:	Length: 21	Avg Compression: 0.255165
Article 4000:	Length: 20	Avg Compre

Article 32200:	Length: 34	Avg Compression: 0.251102
Article 32400:	Length: 34	Avg Compression: 0.250547
Article 32600:	Length: 34	Avg Compression: 0.251656
Article 32800:	Length: 34	Avg Compression: 0.252012
Article 33000:	Length: 34	Avg Compression: 0.251546
Article 33200:	Length: 35	Avg Compression: 0.253515
Article 33400:	Length: 35	Avg Compression: 0.252796
Article 33600:	Length: 35	Avg Compression: 0.253159
Article 33800:	Length: 35	Avg Compression: 0.252671
Article 34000:	Length: 35	Avg Compression: 0.253111
Article 34200:	Length: 35	Avg Compression: 0.252577
Article 34400:	Length: 35	Avg Compression: 0.252958
Article 34600:	Length: 35	Avg Compression: 0.252592
Article 34800:	Length: 36	Avg Compression: 0.251890
Article 35000:	Length: 36	Avg Compression: 0.251641
Article 35200:	Length: 48	Avg Compression: 0.251728
Article 35400:	Length: 54	Avg Compression: 0.250995
Article 35600:	Length: 48	Avg Compression: 0.250682
Article 35800:	Length: 48	Avg Compression: 0.250525
Article 3600

Article 63600:	Length: 370	Avg Compression: 0.258206
Article 63800:	Length: 377	Avg Compression: 0.257431
Article 64000:	Length: 381	Avg Compression: 0.257576
Article 64200:	Length: 390	Avg Compression: 0.257370
Article 64400:	Length: 400	Avg Compression: 0.256380
Article 64600:	Length: 408	Avg Compression: 0.254798
Article 64800:	Length: 415	Avg Compression: 0.254558
Article 65000:	Length: 422	Avg Compression: 0.258409
Article 65200:	Length: 426	Avg Compression: 0.257730
Article 65400:	Length: 438	Avg Compression: 0.257481
Article 65600:	Length: 446	Avg Compression: 0.257130
Article 65800:	Length: 453	Avg Compression: 0.258944
Article 66000:	Length: 460	Avg Compression: 0.258454
Article 66200:	Length: 468	Avg Compression: 0.257178
Article 66400:	Length: 475	Avg Compression: 0.256533
Article 66600:	Length: 482	Avg Compression: 0.255749
Article 66800:	Length: 500	Avg Compression: 0.254837
Article 67000:	Length: 497	Avg Compression: 0.254520
Article 67200:	Length: 470	Avg Compression: 0.

Article 94400:	Length: 1595	Avg Compression: 0.250697
Article 94600:	Length: 1600	Avg Compression: 0.250692
Article 94800:	Length: 1607	Avg Compression: 0.250601
Article 95000:	Length: 1595	Avg Compression: 0.250980
Article 95200:	Length: 1637	Avg Compression: 0.250868
Article 95400:	Length: 1633	Avg Compression: 0.250734
Article 95600:	Length: 1651	Avg Compression: 0.250477
Article 95800:	Length: 1634	Avg Compression: 0.249859
Article 96000:	Length: 1670	Avg Compression: 0.249698
Article 96200:	Length: 1679	Avg Compression: 0.249663
Article 96400:	Length: 1689	Avg Compression: 0.249408
Article 96600:	Length: 1690	Avg Compression: 0.249233
Article 96800:	Length: 1685	Avg Compression: 0.249167
Article 97000:	Length: 1718	Avg Compression: 0.248980
Article 97200:	Length: 1729	Avg Compression: 0.248471
Article 97400:	Length: 1752	Avg Compression: 0.248110
Article 97600:	Length: 1745	Avg Compression: 0.247884
Article 97800:	Length: 1759	Avg Compression: 0.247383
Article 98000:	Length: 1788	

Article 124400:	Length: 2610	Avg Compression: 0.201007
Article 124600:	Length: 2634	Avg Compression: 0.200423
Article 124800:	Length: 2635	Avg Compression: 0.199815
Article 125000:	Length: 2637	Avg Compression: 0.199219
Article 125200:	Length: 2638	Avg Compression: 0.198621
Article 125400:	Length: 2640	Avg Compression: 0.198059
Article 125600:	Length: 2641	Avg Compression: 0.197491
Article 125800:	Length: 2643	Avg Compression: 0.196888
Article 126000:	Length: 2644	Avg Compression: 0.196317
Article 126200:	Length: 2646	Avg Compression: 0.195784
Article 126400:	Length: 2649	Avg Compression: 0.195198
Article 126600:	Length: 2651	Avg Compression: 0.194621
Article 126800:	Length: 2610	Avg Compression: 0.194945
Article 127000:	Length: 2645	Avg Compression: 0.195927
Article 127200:	Length: 2654	Avg Compression: 0.196157
Article 127400:	Length: 2656	Avg Compression: 0.195599
Article 127600:	Length: 2658	Avg Compression: 0.195066
Article 127800:	Length: 2640	Avg Compression: 0.195349
Article 12

Article 154200:	Length: 3591	Avg Compression: 0.183648
Article 154400:	Length: 3608	Avg Compression: 0.183432
Article 154600:	Length: 3555	Avg Compression: 0.183757
Article 154800:	Length: 3627	Avg Compression: 0.183605
Article 155000:	Length: 3638	Avg Compression: 0.183362
Article 155200:	Length: 3652	Avg Compression: 0.183648
Article 155400:	Length: 3661	Avg Compression: 0.183401
Article 155600:	Length: 3683	Avg Compression: 0.183579
Article 155800:	Length: 3631	Avg Compression: 0.183782
Article 156000:	Length: 3699	Avg Compression: 0.183939
Article 156200:	Length: 3701	Avg Compression: 0.184240
Article 156400:	Length: 3676	Avg Compression: 0.184637
Article 156600:	Length: 3737	Avg Compression: 0.184427
Article 156800:	Length: 3735	Avg Compression: 0.184678
Article 157000:	Length: 3765	Avg Compression: 0.184881
Article 157200:	Length: 3809	Avg Compression: 0.185037
Article 157400:	Length: 3792	Avg Compression: 0.185190
Article 157600:	Length: 3801	Avg Compression: 0.185356
Article 15

Article 184000:	Length: 6938	Avg Compression: 0.199278
Article 184200:	Length: 6968	Avg Compression: 0.199317
Article 184400:	Length: 6972	Avg Compression: 0.199433
Article 184600:	Length: 7022	Avg Compression: 0.199607
Article 184800:	Length: 7089	Avg Compression: 0.199665
Article 185000:	Length: 7122	Avg Compression: 0.199781
Article 185200:	Length: 7132	Avg Compression: 0.199883
Article 185400:	Length: 7203	Avg Compression: 0.200032
Article 185600:	Length: 7206	Avg Compression: 0.200226
Article 185800:	Length: 7241	Avg Compression: 0.200384
Article 186000:	Length: 7349	Avg Compression: 0.200527
Article 186200:	Length: 7365	Avg Compression: 0.200746
Article 186400:	Length: 7418	Avg Compression: 0.200890
Article 186600:	Length: 7465	Avg Compression: 0.200964
Article 186800:	Length: 6993	Avg Compression: 0.201442
Article 187000:	Length: 7519	Avg Compression: 0.201541
Article 187200:	Length: 7600	Avg Compression: 0.201626
Article 187400:	Length: 7648	Avg Compression: 0.201678
Article 18

Article 213600:	Length: 28857	Avg Compression: 0.219438
Article 213800:	Length: 29493	Avg Compression: 0.219828
Article 214000:	Length: 30224	Avg Compression: 0.219882
Article 214200:	Length: 31124	Avg Compression: 0.219824
Article 214400:	Length: 32008	Avg Compression: 0.219929
Article 214600:	Length: 33170	Avg Compression: 0.219821
Article 214800:	Length: 34078	Avg Compression: 0.219731
Article 215000:	Length: 34951	Avg Compression: 0.219673
Article 215200:	Length: 36205	Avg Compression: 0.219448
Article 215400:	Length: 37382	Avg Compression: 0.219682
Article 215600:	Length: 38921	Avg Compression: 0.219727
Article 215800:	Length: 40871	Avg Compression: 0.219970
Article 216000:	Length: 42918	Avg Compression: 0.220210
Article 216200:	Length: 45047	Avg Compression: 0.220335
Article 216400:	Length: 47661	Avg Compression: 0.220237
Article 216600:	Length: 50700	Avg Compression: 0.220556
Article 216800:	Length: 56066	Avg Compression: 0.220494
Article 217000:	Length: 63529	Avg Compression: 0

In [8]:
model.train(256, 192, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096_simplified')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [10]:
articles = Articles('page_revisions_text_simplified')

In [11]:
model = Model(articles, './training_checkpoints-18',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [12]:
model.train(256, 192, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
