In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')[:2000]), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [4]:
articles = Articles('page_revisions_text')

In [5]:
steps = 13

for i in range(steps):
    batch_size = 4 * 2**i
    batch_item_length = 3 * 2**(steps - i - 1)
    count = articles.steps(batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      4	 batch item length: 12288	steps per epoch:      0
batch size:      8	 batch item length: 6144	steps per epoch:      0
batch size:     16	 batch item length: 3072	steps per epoch:      0
batch size:     32	 batch item length: 1536	steps per epoch:     96
batch size:     64	 batch item length:  768	steps per epoch:    149
batch size:    128	 batch item length:  384	steps per epoch:    167
batch size:    256	 batch item length:  192	steps per epoch:    220
batch size:    512	 batch item length:   96	steps per epoch:    349
batch size:   1024	 batch item length:   48	steps per epoch:    625
batch size:   2048	 batch item length:   24	steps per epoch:   1215
batch size:   4096	 batch item length:   12	steps per epoch:   2429
batch size:   8192	 batch item length:    6	steps per epoch:   4856
batch size:  16384	 batch item length:    3	steps per epoch:   9710


In [6]:
model = Model(articles, './training_checkpoints-10',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [7]:
model.training_model(256, 192).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 192)                0         
_________________________________________________________________
embedding (Embedding)        (256, 192, 512)           2072576   
_________________________________________________________________
gru (GRU)                    (256, 192, 1024)          4724736   
_________________________________________________________________
gru_1 (GRU)                  (256, 192, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (256, 192, 4048)          4149200   
Total params: 17,244,112
Trainable params: 17,244,112
Non-trainable params: 0
_________________________________________________________________


In [8]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [8]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.predicting_model.reset_states()

    for index, byte in enumerate(text):
        predictions = model.predict(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [11]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.609375	Avg Compression: 0.609375
Article 10:	Length: 144	Compression: 0.534722	Avg Compression: 0.569853
Article 20:	Length: 152	Compression: 0.585526	Avg Compression: 0.575472
Article 30:	Length: 160	Compression: 0.537500	Avg Compression: 0.565068
Article 40:	Length: 160	Compression: 0.456250	Avg Compression: 0.541667
Article 50:	Length: 168	Compression: 0.464286	Avg Compression: 0.527412
Article 60:	Length: 168	Compression: 0.440476	Avg Compression: 0.513889
Article 70:	Length: 176	Compression: 0.443182	Avg Compression: 0.503981
Article 80:	Length: 176	Compression: 0.426136	Avg Compression: 0.494413
Article 90:	Length: 184	Compression: 0.483696	Avg Compression: 0.493193
Article 100:	Length: 184	Compression: 0.494565	Avg Compression: 0.493333
Article 110:	Length: 192	Compression: 0.453125	Avg Compression: 0.489458
Article 120:	Length: 192	Compression: 0.458333	Avg Compression: 0.486722
Article 130:	Length: 200	Compression: 0.585000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.272232	Avg Compression: 0.265010
Article 1120:	Length: 35736	Compression: 0.264663	Avg Compression: 0.264998
Article 1130:	Length: 36448	Compression: 0.285530	Avg Compression: 0.265695
Article 1140:	Length: 37448	Compression: 0.254673	Avg Compression: 0.265324
Article 1150:	Length: 38424	Compression: 0.262362	Avg Compression: 0.265225
Article 1160:	Length: 38680	Compression: 0.278102	Avg Compression: 0.265644
Article 1170:	Length: 40216	Compression: 0.283196	Avg Compression: 0.266219
Article 1180:	Length: 41480	Compression: 0.320685	Avg Compression: 0.267999
Article 1190:	Length: 42616	Compression: 0.282218	Avg Compression: 0.268460
Article 1200:	Length: 43488	Compression: 0.304912	Avg Compression: 0.269630
Article 1210:	Length: 45080	Compression: 0.266038	Avg Compression: 0.269514
Article 1220:	Length: 46864	Compression: 0.265150	Avg Compression: 0.269373
Article 1230:	Length: 48608	Compression: 0.243972	Avg Compression: 0.268548
Article 1240

In [12]:
model.train(256, 192, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [13]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.664062	Avg Compression: 0.664062
Article 10:	Length: 144	Compression: 0.659722	Avg Compression: 0.661765
Article 20:	Length: 152	Compression: 0.565789	Avg Compression: 0.627358
Article 30:	Length: 160	Compression: 0.618750	Avg Compression: 0.625000
Article 40:	Length: 160	Compression: 0.487500	Avg Compression: 0.595430
Article 50:	Length: 168	Compression: 0.500000	Avg Compression: 0.577851
Article 60:	Length: 168	Compression: 0.517857	Avg Compression: 0.568519
Article 70:	Length: 176	Compression: 0.431818	Avg Compression: 0.549363
Article 80:	Length: 176	Compression: 0.494318	Avg Compression: 0.542598
Article 90:	Length: 184	Compression: 0.472826	Avg Compression: 0.534653
Article 100:	Length: 184	Compression: 0.527174	Avg Compression: 0.533889
Article 110:	Length: 192	Compression: 0.468750	Avg Compression: 0.527610
Article 120:	Length: 192	Compression: 0.453125	Avg Compression: 0.521062
Article 130:	Length: 200	Compression: 0.615000	Avg Compression

Article 690:	Length: 9288	Compression: 0.149655	Avg Compression: 0.213436
Article 700:	Length: 9920	Compression: 0.204940	Avg Compression: 0.212681
Article 710:	Length: 10552	Compression: 0.174659	Avg Compression: 0.209397
Article 720:	Length: 11056	Compression: 0.199801	Avg Compression: 0.208601
Article 730:	Length: 11776	Compression: 0.163893	Avg Compression: 0.204970
Article 740:	Length: 12424	Compression: 0.195187	Avg Compression: 0.204198
Article 750:	Length: 12944	Compression: 0.173748	Avg Compression: 0.201885
Article 760:	Length: 13472	Compression: 0.192696	Avg Compression: 0.201212
Article 770:	Length: 13888	Compression: 0.157474	Avg Compression: 0.198140
Article 780:	Length: 14096	Compression: 0.178278	Avg Compression: 0.196818
Article 790:	Length: 14800	Compression: 0.168446	Avg Compression: 0.194966
Article 800:	Length: 15200	Compression: 0.194539	Avg Compression: 0.194939
Article 810:	Length: 16024	Compression: 0.176236	Avg Compression: 0.193777
Article 820:	Length: 16336	

Article 1770:	Length: 213160	Compression: 0.177927	Avg Compression: 0.166034
Article 1780:	Length: 220440	Compression: 0.190038	Avg Compression: 0.166754
Article 1790:	Length: 229744	Compression: 0.199574	Avg Compression: 0.167750
Article 1800:	Length: 246304	Compression: 0.165897	Avg Compression: 0.167691
Article 1810:	Length: 251304	Compression: 0.193113	Avg Compression: 0.168483
Article 1820:	Length: 268336	Compression: 0.178247	Avg Compression: 0.168797
Article 1830:	Length: 283192	Compression: 0.157455	Avg Compression: 0.168425
Article 1840:	Length: 307368	Compression: 0.185380	Avg Compression: 0.169008
Article 1850:	Length: 323168	Compression: 0.165626	Avg Compression: 0.168890
Article 1860:	Length: 341176	Compression: 0.170287	Avg Compression: 0.168940
Article 1870:	Length: 382048	Compression: 0.169685	Avg Compression: 0.168968
Article 1880:	Length: 441872	Compression: 0.129343	Avg Compression: 0.167288
Article 1890:	Length: 508896	Compression: 0.161094	Avg Compression: 0.166999

In [14]:
model.train(256, 192, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [15]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.671875	Avg Compression: 0.671875
Article 10:	Length: 144	Compression: 0.659722	Avg Compression: 0.665441
Article 20:	Length: 152	Compression: 0.552632	Avg Compression: 0.625000
Article 30:	Length: 160	Compression: 0.637500	Avg Compression: 0.628425
Article 40:	Length: 160	Compression: 0.481250	Avg Compression: 0.596774
Article 50:	Length: 168	Compression: 0.565476	Avg Compression: 0.591009
Article 60:	Length: 168	Compression: 0.529762	Avg Compression: 0.581481
Article 70:	Length: 176	Compression: 0.448864	Avg Compression: 0.562898
Article 80:	Length: 176	Compression: 0.477273	Avg Compression: 0.552374
Article 90:	Length: 184	Compression: 0.483696	Avg Compression: 0.544554
Article 100:	Length: 184	Compression: 0.554348	Avg Compression: 0.545556
Article 110:	Length: 192	Compression: 0.494792	Avg Compression: 0.540663
Article 120:	Length: 192	Compression: 0.479167	Avg Compression: 0.535256
Article 130:	Length: 200	Compression: 0.610000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.163566	Avg Compression: 0.159840
Article 1120:	Length: 35736	Compression: 0.145987	Avg Compression: 0.159363
Article 1130:	Length: 36448	Compression: 0.136167	Avg Compression: 0.158575
Article 1140:	Length: 37448	Compression: 0.152345	Avg Compression: 0.158365
Article 1150:	Length: 38424	Compression: 0.144155	Avg Compression: 0.157890
Article 1160:	Length: 38680	Compression: 0.141262	Avg Compression: 0.157348
Article 1170:	Length: 40216	Compression: 0.157972	Avg Compression: 0.157369
Article 1180:	Length: 41480	Compression: 0.170636	Avg Compression: 0.157802
Article 1190:	Length: 42616	Compression: 0.166323	Avg Compression: 0.158079
Article 1200:	Length: 43488	Compression: 0.162367	Avg Compression: 0.158216
Article 1210:	Length: 45080	Compression: 0.171118	Avg Compression: 0.158632
Article 1220:	Length: 46864	Compression: 0.149454	Avg Compression: 0.158334
Article 1230:	Length: 48608	Compression: 0.138640	Avg Compression: 0.157695
Article 1240

In [16]:
model.train(256, 192, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [17]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.742188	Avg Compression: 0.742188
Article 10:	Length: 144	Compression: 0.743056	Avg Compression: 0.742647
Article 20:	Length: 152	Compression: 0.625000	Avg Compression: 0.700472
Article 30:	Length: 160	Compression: 0.662500	Avg Compression: 0.690068
Article 40:	Length: 160	Compression: 0.525000	Avg Compression: 0.654570
Article 50:	Length: 168	Compression: 0.541667	Avg Compression: 0.633772
Article 60:	Length: 168	Compression: 0.529762	Avg Compression: 0.617593
Article 70:	Length: 176	Compression: 0.494318	Avg Compression: 0.600318
Article 80:	Length: 176	Compression: 0.511364	Avg Compression: 0.589385
Article 90:	Length: 184	Compression: 0.559783	Avg Compression: 0.586015
Article 100:	Length: 184	Compression: 0.635870	Avg Compression: 0.591111
Article 110:	Length: 192	Compression: 0.484375	Avg Compression: 0.580823
Article 120:	Length: 192	Compression: 0.526042	Avg Compression: 0.576007
Article 130:	Length: 200	Compression: 0.650000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.162659	Avg Compression: 0.157293
Article 1120:	Length: 35736	Compression: 0.146659	Avg Compression: 0.156926
Article 1130:	Length: 36448	Compression: 0.133752	Avg Compression: 0.156139
Article 1140:	Length: 37448	Compression: 0.148072	Avg Compression: 0.155867
Article 1150:	Length: 38424	Compression: 0.142177	Avg Compression: 0.155409
Article 1160:	Length: 38680	Compression: 0.145217	Avg Compression: 0.155077
Article 1170:	Length: 40216	Compression: 0.156679	Avg Compression: 0.155130
Article 1180:	Length: 41480	Compression: 0.173770	Avg Compression: 0.155739
Article 1190:	Length: 42616	Compression: 0.164516	Avg Compression: 0.156024
Article 1200:	Length: 43488	Compression: 0.164988	Avg Compression: 0.156312
Article 1210:	Length: 45080	Compression: 0.165905	Avg Compression: 0.156620
Article 1220:	Length: 46864	Compression: 0.145570	Avg Compression: 0.156263
Article 1230:	Length: 48608	Compression: 0.137529	Avg Compression: 0.155654
Article 1240

In [18]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [25]:
model = Model(articles, './training_checkpoints-11',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [26]:
model.training_model(256, 192).summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_11 (Masking)         (256, 192)                0         
_________________________________________________________________
embedding_11 (Embedding)     (256, 192, 512)           2072576   
_________________________________________________________________
lstm_6 (LSTM)                (256, 192, 1024)          6295552   
_________________________________________________________________
lstm_7 (LSTM)                (256, 192, 1024)          8392704   
_________________________________________________________________
dense_11 (Dense)             (256, 192, 4048)          4149200   
Total params: 20,910,032
Trainable params: 20,910,032
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.train(256, 192, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.757812	Avg Compression: 0.757812
Article 10:	Length: 144	Compression: 0.722222	Avg Compression: 0.738971
Article 20:	Length: 152	Compression: 0.750000	Avg Compression: 0.742925
Article 30:	Length: 160	Compression: 0.700000	Avg Compression: 0.731164
Article 40:	Length: 160	Compression: 0.750000	Avg Compression: 0.735215
Article 50:	Length: 168	Compression: 0.779762	Avg Compression: 0.743421
Article 60:	Length: 168	Compression: 0.654762	Avg Compression: 0.729630
Article 70:	Length: 176	Compression: 0.636364	Avg Compression: 0.716561
Article 80:	Length: 176	Compression: 0.653409	Avg Compression: 0.708799
Article 90:	Length: 184	Compression: 0.695652	Avg Compression: 0.707302
Article 100:	Length: 184	Compression: 0.576087	Avg Compression: 0.693889
Article 110:	Length: 192	Compression: 0.567708	Avg Compression: 0.681727
Article 120:	Length: 192	Compression: 0.583333	Avg Compression: 0.673077
Article 130:	Length: 200	Compression: 0.705000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.182140	Avg Compression: 0.187126
Article 1120:	Length: 35736	Compression: 0.171536	Avg Compression: 0.186588
Article 1130:	Length: 36448	Compression: 0.166978	Avg Compression: 0.185922
Article 1140:	Length: 37448	Compression: 0.176939	Avg Compression: 0.185620
Article 1150:	Length: 38424	Compression: 0.184650	Avg Compression: 0.185587
Article 1160:	Length: 38680	Compression: 0.182756	Avg Compression: 0.185495
Article 1170:	Length: 40216	Compression: 0.198777	Avg Compression: 0.185930
Article 1180:	Length: 41480	Compression: 0.204870	Avg Compression: 0.186549
Article 1190:	Length: 42616	Compression: 0.197086	Avg Compression: 0.186891
Article 1200:	Length: 43488	Compression: 0.200124	Avg Compression: 0.187316
Article 1210:	Length: 45080	Compression: 0.201065	Avg Compression: 0.187758
Article 1220:	Length: 46864	Compression: 0.184214	Avg Compression: 0.187643
Article 1230:	Length: 48608	Compression: 0.163697	Avg Compression: 0.186865
Article 1240

Article 1820:	Length: 268336	Compression: 0.192464	Avg Compression: 0.179065
Article 1830:	Length: 283192	Compression: 0.169539	Avg Compression: 0.178752
Article 1840:	Length: 307368	Compression: 0.190742	Avg Compression: 0.179165
Article 1850:	Length: 323168	Compression: 0.175983	Avg Compression: 0.179053
Article 1860:	Length: 341176	Compression: 0.186004	Avg Compression: 0.179301
Article 1870:	Length: 382048	Compression: 0.175237	Avg Compression: 0.179145
Article 1880:	Length: 441872	Compression: 0.141865	Avg Compression: 0.177564
Article 1890:	Length: 508896	Compression: 0.176702	Avg Compression: 0.177524


In [29]:
model.train(256, 192, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 1.250000	Avg Compression: 1.250000
Article 10:	Length: 144	Compression: 1.166667	Avg Compression: 1.205882
Article 20:	Length: 152	Compression: 1.111842	Avg Compression: 1.172170
Article 30:	Length: 160	Compression: 1.193750	Avg Compression: 1.178082
Article 40:	Length: 160	Compression: 1.187500	Avg Compression: 1.180108
Article 50:	Length: 168	Compression: 1.130952	Avg Compression: 1.171053
Article 60:	Length: 168	Compression: 0.988095	Avg Compression: 1.142593
Article 70:	Length: 176	Compression: 1.113636	Avg Compression: 1.138535
Article 80:	Length: 176	Compression: 1.039773	Avg Compression: 1.126397
Article 90:	Length: 184	Compression: 1.097826	Avg Compression: 1.123144
Article 100:	Length: 184	Compression: 0.896739	Avg Compression: 1.100000
Article 110:	Length: 192	Compression: 0.802083	Avg Compression: 1.071285
Article 120:	Length: 192	Compression: 0.953125	Avg Compression: 1.060897
Article 130:	Length: 200	Compression: 0.945000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.137222	Avg Compression: 0.149279
Article 1120:	Length: 35736	Compression: 0.129113	Avg Compression: 0.148584
Article 1130:	Length: 36448	Compression: 0.118278	Avg Compression: 0.147554
Article 1140:	Length: 37448	Compression: 0.132664	Avg Compression: 0.147052
Article 1150:	Length: 38424	Compression: 0.136842	Avg Compression: 0.146711
Article 1160:	Length: 38680	Compression: 0.128826	Avg Compression: 0.146128
Article 1170:	Length: 40216	Compression: 0.154192	Avg Compression: 0.146393
Article 1180:	Length: 41480	Compression: 0.136933	Avg Compression: 0.146083
Article 1190:	Length: 42616	Compression: 0.151445	Avg Compression: 0.146258
Article 1200:	Length: 43488	Compression: 0.151145	Avg Compression: 0.146414
Article 1210:	Length: 45080	Compression: 0.159894	Avg Compression: 0.146848
Article 1220:	Length: 46864	Compression: 0.139745	Avg Compression: 0.146618
Article 1230:	Length: 48608	Compression: 0.124794	Avg Compression: 0.145909
Article 1240

In [8]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [9]:
model = Model(articles, './training_checkpoints-14',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [10]:
model.training_model(128, 256).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (128, 256)                0         
_________________________________________________________________
embedding (Embedding)        (128, 256, 512)           2072576   
_________________________________________________________________
lstm (LSTM)                  (128, 256, 1024)          6295552   
_________________________________________________________________
lstm_1 (LSTM)                (128, 256, 1024)          8392704   
_________________________________________________________________
dense (Dense)                (128, 256, 4048)          4149200   
Total params: 20,910,032
Trainable params: 20,910,032
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [12]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.703125	Avg Compression: 0.703125
Article 10:	Length: 144	Compression: 0.750000	Avg Compression: 0.727941
Article 20:	Length: 152	Compression: 0.684211	Avg Compression: 0.712264
Article 30:	Length: 160	Compression: 0.687500	Avg Compression: 0.705479
Article 40:	Length: 160	Compression: 0.656250	Avg Compression: 0.694892
Article 50:	Length: 168	Compression: 0.529762	Avg Compression: 0.664474
Article 60:	Length: 168	Compression: 0.607143	Avg Compression: 0.655556
Article 70:	Length: 176	Compression: 0.539773	Avg Compression: 0.639331
Article 80:	Length: 176	Compression: 0.585227	Avg Compression: 0.632682
Article 90:	Length: 184	Compression: 0.614130	Avg Compression: 0.630569
Article 100:	Length: 184	Compression: 0.543478	Avg Compression: 0.621667
Article 110:	Length: 192	Compression: 0.557292	Avg Compression: 0.615462
Article 120:	Length: 192	Compression: 0.640625	Avg Compression: 0.617674
Article 130:	Length: 200	Compression: 0.490000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.169918	Avg Compression: 0.165799
Article 1120:	Length: 35736	Compression: 0.163393	Avg Compression: 0.165716
Article 1130:	Length: 36448	Compression: 0.141215	Avg Compression: 0.164884
Article 1140:	Length: 37448	Compression: 0.167913	Avg Compression: 0.164986
Article 1150:	Length: 37904	Compression: 0.182461	Avg Compression: 0.165563
Article 1160:	Length: 38680	Compression: 0.164736	Avg Compression: 0.165536
Article 1170:	Length: 40216	Compression: 0.183783	Avg Compression: 0.166134
Article 1180:	Length: 41480	Compression: 0.183052	Avg Compression: 0.166687
Article 1190:	Length: 42616	Compression: 0.179651	Avg Compression: 0.167108
Article 1200:	Length: 43488	Compression: 0.171013	Avg Compression: 0.167234
Article 1210:	Length: 45080	Compression: 0.178461	Avg Compression: 0.167595
Article 1220:	Length: 46864	Compression: 0.171027	Avg Compression: 0.167706
Article 1230:	Length: 48608	Compression: 0.147321	Avg Compression: 0.167044
Article 1240

In [13]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50


Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [14]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.882812	Avg Compression: 0.882812
Article 10:	Length: 144	Compression: 0.909722	Avg Compression: 0.897059
Article 20:	Length: 152	Compression: 0.809211	Avg Compression: 0.865566
Article 30:	Length: 160	Compression: 0.937500	Avg Compression: 0.885274
Article 40:	Length: 160	Compression: 0.775000	Avg Compression: 0.861559
Article 50:	Length: 168	Compression: 0.809524	Avg Compression: 0.851974
Article 60:	Length: 168	Compression: 0.696429	Avg Compression: 0.827778
Article 70:	Length: 176	Compression: 0.630682	Avg Compression: 0.800159
Article 80:	Length: 176	Compression: 0.693182	Avg Compression: 0.787011
Article 90:	Length: 184	Compression: 0.657609	Avg Compression: 0.772277
Article 100:	Length: 184	Compression: 0.668478	Avg Compression: 0.761667
Article 110:	Length: 192	Compression: 0.656250	Avg Compression: 0.751506
Article 120:	Length: 192	Compression: 0.812500	Avg Compression: 0.756868
Article 130:	Length: 200	Compression: 0.525000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.113373	Avg Compression: 0.119931
Article 1120:	Length: 35736	Compression: 0.109302	Avg Compression: 0.119564
Article 1130:	Length: 36448	Compression: 0.093613	Avg Compression: 0.118683
Article 1140:	Length: 37448	Compression: 0.117523	Avg Compression: 0.118644
Article 1150:	Length: 37904	Compression: 0.128746	Avg Compression: 0.118977
Article 1160:	Length: 38680	Compression: 0.109643	Avg Compression: 0.118673
Article 1170:	Length: 40216	Compression: 0.123558	Avg Compression: 0.118833
Article 1180:	Length: 41480	Compression: 0.118370	Avg Compression: 0.118818
Article 1190:	Length: 42616	Compression: 0.120330	Avg Compression: 0.118867
Article 1200:	Length: 43488	Compression: 0.109938	Avg Compression: 0.118580
Article 1210:	Length: 45080	Compression: 0.117547	Avg Compression: 0.118547
Article 1220:	Length: 46864	Compression: 0.116230	Avg Compression: 0.118472
Article 1230:	Length: 48608	Compression: 0.100889	Avg Compression: 0.117901
Article 1240

In [15]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [9]:
model = Model(articles, './training_checkpoints-15',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=512,
              rnn_units=1024)

In [11]:
model.training_model(192, 256).summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (192, 256)                0         
_________________________________________________________________
embedding_1 (Embedding)      (192, 256, 512)           2072576   
_________________________________________________________________
gru_2 (GRU)                  (192, 256, 1024)          4724736   
_________________________________________________________________
gru_3 (GRU)                  (192, 256, 1024)          6297600   
_________________________________________________________________
dense_1 (Dense)              (192, 256, 4048)          4149200   
Total params: 17,244,112
Trainable params: 17,244,112
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
  1/203 [..............................] - ETA: 2:46

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [12]:
model.train(192, 256, epochs=48)

Epoch 1/48
Epoch 2/48
  1/203 [..............................] - ETA: 41s

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [None]:
model.train(192, 256, epochs=47)

Epoch 1/47
Epoch 2/47
Epoch 3/47
Epoch 4/47


Моделът забива няколко поредни пъти при обучение. Не успях да дебъгна проблема.

In [14]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units // 2, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units // 2, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [15]:
model = Model(articles, './training_checkpoints-16',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=256,
              rnn_units=1024)

In [16]:
model.training_model(192, 256).summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (192, 256)                0         
_________________________________________________________________
embedding_2 (Embedding)      (192, 256, 256)           1036288   
_________________________________________________________________
lstm_4 (LSTM)                (192, 256, 1024)          5246976   
_________________________________________________________________
lstm_5 (LSTM)                (192, 256, 512)           3147776   
_________________________________________________________________
dense_2 (Dense)              (192, 256, 4048)          2076624   
Total params: 11,507,664
Trainable params: 11,507,664
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 0.820312	Avg Compression: 0.820312
Article 10:	Length: 144	Compression: 0.715278	Avg Compression: 0.764706
Article 20:	Length: 152	Compression: 0.598684	Avg Compression: 0.705189
Article 30:	Length: 160	Compression: 0.556250	Avg Compression: 0.664384
Article 40:	Length: 160	Compression: 0.731250	Avg Compression: 0.678763
Article 50:	Length: 168	Compression: 0.648810	Avg Compression: 0.673246
Article 60:	Length: 168	Compression: 0.642857	Avg Compression: 0.668519
Article 70:	Length: 176	Compression: 0.920455	Avg Compression: 0.703822
Article 80:	Length: 176	Compression: 0.676136	Avg Compression: 0.700419
Article 90:	Length: 184	Compression: 0.586957	Avg Compression: 0.687500
Article 100:	Length: 184	Compression: 0.630435	Avg Compression: 0.681667
Article 110:	Length: 192	Compression: 0.744792	Avg Compression: 0.687751
Article 120:	Length: 192	Compression: 0.552083	Avg Compression: 0.675824
Article 130:	Length: 200	Compression: 0.665000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.203267	Avg Compression: 0.201383
Article 1120:	Length: 36120	Compression: 0.146069	Avg Compression: 0.199456
Article 1130:	Length: 36448	Compression: 0.192219	Avg Compression: 0.199210
Article 1140:	Length: 37448	Compression: 0.187780	Avg Compression: 0.198825
Article 1150:	Length: 38424	Compression: 0.191833	Avg Compression: 0.198591
Article 1160:	Length: 38680	Compression: 0.192218	Avg Compression: 0.198383
Article 1170:	Length: 40216	Compression: 0.212552	Avg Compression: 0.198847
Article 1180:	Length: 41480	Compression: 0.225579	Avg Compression: 0.199721
Article 1190:	Length: 42616	Compression: 0.212878	Avg Compression: 0.200148
Article 1200:	Length: 43488	Compression: 0.213300	Avg Compression: 0.200570
Article 1210:	Length: 45080	Compression: 0.212134	Avg Compression: 0.200942
Article 1220:	Length: 46864	Compression: 0.192493	Avg Compression: 0.200669
Article 1230:	Length: 48608	Compression: 0.175938	Avg Compression: 0.199865
Article 1240

In [19]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50


Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 1.015625	Avg Compression: 1.015625
Article 10:	Length: 144	Compression: 0.909722	Avg Compression: 0.959559
Article 20:	Length: 152	Compression: 0.822368	Avg Compression: 0.910377
Article 30:	Length: 160	Compression: 0.731250	Avg Compression: 0.861301
Article 40:	Length: 160	Compression: 0.962500	Avg Compression: 0.883065
Article 50:	Length: 168	Compression: 0.851190	Avg Compression: 0.877193
Article 60:	Length: 168	Compression: 0.839286	Avg Compression: 0.871296
Article 70:	Length: 176	Compression: 1.215909	Avg Compression: 0.919586
Article 80:	Length: 176	Compression: 0.857955	Avg Compression: 0.912011
Article 90:	Length: 184	Compression: 0.869565	Avg Compression: 0.907178
Article 100:	Length: 184	Compression: 0.929348	Avg Compression: 0.909444
Article 110:	Length: 192	Compression: 0.791667	Avg Compression: 0.898092
Article 120:	Length: 192	Compression: 0.692708	Avg Compression: 0.880037
Article 130:	Length: 200	Compression: 0.870000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.165806	Avg Compression: 0.172358
Article 1120:	Length: 36120	Compression: 0.122231	Avg Compression: 0.170611
Article 1130:	Length: 36448	Compression: 0.152244	Avg Compression: 0.169987
Article 1140:	Length: 37448	Compression: 0.160382	Avg Compression: 0.169664
Article 1150:	Length: 38424	Compression: 0.154877	Avg Compression: 0.169169
Article 1160:	Length: 38680	Compression: 0.151060	Avg Compression: 0.168579
Article 1170:	Length: 40216	Compression: 0.180127	Avg Compression: 0.168958
Article 1180:	Length: 41480	Compression: 0.169600	Avg Compression: 0.168978
Article 1190:	Length: 42616	Compression: 0.186362	Avg Compression: 0.169543
Article 1200:	Length: 43488	Compression: 0.179268	Avg Compression: 0.169855
Article 1210:	Length: 45080	Compression: 0.182010	Avg Compression: 0.170246
Article 1220:	Length: 46864	Compression: 0.164092	Avg Compression: 0.170047
Article 1230:	Length: 48608	Compression: 0.152156	Avg Compression: 0.169466
Article 1240

In [21]:
model.train(192, 256, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [23]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 128	Compression: 1.617188	Avg Compression: 1.617188
Article 10:	Length: 144	Compression: 1.361111	Avg Compression: 1.481618
Article 20:	Length: 152	Compression: 1.236842	Avg Compression: 1.393868
Article 30:	Length: 160	Compression: 1.193750	Avg Compression: 1.339041
Article 40:	Length: 160	Compression: 1.456250	Avg Compression: 1.364247
Article 50:	Length: 168	Compression: 1.220238	Avg Compression: 1.337719
Article 60:	Length: 168	Compression: 1.220238	Avg Compression: 1.319444
Article 70:	Length: 176	Compression: 1.784091	Avg Compression: 1.384554
Article 80:	Length: 176	Compression: 1.232955	Avg Compression: 1.365922
Article 90:	Length: 184	Compression: 1.048913	Avg Compression: 1.329827
Article 100:	Length: 184	Compression: 1.217391	Avg Compression: 1.318333
Article 110:	Length: 192	Compression: 1.192708	Avg Compression: 1.306225
Article 120:	Length: 192	Compression: 0.989583	Avg Compression: 1.278388
Article 130:	Length: 200	Compression: 1.195000	Avg Compression

Article 1110:	Length: 35264	Compression: 0.146665	Avg Compression: 0.164177
Article 1120:	Length: 36120	Compression: 0.109524	Avg Compression: 0.162273
Article 1130:	Length: 36448	Compression: 0.137072	Avg Compression: 0.161417
Article 1140:	Length: 37448	Compression: 0.142010	Avg Compression: 0.160763
Article 1150:	Length: 38424	Compression: 0.136842	Avg Compression: 0.159963
Article 1160:	Length: 38680	Compression: 0.130584	Avg Compression: 0.159006
Article 1170:	Length: 40216	Compression: 0.161055	Avg Compression: 0.159073
Article 1180:	Length: 41480	Compression: 0.143852	Avg Compression: 0.158576
Article 1190:	Length: 42616	Compression: 0.173268	Avg Compression: 0.159053
Article 1200:	Length: 43488	Compression: 0.169012	Avg Compression: 0.159373
Article 1210:	Length: 45080	Compression: 0.162866	Avg Compression: 0.159485
Article 1220:	Length: 46864	Compression: 0.145805	Avg Compression: 0.159042
Article 1230:	Length: 48608	Compression: 0.134443	Avg Compression: 0.158243
Article 1240

Article 1810:	Length: 251304	Compression: 0.162357	Avg Compression: 0.140708
Article 1820:	Length: 268336	Compression: 0.153893	Avg Compression: 0.141133
Article 1830:	Length: 283192	Compression: 0.134096	Avg Compression: 0.140901
Article 1840:	Length: 307368	Compression: 0.161201	Avg Compression: 0.141600
Article 1850:	Length: 323168	Compression: 0.142703	Avg Compression: 0.141639
Article 1860:	Length: 341176	Compression: 0.150084	Avg Compression: 0.141939
Article 1870:	Length: 382048	Compression: 0.148002	Avg Compression: 0.142171
Article 1880:	Length: 441872	Compression: 0.122787	Avg Compression: 0.141349
Article 1890:	Length: 508896	Compression: 0.146672	Avg Compression: 0.141597


Странно е, че за малки файлове компресията е толкова по-лоша. Да опитаме винаги да използваме първия символ некомпресиран.

In [41]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)
    
    def archive_size(self, model, text):
        archived_size = 14
        input_eval = np.array([[text[0]]], dtype=TYPE)

        model.predicting_model.reset_states()

        for byte in text[1:]:
            predictions = model.predict(input_eval)
            predictions = tf.squeeze(predictions, 0) # remove the batch dimension

            weights = tf.nn.softmax(predictions[0]).numpy()
            self.load_weights(weights)
            archived_size += self.get_code_length(byte.item())

            input_eval = tf.expand_dims([byte], 0)

        return archived_size

In [42]:
total_raw = 0
total_compressed = 0

huffman = Huffman(subword_text_encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 10 == 0:
        article = subword_text_encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 16	Avg Compression: 0.195312
Article 10:	Length: 18	Avg Compression: 0.194853
Article 20:	Length: 19	Avg Compression: 0.188679
Article 30:	Length: 20	Avg Compression: 0.184932
Article 40:	Length: 20	Avg Compression: 0.181452
Article 50:	Length: 21	Avg Compression: 0.178728
Article 60:	Length: 21	Avg Compression: 0.176852
Article 70:	Length: 22	Avg Compression: 0.187102
Article 80:	Length: 22	Avg Compression: 0.185056
Article 90:	Length: 23	Avg Compression: 0.184406
Article 100:	Length: 23	Avg Compression: 0.183333
Article 110:	Length: 24	Avg Compression: 0.188253
Article 120:	Length: 24	Avg Compression: 0.184982
Article 130:	Length: 25	Avg Compression: 0.184564
Article 140:	Length: 25	Avg Compression: 0.184598
Article 150:	Length: 26	Avg Compression: 0.180874
Article 160:	Length: 26	Avg Compression: 0.177333
Article 170:	Length: 27	Avg Compression: 0.175684
Article 180:	Length: 27	Avg Compression: 0.172203
Article 190:	Length: 28	Avg Compression: 0.170131
Article 200

Article 1590:	Length: 14498	Avg Compression: 0.127394
Article 1600:	Length: 14754	Avg Compression: 0.127406
Article 1610:	Length: 15109	Avg Compression: 0.126944
Article 1620:	Length: 15406	Avg Compression: 0.126757
Article 1630:	Length: 15644	Avg Compression: 0.126720
Article 1640:	Length: 16120	Avg Compression: 0.126298
Article 1650:	Length: 16602	Avg Compression: 0.126138
Article 1660:	Length: 16823	Avg Compression: 0.125653
Article 1670:	Length: 17684	Avg Compression: 0.125502
Article 1680:	Length: 18309	Avg Compression: 0.125229
Article 1690:	Length: 18980	Avg Compression: 0.125401
Article 1700:	Length: 19631	Avg Compression: 0.125847
Article 1710:	Length: 20178	Avg Compression: 0.126605
Article 1720:	Length: 20526	Avg Compression: 0.127528
Article 1730:	Length: 21808	Avg Compression: 0.128492
Article 1740:	Length: 23634	Avg Compression: 0.128349
Article 1750:	Length: 24385	Avg Compression: 0.128701
Article 1760:	Length: 25324	Avg Compression: 0.129135
Article 1770:	Length: 26645	

............................

In [6]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [7]:
model = Model(articles, './training_checkpoints-17',
              vocab_size = subword_text_encoder.vocab_size,
              embedding_dim=256,
              rnn_units=1024)

In [8]:
model.training_model(192, 256).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (192, 256)                0         
_________________________________________________________________
embedding (Embedding)        (192, 256, 256)           1036288   
_________________________________________________________________
lstm (LSTM)                  (192, 256, 1024)          5246976   
_________________________________________________________________
lstm_1 (LSTM)                (192, 256, 1024)          8392704   
_________________________________________________________________
dense (Dense)                (192, 256, 4048)          4149200   
Total params: 18,825,168
Trainable params: 18,825,168
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.train(192, 256, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


In [None]:
model.train(192, 256, epochs=71)

Epoch 1/71
Epoch 2/71


In [None]:
model.train(192, 192, epochs=70)

Epoch 1/70
Epoch 2/70
