In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_2046')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
articles = Articles('page_revisions_text')

In [4]:
steps = 15

for i in range(steps):
    batch_size = 2 * 2**i
    batch_item_length = 2 * 2**(steps - i - 1)
    count = articles.steps(batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      2	 batch item length: 32768	steps per epoch:  93929
batch size:      4	 batch item length: 16384	steps per epoch:  51451
batch size:      8	 batch item length: 8192	steps per epoch:  27760
batch size:     16	 batch item length: 4096	steps per epoch:  15763
batch size:     32	 batch item length: 2048	steps per epoch:  10129
batch size:     64	 batch item length: 1024	steps per epoch:   7745
batch size:    128	 batch item length:  512	steps per epoch:   7017
batch size:    256	 batch item length:  256	steps per epoch:   7168
batch size:    512	 batch item length:  128	steps per epoch:   8259
batch size:   1024	 batch item length:   64	steps per epoch:  10861
batch size:   2048	 batch item length:   32	steps per epoch:  16331
batch size:   4096	 batch item length:   16	steps per epoch:  27522
batch size:   8192	 batch item length:    8	steps per epoch:  50147
batch size:  16384	 batch item length:    4	steps per epoch:  95507
batch size:  32768	 batch item length:    2	st

In [5]:
BATCH_SIZE = 16384
BATCHED_ITEM_LENGTH = 2

def build_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[BATCH_SIZE, BATCHED_ITEM_LENGTH]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units // 4 * 4, return_sequences=True, stateful=True, unroll=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units // 4 * 3, return_sequences=True, stateful=True, unroll=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units // 4 * 2, return_sequences=True, stateful=True, unroll=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

checkpoint_dir = './training_checkpoints-6' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

In [6]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=32, rnn_units=1024)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (16384, 2)                0         
_________________________________________________________________
embedding (Embedding)        (16384, 2, 32)            65184     
_________________________________________________________________
gru (GRU)                    (16384, 2, 1024)          3250176   
_________________________________________________________________
gru_1 (GRU)                  (16384, 2, 768)           4133376   
_________________________________________________________________
gru_2 (GRU)                  (16384, 2, 512)           1969152   
_________________________________________________________________
dense (Dense)                (16384, 2, 2037)          1044981   
Total params: 10,462,869
Trainable params: 10,462,869
Non-trainable params: 0
____________________________________________

In [7]:
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x213d6736a08>

In [8]:
model.fit(dataset, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x213d6751888>

In [10]:
def build_predicting_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units // 4 * 4, stateful=True, return_sequences=True),
        tf.keras.layers.GRU(rnn_units // 4 * 3, stateful=True, return_sequences=True),
        tf.keras.layers.GRU(rnn_units // 4 * 2, stateful=True, return_sequences=True),
        tf.keras.layers.Dense(vocab_size),
    ])

model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=32, rnn_units=1024)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [11]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [12]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [16]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 1000:	Length: 144	Compression: 11.562500	Avg Compression: 11.562500
Article 2000:	Length: 152	Compression: 11.006579	Avg Compression: 11.277027
Article 3000:	Length: 152	Compression: 11.105263	Avg Compression: 11.218750
Article 4000:	Length: 168	Compression: 10.119048	Avg Compression: 10.918831
Article 5000:	Length: 168	Compression: 9.940476	Avg Compression: 10.709184
Article 6000:	Length: 176	Compression: 19.596591	Avg Compression: 12.338542
Article 7000:	Length: 176	Compression: 9.619318	Avg Compression: 11.917254
Article 8000:	Length: 176	Compression: 9.500000	Avg Compression: 11.592988
Article 9000:	Length: 184	Compression: 9.309783	Avg Compression: 11.312166
Article 10000:	Length: 184	Compression: 9.217391	Avg Compression: 11.082738
Article 11000:	Length: 192	Compression: 18.062500	Avg Compression: 11.798611
Article 12000:	Length: 192	Compression: 17.989583	Avg Compression: 12.374516
Article 13000:	Length: 200	Compression: 17.295000	Avg Compression: 12.809187
Article 14000

Article 109000:	Length: 17528	Compression: 0.565838	Avg Compression: 0.660689
Article 110000:	Length: 18032	Compression: 0.386757	Avg Compression: 0.650088
Article 111000:	Length: 18480	Compression: 0.420292	Avg Compression: 0.641321
Article 112000:	Length: 18496	Compression: 0.524059	Avg Compression: 0.637008
Article 113000:	Length: 19312	Compression: 0.401305	Avg Compression: 0.628292
Article 114000:	Length: 19672	Compression: 0.208469	Avg Compression: 0.613051
Article 115000:	Length: 19848	Compression: 0.222088	Avg Compression: 0.599237
Article 116000:	Length: 20024	Compression: 0.180184	Avg Compression: 0.584813
Article 117000:	Length: 20096	Compression: 0.182325	Avg Compression: 0.571374
Article 118000:	Length: 20176	Compression: 0.213125	Avg Compression: 0.559754
Article 119000:	Length: 20272	Compression: 0.158100	Avg Compression: 0.547077
Article 120000:	Length: 20408	Compression: 0.201098	Avg Compression: 0.536423
Article 121000:	Length: 20544	Compression: 0.209696	Avg Compress

Article 214000:	Length: 177264	Compression: 0.425354	Avg Compression: 0.433760
Article 215000:	Length: 193048	Compression: 0.449090	Avg Compression: 0.434297
Article 216000:	Length: 212904	Compression: 0.430419	Avg Compression: 0.434153
Article 217000:	Length: 238208	Compression: 0.423525	Avg Compression: 0.433728
Article 218000:	Length: 274168	Compression: 0.433730	Avg Compression: 0.433728
Article 219000:	Length: 329056	Compression: 0.416333	Avg Compression: 0.432856
Article 220000:	Length: 456256	Compression: 0.639783	Avg Compression: 0.446304


In [6]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=32, rnn_units=1024)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (16384, 2)                0         
_________________________________________________________________
embedding (Embedding)        (16384, 2, 32)            65184     
_________________________________________________________________
gru (GRU)                    (16384, 2, 1024)          3250176   
_________________________________________________________________
gru_1 (GRU)                  (16384, 2, 768)           4133376   
_________________________________________________________________
gru_2 (GRU)                  (16384, 2, 512)           1969152   
_________________________________________________________________
dense (Dense)                (16384, 2, 2037)          1044981   
Total params: 10,462,869
Trainable params: 10,462,869
Non-trainable params: 0
____________________________________________

In [7]:
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x209a78ef208>

In [13]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 1000:	Length: 144	Compression: 1.958333	Avg Compression: 1.958333
Article 2000:	Length: 152	Compression: 2.059211	Avg Compression: 2.010135
Article 3000:	Length: 160	Compression: 2.000000	Avg Compression: 2.006579
Article 4000:	Length: 168	Compression: 1.880952	Avg Compression: 1.972756
Article 5000:	Length: 168	Compression: 6.202381	Avg Compression: 2.869949
Article 6000:	Length: 176	Compression: 1.784091	Avg Compression: 2.672521
Article 7000:	Length: 176	Compression: 1.750000	Avg Compression: 2.530594
Article 8000:	Length: 176	Compression: 1.698864	Avg Compression: 2.419697
Article 9000:	Length: 184	Compression: 1.750000	Avg Compression: 2.337766
Article 10000:	Length: 184	Compression: 1.657609	Avg Compression: 2.263626
Article 11000:	Length: 192	Compression: 1.614583	Avg Compression: 2.197340
Article 12000:	Length: 192	Compression: 1.609375	Avg Compression: 2.142857
Article 13000:	Length: 200	Compression: 1.605000	Avg Compression: 2.095511
Article 14000:	Length: 200	Compres

Article 110000:	Length: 18000	Compression: 0.477333	Avg Compression: 0.543160
Article 111000:	Length: 18424	Compression: 0.455601	Avg Compression: 0.539829
Article 112000:	Length: 18920	Compression: 0.419715	Avg Compression: 0.535314
Article 113000:	Length: 19104	Compression: 0.589405	Avg Compression: 0.537292
Article 114000:	Length: 19672	Compression: 0.288684	Avg Compression: 0.528271
Article 115000:	Length: 19848	Compression: 0.599960	Avg Compression: 0.530803
Article 116000:	Length: 20024	Compression: 0.297293	Avg Compression: 0.522769
Article 117000:	Length: 20096	Compression: 0.316083	Avg Compression: 0.515870
Article 118000:	Length: 20176	Compression: 0.318497	Avg Compression: 0.509470
Article 119000:	Length: 20272	Compression: 0.314769	Avg Compression: 0.503327
Article 120000:	Length: 20408	Compression: 0.339965	Avg Compression: 0.498298
Article 121000:	Length: 20544	Compression: 0.350516	Avg Compression: 0.493856
Article 122000:	Length: 20664	Compression: 0.317944	Avg Compress

Article 215000:	Length: 193048	Compression: 0.439632	Avg Compression: 0.439498
Article 216000:	Length: 212904	Compression: 0.422289	Avg Compression: 0.438858
Article 217000:	Length: 238072	Compression: 0.426522	Avg Compression: 0.438365
Article 218000:	Length: 274168	Compression: 0.406561	Avg Compression: 0.436966
Article 219000:	Length: 329056	Compression: 0.387226	Avg Compression: 0.434472
Article 220000:	Length: 456256	Compression: 0.624904	Avg Compression: 0.446851
