In [1]:
import os

import tensorflow as tf
import numpy as np

In [2]:
BATCH_SIZE = 64
BATCHED_ITEM_LENGTH = 128
BUFFER_SIZE = 256

with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

articles = sorted(data.split(b'\0')[:2000], key=len)

def articles_generator():
    for index, article in enumerate(articles):
        yield np.frombuffer(article + b'\0', dtype=np.uint8)

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    index += 1
    while index % BATCH_SIZE != 0:
        yield np.frombuffer(b'\0', dtype=np.uint8)
        index += 1
        
def subbatches():
    dataset = tf.data.Dataset.from_generator(articles_generator, output_types=tf.uint8)
    dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)

    for batch in dataset.as_numpy_iterator():
        remaining = batch
        while remaining.shape[1] > 1:
            yield remaining[:, :BATCHED_ITEM_LENGTH]
            remaining = remaining[:, BATCHED_ITEM_LENGTH-1:]

dataset = tf.data.Dataset.from_generator(subbatches, output_types=tf.uint8, output_shapes=(BATCH_SIZE, None))
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((64, None), (64, None)), types: (tf.uint8, tf.uint8)>

In [3]:
vocab_size = 256
embedding_dim = 256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

In [8]:
from tensorflow.python.eager import context
from tensorflow.python.keras import backend_config
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables as variables_module

epsilon = backend_config.epsilon

def get_graph():
    if context.executing_eagerly():
        global _GRAPH
        if _GRAPH is None:
            _GRAPH = func_graph.FuncGraph('keras_graph')
        return _GRAPH
    else:
        return ops.get_default_graph()

def flatten(x):
    return array_ops.reshape(x, [-1])

def cast(x, dtype):
    return math_ops.cast(x, dtype)
  
def _is_symbolic_tensor(x):
    return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)

# This is based around the `sparse_categorical_crossentropy` implementation in Keras:
# https://github.com/tensorflow/tensorflow/blob/v2.1.0/tensorflow/python/keras/backend.py#L4507-L4582
def loss(target, output, from_logits=False, axis=-1):
    if not from_logits:
        if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or output.op.type != 'Softmax'):
            epsilon_ = constant_op.constant(epsilon(), dtype=output.dtype.base_dtype)
            output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
            output = math_ops.log(output)
        else:
            # When softmax activation function is used for output operation, we
            # use logits from the softmax function directly to compute loss in order
            # to prevent collapsing zero when training.
            # See b/117284466
            assert len(output.op.inputs) == 1
            output = output.op.inputs[0]
  
    if isinstance(output.shape, (tuple, list)):
        output_rank = len(output.shape)
    else:
        output_rank = output.shape.ndims

    if output_rank is not None:
        axis %= output_rank
        if axis != output_rank - 1:
            permutation = list(itertools.chain(range(axis), range(axis + 1, output_rank), [axis]))
            output = array_ops.transpose(output, perm=permutation)
    elif axis != -1:
        raise ValueError(
            'Cannot compute sparse categorical crossentropy with `axis={}` on an '
            'output tensor with unknown rank'.format(axis))
  
    target = cast(target, 'int64')
  
    # Try to adjust the shape so that rank of labels = rank of logits - 1.
    output_shape = array_ops.shape_v2(output)
    target_rank = target.shape.ndims
  
    update_shape = (target_rank is not None and output_rank is not None and target_rank != output_rank - 1)
    if update_shape:
        target = flatten(target)
        output = array_ops.reshape(output, [-1, output_shape[-1]])
  
    if __builtins__.any([_is_symbolic_tensor(v) for v in [target, output]]):
        with get_graph().as_default():
            res = huffman_code_lengths(labels=target, logits=output)
    else:
        res = huffman_code_lengths(labels=target, logits=output)
  
    if update_shape and output_rank >= 3:
        # If our output includes timesteps or spatial dimensions we need to reshape
        return array_ops.reshape(res, output_shape[:-1])
    else:
        return res

def huffman_code_lengths(labels, logits):
    category_count = logits.shape[-1] or 0
    return tf.reduce_sum(-tf.math.log(tf.one_hot(labels, depth=category_count) * tf.nn.softmax(logits) + 0.0001), axis=-1)

In [33]:
def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

In [34]:
checkpoint_dir = './training_checkpoints' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [35]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        total_length = int(round(logs['average_batch_length'] * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH - 1:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

In [38]:
for learning_rate in [0.1, 0.3, 1, 3, 10]:
    print('Learning rate %f:' % learning_rate)
    optimizer = tf.keras.optimizers.Adadelta(learning_rate=learning_rate)

    model = build_model(vocab_size = vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)
    model.compile(optimizer=optimizer, loss=loss, metrics=[average_batch_length])
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Learning rate 0.100000:
Learning rate 0.300000:
Learning rate 1.000000:
Learning rate 3.000000:
Learning rate 10.000000:


In [39]:
for learning_rate in [30, 100, 300]:
    print('Learning rate %f:' % learning_rate)
    optimizer = tf.keras.optimizers.Adadelta(learning_rate=learning_rate)

    model = build_model(vocab_size = vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)
    model.compile(optimizer=optimizer, loss=loss, metrics=[average_batch_length])
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Learning rate 30.000000:
Learning rate 100.000000:
Learning rate 300.000000:


In [49]:
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

articles = data.split(b'\0')[:2000]
del data

article = articles[151]
del articles

len(article)

7047

In [50]:
import huffman

def huffman_archive_size(model, text):
    archived_size = 0
    ones = 0
    input_eval = [s for s in b' ']
    input_eval = tf.expand_dims(input_eval, 0)
  
    # Empty string to store our results
    text_generated = []

    model.reset_states()

    for byte in text:
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
  
        probabilities = tf.nn.softmax(predictions[0])
        codebook = huffman.codebook([index, tensor.numpy()] for index, tensor in enumerate(probabilities))

        code = codebook[byte]
        ones += code.count('1')
        archived_size += len(code)

        input_eval = tf.expand_dims([byte], 0)
  
    return ones, archived_size

In [51]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [52]:
ones, archived_size = huffman_archive_size(model, article)
print('\nTotal length:', archived_size)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio:', compression_ratio)

k = (ones / archived_size)
arithmetic_compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', arithmetic_compression_ratio)


Total length: 121289
Compression ratio: 2.151429686391372
Potential compression ratio with arithmetic coding: 2.115375461462686
