In [1]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()
    
articles = data.split(b'\0')
del data

print('Nummber of articles:', len(articles))
print('Nummber of bytes in articles:', sum(len(a) for a in articles))

Nummber of articles: 243427
Nummber of bytes in articles: 887891160


In [3]:
# The unique bytes in the file
vocab = sorted(set(b''.join(articles)))
print ('{} unique characters'.format(len(vocab)))
del vocab

206 unique characters


In [4]:
text_as_int = np.frombuffer(b'. '.join(articles[:2000]), dtype=np.uint8)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [5]:
seq_length = 500

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [6]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [7]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 500), (64, 500)), types: (tf.uint8, tf.uint8)>

In [8]:
# Length of the vocabulary in chars
vocab_size = 256

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [9]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [10]:
model = build_model(
  vocab_size = 256,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [11]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [12]:
model.compile(optimizer='adam', loss=loss)

In [13]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [14]:
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [15]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_1'

In [19]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [20]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            65536     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 256)            262400    
Total params: 4,266,240
Trainable params: 4,266,240
Non-trainable params: 0
_________________________________________________________________


In [21]:
import huffman

def huffman_archive_size(model, text):
    archived_size = 0
    ones = 0
    input_eval = [s for s in b' ']
    input_eval = tf.expand_dims(input_eval, 0)
  
    # Empty string to store our results
    text_generated = []

    model.reset_states()

    for byte in text:
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
  
        probabilities = tf.nn.softmax(predictions[0])
        codebook = huffman.codebook([index, tensor.numpy()] for index, tensor in enumerate(probabilities))

        code = codebook[byte]
        ones += code.count('1')
        archived_size += len(code)

        # using a categorical distribution to predict the byte returned by the model
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        input_eval = tf.expand_dims([byte], 0)
  
    return ones, archived_size

article = articles[120]
ones, archived_size = huffman_archive_size(model, article)
print('\nTotal length:', archived_size)


Total length: 74824


In [22]:
compression_ratio = archived_size / (len(article) * 8)
compression_ratio

0.36619552875768374

In [23]:
k = (ones / archived_size)
compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))

0.3613850212453604

## Да опитаме с LSTM

In [29]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [30]:
model = build_model(
  vocab_size = 256,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [31]:
model.compile(optimizer='adam', loss=loss)

In [32]:
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [33]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Да опитраме с различни оптимизатори

In [156]:
model = build_model(
  vocab_size = 256,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

from tensorflow.python.eager import context
from tensorflow.python.keras import backend_config
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import nn
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables as variables_module

epsilon = backend_config.epsilon

def get_graph():
    if context.executing_eagerly():
        global _GRAPH
        if _GRAPH is None:
            _GRAPH = func_graph.FuncGraph('keras_graph')
        return _GRAPH
    else:
        return ops.get_default_graph()

def flatten(x):
    return array_ops.reshape(x, [-1])

def cast(x, dtype):
    return math_ops.cast(x, dtype)
  
def _is_symbolic_tensor(x):
    return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)

# This is based around the `sparse_categorical_crossentropy` implementation in Keras:
# https://github.com/tensorflow/tensorflow/blob/v2.1.0/tensorflow/python/keras/backend.py#L4507-L4582
def loss(target, output, from_logits=False, axis=-1):
    if not from_logits:
        if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or output.op.type != 'Softmax'):
            epsilon_ = constant_op.constant(epsilon(), dtype=output.dtype.base_dtype)
            output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
            output = math_ops.log(output)
        else:
            # When softmax activation function is used for output operation, we
            # use logits from the softmax function directly to compute loss in order
            # to prevent collapsing zero when training.
            # See b/117284466
            assert len(output.op.inputs) == 1
            output = output.op.inputs[0]
  
    if isinstance(output.shape, (tuple, list)):
        output_rank = len(output.shape)
    else:
        output_rank = output.shape.ndims

    if output_rank is not None:
        axis %= output_rank
        if axis != output_rank - 1:
            permutation = list(itertools.chain(range(axis), range(axis + 1, output_rank), [axis]))
            output = array_ops.transpose(output, perm=permutation)
    elif axis != -1:
        raise ValueError(
            'Cannot compute sparse categorical crossentropy with `axis={}` on an '
            'output tensor with unknown rank'.format(axis))
  
    target = cast(target, 'int64')
  
    # Try to adjust the shape so that rank of labels = rank of logits - 1.
    output_shape = array_ops.shape_v2(output)
    target_rank = target.shape.ndims
  
    update_shape = (target_rank is not None and output_rank is not None and target_rank != output_rank - 1)
    if update_shape:
        target = flatten(target)
        output = array_ops.reshape(output, [-1, output_shape[-1]])
  
    if __builtins__.any([_is_symbolic_tensor(v) for v in [target, output]]):
        with get_graph().as_default():
            res = huffman_code_lengths(labels=target, logits=output)
    else:
        res = huffman_code_lengths(labels=target, logits=output)
  
    if update_shape and output_rank >= 3:
        # If our output includes timesteps or spatial dimensions we need to reshape
        return array_ops.reshape(res, output_shape[:-1])
    else:
        return res

def huffman_code_lengths(labels, logits):
    category_count = logits.shape[-1] or 0
    return tf.reduce_sum(-tf.math.log(tf.one_hot(labels, depth=category_count) * tf.nn.softmax(logits) + 0.0001), axis=-1)
    
model.compile(optimizer='adam', loss=loss)

history = model.fit(dataset, epochs=1, callbacks=[checkpoint_callback])

Train for 438 steps


In [157]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [159]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [162]:
article = articles[120][:500]
ones, archived_size = huffman_archive_size(model, article)
print('Total length:', archived_size)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio:', compression_ratio)

k = (ones / archived_size)
compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

Total length: 3015
Compression ratio: 0.75375
Potential compression ratio with arithmetic coding: 0.75375


In [165]:
epsilon = backend_config.epsilon

def huffman_code_lengths(labels, logits):
    category_count = logits.shape[-1] or 0
    return tf.reduce_sum(tf.one_hot(labels, depth=category_count) * tf.nn.softmax(logits), axis=-1)

model = build_model(
  vocab_size = 256,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

model.compile(optimizer='adam', loss=loss)

history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Train for 438 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [167]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [169]:
article = articles[120][:500]
ones, archived_size = huffman_archive_size(model, article)
print('Total length:', archived_size)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio:', compression_ratio)

k = (ones / archived_size)
compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

Total length: 4551
Compression ratio: 1.13775
Potential compression ratio with arithmetic coding: 1.13775
