In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
def articles():
    with open('page_revisions_text', 'rb') as text_file:
        pending_article_data = b''
        while True:
            data = text_file.read(1024 * 1024)
            if len(data) == 0:
                break

            articles = data.split(b'\0')
            articles[0] = pending_article_data + articles[0]
            for index, article in enumerate(articles):
                if index + 1 == len(articles):
                    pending_article_data = article
                else:
                    yield article

        print(pending_article_data)
        if len(pending_article_data) != 0:
            yield pending_article_data

In [3]:
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

Да видим как би изглеждало обучение с кодираните статии...

In [4]:
import os

import numpy as np
import itertools

In [5]:
BATCH_SIZE = 64
BATCHED_ITEM_LENGTH = 128
BUFFER_SIZE = 256
TYPE=np.uint16

def articles_generator():
    for index, article in enumerate(itertools.islice(articles(), 0, 2000)):
        yield np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    index += 1
    while index % BATCH_SIZE != 0:
        yield np.array([0], dtype=TYPE)
        index += 1

def subbatches():
    dataset = tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
    dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)

    for batch in dataset.as_numpy_iterator():
        remaining = batch
        while remaining.shape[1] > 1:
            yield remaining[:, :BATCHED_ITEM_LENGTH]
            remaining = remaining[:, BATCHED_ITEM_LENGTH-1:]

dataset = tf.data.Dataset.from_generator(subbatches, output_types=TYPE, output_shapes=(BATCH_SIZE, None))
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((64, None), (64, None)), types: (tf.uint16, tf.uint16)>

In [6]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

In [7]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=512, batch_size=BATCH_SIZE)
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [8]:
checkpoint_dir = './training_checkpoints' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [9]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH - 1:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

In [31]:
total_epochs = 17

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17


In [32]:
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

article = data.split(b'\0')[120]
del data

encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

print('Raw:', len(article))
print('Encoded:', len(encoded_article))

Raw: 25541
Encoded: 8222


In [345]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [332]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [333]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=512, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [335]:
archived_size, zeros = huffman_archive_size(model, encoded_article)
print('Compressed length:', archived_size)

compression_ratio = archived_size / (len(encoded_article) * 8)
print('Compression ratio for encoded:', compression_ratio)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio for raw:', compression_ratio)

k = (zeros / archived_size)
compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

Compressed length: 41754
Compression ratio for encoded: 0.6347908051568961
Compression ratio for raw: 0.20434791120159743
Potential compression ratio with arithmetic coding: 0.20341159341828602


Ще се възползваме от натренирания модел и реалните данни, за да опитаме да направим микрооптимизации по C++ кода за дървета на Хъфман.

In [338]:
import time
import contextlib

@contextlib.contextmanager
def time_measure():
    start = time.perf_counter()
    yield
    end = time.perf_counter()
    print('Completed in %f seconds' % (end - start))

with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

Completed in 58.676098 seconds
Completed in 58.564450 seconds
Completed in 58.636290 seconds


Тъй като Windows е малко капризен на тема заредени файлове, ще трябва да освободим C++ библиотеката преди да я рекомпилираме.

In [342]:
ctypes.windll.kernel32.FreeLibrary(ctypes.c_void_p(Huffman.huffman._handle))

0

Правим промяна в C++ кода - ползваме битови полета, за да смъкнем размера на възлите в дървото от 12 байта до 8 байта. Така би трябвало да се възползваме по-ефективно от кеша на процесора. След това презареждаме и тестваме...

In [343]:
class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [346]:
with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

Completed in 58.123267 seconds
Completed in 58.124431 seconds
Completed in 57.691206 seconds


Маргинално по-добре. Но все пак - по-добре.

Ще опитаме една малко по-странна оптимизация. При инициализация на тегла, за категории с вероятност над 50% от оставащата, ще слагаме възли в корена на дървото на Хъфман. Останалата част от дървото ще генерираме мързеливо при пъвво поискване.

In [393]:
ctypes.windll.kernel32.FreeLibrary(ctypes.c_void_p(Huffman.huffman._handle))

0

In [394]:
class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')

    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p

    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)

    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))

    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [388]:
with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

with time_measure():
    huffman_archive_size(model, encoded_article)

Completed in 57.132601 seconds
Completed in 57.169302 seconds
Completed in 57.840507 seconds


Отново - не твърде впечатляващо. Но пък тази оптимизация, би станала по-осезаема при по-добре тренирани модели.

Това беше невпечатляващо. Нека се върнем към тренирането на модела.

In [10]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=512, batch_size=BATCH_SIZE)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [11]:
total_epochs = 10

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
total_epochs = 5

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
total_epochs = 5

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=512, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [20]:
archived_size, zeros = huffman_archive_size(model, encoded_article)
print('Compressed length:', archived_size)

compression_ratio = archived_size / (len(encoded_article) * 8)
print('Compression ratio for encoded:', compression_ratio)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio for raw:', compression_ratio)

k = (zeros / archived_size)
compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

Compressed length: 37199
Compression ratio for encoded: 0.565540622719533
Compression ratio for raw: 0.1820553228142986
Potential compression ratio with arithmetic coding: 0.18082959868885462


Какво ли би станало, ако сменим loss функцията в движение?

In [24]:
from tensorflow.python.eager import context
from tensorflow.python.keras import backend_config
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables as variables_module

epsilon = backend_config.epsilon

def get_graph():
    if context.executing_eagerly():
        global _GRAPH
        if _GRAPH is None:
            _GRAPH = func_graph.FuncGraph('keras_graph')
        return _GRAPH
    else:
        return ops.get_default_graph()

def flatten(x):
    return array_ops.reshape(x, [-1])

def cast(x, dtype):
    return math_ops.cast(x, dtype)
  
def _is_symbolic_tensor(x):
    return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)

# This is based around the `sparse_categorical_crossentropy` implementation in Keras:
# https://github.com/tensorflow/tensorflow/blob/v2.1.0/tensorflow/python/keras/backend.py#L4507-L4582
def loss(target, output, from_logits=False, axis=-1):
    if not from_logits:
        if (isinstance(output, (ops.EagerTensor, variables_module.Variable)) or output.op.type != 'Softmax'):
            epsilon_ = constant_op.constant(epsilon(), dtype=output.dtype.base_dtype)
            output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
            output = math_ops.log(output)
        else:
            # When softmax activation function is used for output operation, we
            # use logits from the softmax function directly to compute loss in order
            # to prevent collapsing zero when training.
            # See b/117284466
            assert len(output.op.inputs) == 1
            output = output.op.inputs[0]
  
    if isinstance(output.shape, (tuple, list)):
        output_rank = len(output.shape)
    else:
        output_rank = output.shape.ndims

    if output_rank is not None:
        axis %= output_rank
        if axis != output_rank - 1:
            permutation = list(itertools.chain(range(axis), range(axis + 1, output_rank), [axis]))
            output = array_ops.transpose(output, perm=permutation)
    elif axis != -1:
        raise ValueError(
            'Cannot compute sparse categorical crossentropy with `axis={}` on an '
            'output tensor with unknown rank'.format(axis))
  
    target = cast(target, 'int64')
  
    # Try to adjust the shape so that rank of labels = rank of logits - 1.
    output_shape = array_ops.shape_v2(output)
    target_rank = target.shape.ndims
  
    update_shape = (target_rank is not None and output_rank is not None and target_rank != output_rank - 1)
    if update_shape:
        target = flatten(target)
        output = array_ops.reshape(output, [-1, output_shape[-1]])
  
    if __builtins__.any([_is_symbolic_tensor(v) for v in [target, output]]):
        with get_graph().as_default():
            res = huffman_code_lengths(labels=target, logits=output)
    else:
        res = huffman_code_lengths(labels=target, logits=output)
  
    if update_shape and output_rank >= 3:
        # If our output includes timesteps or spatial dimensions we need to reshape
        return array_ops.reshape(res, output_shape[:-1])
    else:
        return res

def huffman_code_lengths(labels, logits):
    category_count = logits.shape[-1] or 0
    return tf.reduce_mean(-tf.math.log(tf.one_hot(labels, depth=category_count) * tf.nn.softmax(logits) + 0.0001), axis=-1)

In [25]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=512, batch_size=BATCH_SIZE)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [26]:
model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])



<tensorflow.python.keras.callbacks.History at 0x143185dc348>

Нищо. Отново. Loss-а не се промени въобще.