In [1]:
import itertools
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        articles = set(data.split(b'\0'))
        articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in articles]
        self.articles = sorted(articles, key=len)

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            yield remaining
            if remaining.shape[1] == batch_length + 1:
                yield np.zeros((batch_size, 2), dtype=TYPE)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
articles = Articles('page_revisions_text')

In [4]:
import math

def batch_count(articles, batch_size, batched_item_length):
    return sum(math.ceil(len(x) / batched_item_length) for i, x in enumerate(articles.articles_generator(batch_size, batched_item_length)) if (i + 1) % batch_size == 0)

In [5]:
batch_count(articles, 4096, 12)

31302

In [6]:
steps = 12

for i in range(steps):
    batch_size = 6 * 2**i
    batch_item_length = 4 * 2**(steps - i - 1)
    count = batch_count(articles, batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      6	 batch item length: 8192	steps per epoch:  36599
batch size:     12	 batch item length: 4096	steps per epoch:  20420
batch size:     24	 batch item length: 2048	steps per epoch:  12715
batch size:     48	 batch item length: 1024	steps per epoch:   9349
batch size:     96	 batch item length:  512	steps per epoch:   8214
batch size:    192	 batch item length:  256	steps per epoch:   8075
batch size:    384	 batch item length:  128	steps per epoch:   8856
batch size:    768	 batch item length:   64	steps per epoch:  11003
batch size:   1536	 batch item length:   32	steps per epoch:  15605
batch size:   3072	 batch item length:   16	steps per epoch:  24999
batch size:   6144	 batch item length:    8	steps per epoch:  43949
batch size:  12288	 batch item length:    4	steps per epoch:  82067


In [15]:
BATCH_SIZE = 1536
BATCHED_ITEM_LENGTH = 32

def build_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[BATCH_SIZE, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

checkpoint_dir = './training_checkpoints-4' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

In [16]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=128, rnn_units=1024)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (1536, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (1536, None, 128)         518144    
_________________________________________________________________
gru_11 (GRU)                 (1536, None, 1024)        3545088   
_________________________________________________________________
gru_12 (GRU)                 (1536, None, 1024)        6297600   
_________________________________________________________________
dense_3 (Dense)              (1536, None, 4048)        4149200   
Total params: 14,510,032
Trainable params: 14,510,032
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), epochs=5, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2f679710448>

In [18]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), epochs=5, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2f7f58bfcc8>

In [19]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), callbacks=[checkpoint_callback, model_state_resetter_callback])



<tensorflow.python.keras.callbacks.History at 0x2f7f6c87f48>

In [20]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), callbacks=[checkpoint_callback, model_state_resetter_callback])



<tensorflow.python.keras.callbacks.History at 0x2f7f58bf888>

До тук добре. Числото изглежда много подобно на това, което сме виждали в предишни тетрадки с различен код, но при идентични параметри.

Да видим до каква компресия ще доведе този `loss`.

In [21]:
def build_predicting_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, stateful=True, return_sequences=True),
        tf.keras.layers.GRU(rnn_units, stateful=True, return_sequences=True),
        tf.keras.layers.Dense(vocab_size),
    ])

model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=128, rnn_units=1024)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [22]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [23]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [24]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        article = subword_text_encoder.decode(encoded_article)
        raw = len(article) * 8
        if raw == 0:
            continue
        compressed, _ = huffman_archive_size(model, encoded_article)
        total_raw += raw
        total_compressed += compressed

        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 1000:	Length: 200	Compression: 0.290000	Avg Compression: 0.290000
Article 2000:	Length: 168	Compression: 0.291667	Avg Compression: 0.290761
Article 3000:	Length: 200	Compression: 0.385000	Avg Compression: 0.323944
Article 4000:	Length: 168	Compression: 0.434524	Avg Compression: 0.349185
Article 5000:	Length: 232	Compression: 0.267241	Avg Compression: 0.329545
Article 6000:	Length: 200	Compression: 0.310000	Avg Compression: 0.326199
Article 7000:	Length: 192	Compression: 0.333333	Avg Compression: 0.327206
Article 8000:	Length: 192	Compression: 0.354167	Avg Compression: 0.330541
Article 9000:	Length: 192	Compression: 0.286458	Avg Compression: 0.325688
Article 10000:	Length: 176	Compression: 0.443182	Avg Compression: 0.336458
Article 11000:	Length: 192	Compression: 0.427083	Avg Compression: 0.344697
Article 12000:	Length: 160	Compression: 0.543750	Avg Compression: 0.358715
Article 13000:	Length: 192	Compression: 0.401042	Avg Compression: 0.362013
Article 14000:	Length: 264	Compres

Article 110000:	Length: 14384	Compression: 0.137653	Avg Compression: 0.174081
Article 111000:	Length: 17920	Compression: 0.168304	Avg Compression: 0.173869
Article 112000:	Length: 14072	Compression: 0.115833	Avg Compression: 0.172246
Article 113000:	Length: 20200	Compression: 0.061584	Avg Compression: 0.167974
Article 114000:	Length: 20384	Compression: 0.062402	Avg Compression: 0.164015
Article 115000:	Length: 20704	Compression: 0.068103	Avg Compression: 0.160497
Article 116000:	Length: 20632	Compression: 0.066644	Avg Compression: 0.157187
Article 117000:	Length: 16792	Compression: 0.190984	Avg Compression: 0.158130
Article 118000:	Length: 20896	Compression: 0.069248	Avg Compression: 0.155147
Article 119000:	Length: 21136	Compression: 0.062737	Avg Compression: 0.152113
Article 120000:	Length: 18768	Compression: 0.210358	Avg Compression: 0.153763
Article 121000:	Length: 21112	Compression: 0.067260	Avg Compression: 0.151092
Article 122000:	Length: 21496	Compression: 0.066710	Avg Compress

Article 215000:	Length: 199728	Compression: 0.189227	Avg Compression: 0.168575
Article 216000:	Length: 210688	Compression: 0.170133	Avg Compression: 0.168632
Article 217000:	Length: 243208	Compression: 0.182634	Avg Compression: 0.169195
Article 218000:	Length: 259328	Compression: 0.178049	Avg Compression: 0.169559
Article 219000:	Length: 348592	Compression: 0.192601	Avg Compression: 0.170767
Article 220000:	Length: 461312	Compression: 0.217889	Avg Compression: 0.173823
