In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
def articles(path='page_revisions_text', chunk_size=1024 * 1024):
    """Lazily yield the NUL-separated articles stored in a binary file.

    The file is consumed in fixed-size chunks, so it never has to fit in
    memory. An article that straddles a chunk boundary is stitched back
    together before it is yielded.

    Args:
        path: File to read. Defaults to the corpus file used by this notebook.
        chunk_size: Number of bytes requested per read; must be positive.

    Yields:
        bytes: Each record without its NUL terminator. Empty records between
        two consecutive NUL bytes are yielded as empty byte strings. A final
        record that has no terminator is yielded only when it is non-empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive')

    pending = b''
    with open(path, 'rb') as text_file:
        while True:
            chunk = text_file.read(chunk_size)
            if not chunk:
                break

            pieces = chunk.split(b'\0')
            # The first piece continues whatever was left over from the previous chunk.
            pieces[0] = pending + pieces[0]
            # The last piece may be cut off by the chunk boundary: hold it back.
            pending = pieces.pop()
            yield from pieces

    if pending:
        yield pending

In [3]:
# Restore the subword tokenizer from the saved 'vocab_4096' vocabulary file.
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

Да видим как би изглеждало обучение с кодираните статии...

In [4]:
import os

import numpy as np
import itertools

In [5]:
BATCH_SIZE = 128  # number of articles stacked into one padded batch
BATCHED_ITEM_LENGTH = 256  # step (in tokens) between consecutive windows cut from a padded batch
BUFFER_SIZE = 256  # shuffle buffer size passed to Dataset.shuffle
TYPE=np.int16  # dtype used for every encoded token array

def articles_generator(article_limit=2000):
    """Yield encoded articles, padded in count to a multiple of BATCH_SIZE.

    Each article gets its NUL terminator re-appended before encoding. After
    the real articles, single-token ``[0]`` filler arrays are emitted until
    the total item count is a multiple of BATCH_SIZE, so that batching with
    ``drop_remainder=True`` does not discard any real article.

    Args:
        article_limit: Maximum number of articles taken from ``articles()``.
            ``None`` means no limit.

    Yields:
        numpy.ndarray: 1-D array of token ids with dtype ``TYPE``.
    """
    produced = 0
    for article in itertools.islice(articles(), 0, article_limit):
        yield np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
        produced += 1

    # Counting explicitly (instead of reusing the loop index) keeps this
    # working when the corpus is empty: zero articles need zero fillers.
    filler_count = -produced % BATCH_SIZE
    for _ in range(filler_count):
        yield np.array([0], dtype=TYPE)

def subbatches():
    """Yield overlapping fixed-width windows cut from padded article batches.

    Encoded articles are shuffled and padded into batches of BATCH_SIZE rows.
    Each padded batch is then walked left to right in steps of
    BATCHED_ITEM_LENGTH columns. Every window is one column wider than the
    step, so consecutive windows share exactly one column.

    Yields:
        numpy.ndarray: Array of shape ``(BATCH_SIZE, w)`` with
        ``2 <= w <= BATCHED_ITEM_LENGTH + 1``.
    """
    padded_batches = (
        tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
        .shuffle(BUFFER_SIZE)
        .padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)
    )

    for padded in padded_batches.as_numpy_iterator():
        total_width = padded.shape[1]
        # Stop once fewer than two columns are left: a one-column window
        # cannot be split into an (input, target) pair.
        for start in range(0, total_width - 1, BATCHED_ITEM_LENGTH):
            yield padded[:, start:start + BATCHED_ITEM_LENGTH + 1]

# Wrap the window generator in a dataset whose elements have BATCH_SIZE rows
# and a variable number of columns.
dataset = tf.data.Dataset.from_generator(subbatches, output_types=TYPE, output_shapes=(BATCH_SIZE, None))
# Split every window into (inputs, targets): the targets are the same window
# shifted one column to the left.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((128, None), (128, None)), types: (tf.int16, tf.int16)>

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the stateful two-layer LSTM language model.

    Args:
        vocab_size: Number of token ids; sizes both the embedding table and
            the output layer.
        embedding_dim: Width of the token embeddings.
        rnn_units: Number of units in each LSTM layer.
        batch_size: Fixed batch dimension baked into the input shape (the
            LSTM layers are stateful).

    Returns:
        tf.keras.Sequential: Uncompiled model whose final Dense layer has no
        activation, i.e. one raw score per vocabulary entry and time step.
    """
    # NOTE(review): Masking is applied to the integer token ids before the
    # Embedding layer. Confirm against the Keras masking guide that this mask
    # actually reaches the LSTM layers.

    def stateful_lstm():
        return tf.keras.layers.LSTM(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )

    layers = [
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        stateful_lstm(),
        stateful_lstm(),
        tf.keras.layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(layers)

In [17]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw (pre-softmax) scores."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    """Report the size of axis 1 of the label tensor; predictions are ignored.

    ModelStateResetter reads this value from the training logs under the
    function's name, so the name must not change.
    """
    label_shape = tf.shape(true_labels)
    return label_shape[1]

# Build the training model with BATCH_SIZE rows per batch and compile it with
# the custom loss and the sequence-length metric defined above.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=BATCH_SIZE)
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [18]:
checkpoint_dir = './training_checkpoints' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

# Only the weights are saved, not the full model.
# NOTE(review): the training cells call fit() once per loop iteration, so the
# {epoch} placeholder presumably formats to the same value on every call and
# the checkpoint file is overwritten each time. Confirm this is intended.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [19]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    """Clears the model's recurrent state when a shortened window is seen.

    The 'average_batch_length' entry in the logs is treated as the running
    mean of the window lengths seen so far in the current epoch. Multiplying
    it by the number of batches recovers the cumulative length, and the
    difference from the previous cumulative length is the length of the
    window that just finished. A window shorter than BATCHED_ITEM_LENGTH is
    treated as the tail of a padded batch, so the state is reset before the
    next window.
    """

    def __init__(self):
        super().__init__()
        self.last_total_length = 0

    def on_epoch_begin(self, epoch, logs=None):
        """Restart the length bookkeeping and begin the epoch with a clean state."""
        # The batch index restarts at 0 with every epoch (and every fit()
        # call), so the cumulative length has to restart as well. Otherwise
        # the first batch would yield a negative length and trigger a
        # spurious reset after it.
        self.last_total_length = 0
        # State carried over from the previous epoch belongs to other articles.
        self.model.reset_states()

    def on_batch_end(self, batch, logs=None):
        """Reset the recurrent state if the window just processed was a short one."""
        logs = logs or {}
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length

        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
# Single callback instance shared by every fit() call in this notebook.
model_state_resetter_callback = ModelStateResetter()

In [20]:
total_epochs = 10

# Call fit() once per loop iteration, printing a hand-rolled progress line
# before each call.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
total_epochs = 7

# Continue training the same model: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [22]:
# Read the whole corpus into memory just long enough to pick out one article.
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

article = data.split(b'\0')[120]  # record at index 120 of the NUL-separated corpus
del data  # release the full file contents

# Encode the article with its NUL terminator re-appended, using the same
# expression as articles_generator.
encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

# Compare the article's byte count with its token count.
print('Raw:', len(article))
print('Encoded:', len(encoded_article))

Raw: 25541
Encoded: 8222


In [23]:
import ctypes

class Huffman:
    """Thin ctypes wrapper around the native Huffman-tree library.

    The shared library is loaded once, when the class body executes, and is
    shared by all instances. Each instance owns one native tree, created in
    ``__init__`` and destroyed in ``__del__``.
    """

    huffman = ctypes.CDLL('x64/Release/huffman')

    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p

    def __init__(self, category_count):
        """Create a native tree for ``category_count`` categories.

        Raises:
            RuntimeError: If the library returns a null tree handle.
        """
        # Assigned first so that __del__ is safe even if creation fails below.
        self.tree = None
        self.category_count = category_count
        self._weights = None

        handle = self.huffman.create_tree(category_count)
        if not handle:
            raise RuntimeError('create_tree returned a null tree handle')
        # Wrapped explicitly so the handle is passed as a full-width pointer.
        self.tree = ctypes.c_void_p(handle)

    def __del__(self):
        """Destroy the native tree, at most once."""
        tree = getattr(self, 'tree', None)
        if tree is not None:
            self.tree = None
            self.huffman.destroy_tree(tree)

    def load_weights(self, weights):
        """Hand one weight per category to the native tree.

        Args:
            weights: Array-like of weights. It is converted to a C-contiguous
                float32 buffer when it is not one already, because the
                library receives a raw ``float*``.

        Raises:
            ValueError: If fewer than ``category_count`` weights are supplied.
        """
        buffer = np.ascontiguousarray(weights, dtype=np.float32)
        # No length is passed to the library, so presumably it reads
        # category_count floats. TODO confirm against the native source.
        if buffer.size < self.category_count:
            raise ValueError(
                'expected at least %d weights, got %d' % (self.category_count, buffer.size))
        # Keep the buffer alive in case the library retains the pointer.
        self._weights = buffer
        self.huffman.load_weights(self.tree, buffer.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))

    def get_code_length(self, category):
        """Return the library's code length for ``category`` under the loaded weights."""
        # int() lets numpy integer scalars through; ctypes rejects them as-is.
        return self.huffman.get_code_length(self.tree, int(category))

    def get_code_zero_count(self, category):
        """Return the library's zero count for the code of ``category``."""
        return self.huffman.get_code_zero_count(self.tree, int(category))

In [24]:
def huffman_archive_size(model, text):
    """Sum the Huffman code lengths of a token sequence under the model's predictions.

    Before each token, the model's softmax output for the next position is
    loaded into a Huffman tree as weights. The code length and zero count of
    the actual token are then accumulated, and that token becomes the next
    model input.

    Args:
        model: Stateful model built with batch size 1; its recurrent state is
            reset before the first prediction.
        text: Iterable of token ids (numpy integers or plain ints).

    Returns:
        tuple: ``(archived_size, zeros)``, the summed code lengths and the
        summed zero counts. The lengths are presumably in bits, since callers
        compare them with byte counts multiplied by 8. TODO confirm against
        the native library.
    """
    archived_size = 0
    zeros = 0
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    model.reset_states()
    # The first prediction is conditioned on token id 0.
    input_eval = np.array([[0]], dtype=TYPE)

    for token in text:
        token = int(token)

        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)  # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(token)
        archived_size += huffman_tree.get_code_length(token)

        # Feed the real token back in, in the same form as the seed input.
        input_eval = np.array([[token]], dtype=TYPE)

    return archived_size, zeros

In [25]:
# Look the checkpoint up once and fail with a clear message if none exists,
# instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

# Rebuild the network with a batch size of 1 for token-by-token evaluation and
# restore the trained weights into it.
model = build_model(vocab_size=subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [27]:
total_raw = 0
total_compressed = 0

# Measure the first 120 articles of the corpus.
for index, article in enumerate(itertools.islice(articles(), 0, 120)):
    raw = (len(article) + 1) * 8  # article bytes plus the NUL terminator, times 8
    encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
    compressed, _ = huffman_archive_size(model, encoded_article)  # the zero count is not used here
    total_raw += raw
    total_compressed += compressed
    # Per-article ratio, then the running ratio over everything measured so far.
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, compressed/raw, total_compressed/total_raw))

Article 0:	Compression: 0.430556	Avg Compression: 0.430556
Article 1:	Compression: 0.255952	Avg Compression: 0.308333
Article 2:	Compression: 0.216837	Avg Compression: 0.267202
Article 3:	Compression: 0.291667	Avg Compression: 0.272482
Article 4:	Compression: 0.314286	Avg Compression: 0.280891
Article 5:	Compression: 0.185897	Avg Compression: 0.263498
Article 6:	Compression: 0.193331	Avg Compression: 0.193585
Article 7:	Compression: 0.192568	Avg Compression: 0.193585
Article 8:	Compression: 0.200000	Avg Compression: 0.193589
Article 9:	Compression: 0.181548	Avg Compression: 0.193580
Article 10:	Compression: 0.220395	Avg Compression: 0.193598
Article 11:	Compression: 0.170455	Avg Compression: 0.193580
Article 12:	Compression: 0.176136	Avg Compression: 0.193567
Article 13:	Compression: 0.203947	Avg Compression: 0.193574
Article 14:	Compression: 0.178191	Avg Compression: 0.193562
Article 15:	Compression: 0.317708	Avg Compression: 0.193612
Article 16:	Compression: 0.282143	Avg Compression:

Това определено изглежда доста по-разумно. При това със стойност за loss около `0.32`. Да видим колко компресия можем да постигнем след още трениране...

In [28]:
# Switch back to a BATCH_SIZE-row model for further training: rebuild it,
# restore the newest checkpoint's weights and compile it again.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=BATCH_SIZE)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [29]:
total_epochs = 10

# Resume training on the reloaded model: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
total_epochs = 8

# Further training round: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [31]:
total_epochs = 5

# Further training round: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
total_epochs = 5

# Further training round: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
total_epochs = 5

# Further training round: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
# One additional fit() call with the same dataset and callbacks. The returned
# History object is the cell's displayed value.
model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])



<tensorflow.python.keras.callbacks.History at 0x2731c070c08>

In [35]:
# One additional fit() call. The trailing semicolon suppresses the History repr.
model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback]);



In [36]:
total_epochs = 3

# Final training round: one fit() call per loop iteration.
for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


Нека видим какво е научила невронната мрежа...

In [38]:
# Look the checkpoint up once and fail with a clear message if none exists,
# instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

# Rebuild the network with a batch size of 1 for token-by-token evaluation and
# restore the most recently trained weights into it.
model = build_model(vocab_size=subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [39]:
total_raw = 0
total_compressed = 0

# Repeat the measurement on the first 120 articles with the newly loaded weights.
for index, article in enumerate(itertools.islice(articles(), 0, 120)):
    raw = (len(article) + 1) * 8  # article bytes plus the NUL terminator, times 8
    encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
    compressed, _ = huffman_archive_size(model, encoded_article)  # the zero count is not used here
    total_raw += raw
    total_compressed += compressed
    # Per-article ratio, then the running ratio over everything measured so far.
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, compressed/raw, total_compressed/total_raw))

Article 0:	Compression: 0.604167	Avg Compression: 0.604167
Article 1:	Compression: 0.372024	Avg Compression: 0.441667
Article 2:	Compression: 0.295918	Avg Compression: 0.376147
Article 3:	Compression: 0.412500	Avg Compression: 0.383993
Article 4:	Compression: 0.425000	Avg Compression: 0.392241
Article 5:	Compression: 0.272436	Avg Compression: 0.370305
Article 6:	Compression: 0.131288	Avg Compression: 0.132154
Article 7:	Compression: 0.277027	Avg Compression: 0.132245
Article 8:	Compression: 0.300000	Avg Compression: 0.132359
Article 9:	Compression: 0.267857	Avg Compression: 0.132456
Article 10:	Compression: 0.328947	Avg Compression: 0.132582
Article 11:	Compression: 0.272727	Avg Compression: 0.132687
Article 12:	Compression: 0.250000	Avg Compression: 0.132774
Article 13:	Compression: 0.302632	Avg Compression: 0.132884
Article 14:	Compression: 0.263298	Avg Compression: 0.132987
Article 15:	Compression: 0.458333	Avg Compression: 0.133119
Article 16:	Compression: 0.428571	Avg Compression:

Може да е само илюзия, но изглежда, че компресията става по-добра с напредването на статиите. Нека го проверим, гледайки компресирания размер на последните 100 статии, използвани за обучение.

In [41]:
total_raw = 0
total_compressed = 0

# Measure articles 1900-1999 (zero-based), the last 100 of the 2000 consumed
# by articles_generator. `index` restarts at 0 here, so the printed article
# numbers are relative to this slice.
for index, article in enumerate(itertools.islice(articles(), 1900, 2000)):
    raw = (len(article) + 1) * 8  # article bytes plus the NUL terminator, times 8
    encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
    compressed, _ = huffman_archive_size(model, encoded_article)  # the zero count is not used here
    total_raw += raw
    total_compressed += compressed
    # Per-article ratio, then the running ratio over everything measured so far.
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, compressed/raw, total_compressed/total_raw))

Article 0:	Compression: 0.113915	Avg Compression: 0.113915
Article 1:	Compression: 0.111272	Avg Compression: 0.111688
Article 2:	Compression: 0.136296	Avg Compression: 0.119879
Article 3:	Compression: 0.099103	Avg Compression: 0.101286
Article 4:	Compression: 0.123359	Avg Compression: 0.102694
Article 5:	Compression: 0.113522	Avg Compression: 0.103155
Article 6:	Compression: 0.128168	Avg Compression: 0.105730
Article 7:	Compression: 0.111517	Avg Compression: 0.106563
Article 8:	Compression: 0.133822	Avg Compression: 0.110462
Article 9:	Compression: 0.545455	Avg Compression: 0.110580
Article 10:	Compression: 0.128954	Avg Compression: 0.110909
Article 11:	Compression: 0.137648	Avg Compression: 0.111675
Article 12:	Compression: 0.200331	Avg Compression: 0.111987
Article 13:	Compression: 0.119649	Avg Compression: 0.112366
Article 14:	Compression: 0.102357	Avg Compression: 0.112189
Article 15:	Compression: 0.097865	Avg Compression: 0.111682
Article 16:	Compression: 0.094076	Avg Compression:

Да, определено. Трябва да се поработи над dataset-а.