In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = sorted(set(data.split(b'\0')), key=len)
        self.articles = [np.array(list(article), dtype=TYPE) for article in self.articles]

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [4]:
articles = Articles('page_revisions_code')

In [5]:
steps = 13

for i in range(steps):
    batch_size = 4 * 2**i
    batch_item_length = 3 * 2**(steps - i - 1)
    count = articles.steps(batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      4	 batch item length: 12288	steps per epoch: 108880
batch size:      8	 batch item length: 6144	steps per epoch:  62536
batch size:     16	 batch item length: 3072	steps per epoch:  39720
batch size:     32	 batch item length: 1536	steps per epoch:  29133
batch size:     64	 batch item length:  768	steps per epoch:  24202
batch size:    128	 batch item length:  384	steps per epoch:  22296
batch size:    256	 batch item length:  192	steps per epoch:  22744
batch size:    512	 batch item length:   96	steps per epoch:  25706
batch size:   1024	 batch item length:   48	steps per epoch:  32707
batch size:   2048	 batch item length:   24	steps per epoch:  47309
batch size:   4096	 batch item length:   12	steps per epoch:  76958
batch size:   8192	 batch item length:    6	steps per epoch: 136758
batch size:  16384	 batch item length:    3	steps per epoch: 257153


In [6]:
model = Model(articles, './training_checkpoints-19',
              vocab_size=73,
              embedding_dim=16,
              rnn_units=1024)

In [7]:
model.training_model(256, 256).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 256)                0         
_________________________________________________________________
embedding (Embedding)        (256, 256, 16)            1168      
_________________________________________________________________
lstm (LSTM)                  (256, 256, 1024)          4263936   
_________________________________________________________________
lstm_1 (LSTM)                (256, 256, 1024)          8392704   
_________________________________________________________________
dense (Dense)                (256, 256, 73)            74825     
Total params: 12,732,633
Trainable params: 12,732,633
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.train(256, 256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5


In [None]:
model.train(256, 256, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Да опитаме с GRU

In [6]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [7]:
model = Model(articles, './training_checkpoints-20-new',
              vocab_size=72,
              embedding_dim=16,
              rnn_units=1024)

In [8]:
model.training_model(256, 256).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 256)                0         
_________________________________________________________________
embedding (Embedding)        (256, 256, 16)            1152      
_________________________________________________________________
gru (GRU)                    (256, 256, 1024)          3201024   
_________________________________________________________________
gru_1 (GRU)                  (256, 256, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (256, 256, 72)            73800     
Total params: 9,573,576
Trainable params: 9,573,576
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.train(256, 256, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4


InternalError:  [_Derived_]  Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 3, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 1024, 1024, 1, 256, 256, 0] 
	 [[{{node gradients/CudnnRNN_grad/CudnnRNNBackprop}}]]
	 [[StatefulPartitionedCall]]
	 [[gradient_tape/sequential/embedding/embedding_lookup/Reshape/_24]] [Op:__inference_train_function_4529]

Function call stack:
train_function -> train_function -> train_function


In [9]:
model.train(256, 256, epochs=2)

Epoch 1/2
Epoch 2/2


In [10]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.category_count = category_count
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)
    
    def archive_size(self, model, text):
        archived_size = math.ceil(math.log2(self.category_count))
        input_eval = np.array([[text[0]]], dtype=TYPE)

        model.predicting_model.reset_states()

        for byte in text[1:]:
            predictions = model.predict(input_eval)
            predictions = tf.squeeze(predictions, 0) # remove the batch dimension

            weights = tf.nn.softmax(predictions[0]).numpy()
            self.load_weights(weights)
            archived_size += self.get_code_length(byte.item())

            input_eval = tf.expand_dims([byte], 0)

        return archived_size

In [12]:
total_raw = 0
total_compressed = 0

huffman = Huffman(72)
for index, article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 1000:	Length: 18	Avg Compression: 0.270833
Article 2000:	Length: 19	Avg Compression: 0.256757
Article 3000:	Length: 20	Avg Compression: 0.254386
Article 4000:	Length: 21	Avg Compression: 0.253205
Article 5000:	Length: 21	Avg Compression: 0.253788
Article 6000:	Length: 22	Avg Compression: 0.253099
Article 7000:	Length: 22	Avg Compression: 0.256119
Article 8000:	Length: 23	Avg Compression: 0.251506
Article 9000:	Length: 23	Avg Compression: 0.250000
Article 10000:	Length: 24	Avg Compression: 0.248826
Article 11000:	Length: 24	Avg Compression: 0.252637
Article 12000:	Length: 25	Avg Compression: 0.250954
Article 13000:	Length: 25	Avg Compression: 0.251307
Article 14000:	Length: 26	Avg Compression: 0.249601
Article 15000:	Length: 26	Avg Compression: 0.254425
Article 16000:	Length: 27	Avg Compression: 0.253415
Article 17000:	Length: 27	Avg Compression: 0.258270
Article 18000:	Length: 27	Avg Compression: 0.256845
Article 19000:	Length: 28	Avg Compression: 0.265346
Article 20000:	Length

Article 155000:	Length: 3628	Avg Compression: 0.215053
Article 156000:	Length: 3687	Avg Compression: 0.215683
Article 157000:	Length: 3753	Avg Compression: 0.214713
Article 158000:	Length: 3822	Avg Compression: 0.214198
Article 159000:	Length: 3892	Avg Compression: 0.214388
Article 160000:	Length: 3967	Avg Compression: 0.214943
Article 161000:	Length: 4040	Avg Compression: 0.214271
Article 162000:	Length: 4121	Avg Compression: 0.214468
Article 163000:	Length: 4201	Avg Compression: 0.215267
Article 164000:	Length: 4285	Avg Compression: 0.215996
Article 165000:	Length: 4373	Avg Compression: 0.216029
Article 166000:	Length: 4462	Avg Compression: 0.216042
Article 167000:	Length: 4556	Avg Compression: 0.216277
Article 168000:	Length: 4646	Avg Compression: 0.217735
Article 169000:	Length: 4743	Avg Compression: 0.218487
Article 170000:	Length: 4851	Avg Compression: 0.218739
Article 171000:	Length: 4956	Avg Compression: 0.219318
Article 172000:	Length: 5075	Avg Compression: 0.218677
Article 17

## Да опитаме без embedding layer

In [19]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.LSTM(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.LSTM(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [20]:
model = Model(articles, './training_checkpoints-21', vocab_size=72, rnn_units=1024)

In [21]:
model.training_model(256, 256).summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_4 (Masking)          (256, 256)                0         
_________________________________________________________________
onehot (Embedding)           (256, 256, 72)            5184      
_________________________________________________________________
lstm_2 (LSTM)                (256, 256, 1024)          4493312   
_________________________________________________________________
lstm_3 (LSTM)                (256, 256, 1024)          8392704   
_________________________________________________________________
dense_3 (Dense)              (256, 256, 72)            73800     
Total params: 12,965,000
Trainable params: 12,959,816
Non-trainable params: 5,184
_________________________________________________________________


In [22]:
model.train(256, 256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
total_raw = 0
total_compressed = 0

huffman = Huffman(72)
for index, article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 1000:	Length: 18	Avg Compression: 0.270833
Article 2000:	Length: 19	Avg Compression: 0.280405
Article 3000:	Length: 20	Avg Compression: 0.296053
Article 4000:	Length: 21	Avg Compression: 0.285256
Article 5000:	Length: 21	Avg Compression: 0.281566
Article 6000:	Length: 22	Avg Compression: 0.273760
Article 7000:	Length: 22	Avg Compression: 0.278846
Article 8000:	Length: 23	Avg Compression: 0.270331
Article 9000:	Length: 23	Avg Compression: 0.269841
Article 10000:	Length: 24	Avg Compression: 0.265845
Article 11000:	Length: 24	Avg Compression: 0.264768
Article 12000:	Length: 25	Avg Compression: 0.264790
Article 13000:	Length: 25	Avg Compression: 0.264808
Article 14000:	Length: 26	Avg Compression: 0.263179
Article 15000:	Length: 26	Avg Compression: 0.270280
Article 16000:	Length: 27	Avg Compression: 0.267418
Article 17000:	Length: 27	Avg Compression: 0.270674
Article 18000:	Length: 27	Avg Compression: 0.270238
Article 19000:	Length: 28	Avg Compression: 0.276507
Article 20000:	Length

Article 155000:	Length: 3628	Avg Compression: 0.225310
Article 156000:	Length: 3687	Avg Compression: 0.225934
Article 157000:	Length: 3753	Avg Compression: 0.224945
Article 158000:	Length: 3822	Avg Compression: 0.224373
Article 159000:	Length: 3892	Avg Compression: 0.224605
Article 160000:	Length: 3967	Avg Compression: 0.225246
Article 161000:	Length: 4040	Avg Compression: 0.224518
Article 162000:	Length: 4121	Avg Compression: 0.224684
Article 163000:	Length: 4201	Avg Compression: 0.225479
Article 164000:	Length: 4285	Avg Compression: 0.226165
Article 165000:	Length: 4373	Avg Compression: 0.226335
Article 166000:	Length: 4462	Avg Compression: 0.226452
Article 167000:	Length: 4556	Avg Compression: 0.226725
Article 168000:	Length: 4646	Avg Compression: 0.228203
Article 169000:	Length: 4743	Avg Compression: 0.228899
Article 170000:	Length: 4851	Avg Compression: 0.229095
Article 171000:	Length: 4956	Avg Compression: 0.229669
Article 172000:	Length: 5075	Avg Compression: 0.228992
Article 17

## GRU без embedding слой

In [6]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [29]:
model = Model(articles, './training_checkpoints-22', vocab_size=72, rnn_units=1024)
model.training_model(256, 256).summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_7 (Masking)          (256, 256)                0         
_________________________________________________________________
onehot (Embedding)           (256, 256, 72)            5184      
_________________________________________________________________
gru_4 (GRU)                  (256, 256, 1024)          3373056   
_________________________________________________________________
gru_5 (GRU)                  (256, 256, 1024)          6297600   
_________________________________________________________________
dense_5 (Dense)              (256, 256, 72)            73800     
Total params: 9,749,640
Trainable params: 9,744,456
Non-trainable params: 5,184
_________________________________________________________________


In [30]:
model.train(256, 256, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [7]:
model = Model(articles, './training_checkpoints-23', vocab_size=72, rnn_units=1024)
model.training_model(256, 256).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 256)                0         
_________________________________________________________________
onehot (Embedding)           (256, 256, 72)            5184      
_________________________________________________________________
gru (GRU)                    (256, 256, 1024)          3373056   
_________________________________________________________________
gru_1 (GRU)                  (256, 256, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (256, 256, 72)            73800     
Total params: 9,749,640
Trainable params: 9,744,456
Non-trainable params: 5,184
_________________________________________________________________


In [32]:
model.train(256, 256, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
    1/16559 [..............................] - ETA: 4:58:35

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [8]:
model.train(256, 256)



In [11]:
total_raw = 0
total_compressed = 0

huffman = Huffman(72)
for index, article in enumerate(articles.articles_generator(1)):
    if index % 1000 == 0:
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 1000:	Length: 18	Avg Compression: 0.305556
Article 2000:	Length: 19	Avg Compression: 0.314189
Article 3000:	Length: 20	Avg Compression: 0.315789
Article 4000:	Length: 21	Avg Compression: 0.350962
Article 5000:	Length: 21	Avg Compression: 0.351010
Article 6000:	Length: 22	Avg Compression: 0.335744
Article 7000:	Length: 22	Avg Compression: 0.329545
Article 8000:	Length: 23	Avg Compression: 0.323795
Article 9000:	Length: 23	Avg Compression: 0.316799
Article 10000:	Length: 24	Avg Compression: 0.323357
Article 11000:	Length: 24	Avg Compression: 0.319620
Article 12000:	Length: 25	Avg Compression: 0.316317
Article 13000:	Length: 25	Avg Compression: 0.310976
Article 14000:	Length: 26	Avg Compression: 0.305911
Article 15000:	Length: 26	Avg Compression: 0.301991
Article 16000:	Length: 27	Avg Compression: 0.300888
Article 17000:	Length: 27	Avg Compression: 0.302163
Article 18000:	Length: 27	Avg Compression: 0.297917
Article 19000:	Length: 28	Avg Compression: 0.296038
Article 20000:	Length

Article 155000:	Length: 3628	Avg Compression: 0.208545
Article 156000:	Length: 3687	Avg Compression: 0.209015
Article 157000:	Length: 3753	Avg Compression: 0.209741
Article 158000:	Length: 3822	Avg Compression: 0.209137
Article 159000:	Length: 3892	Avg Compression: 0.209859
Article 160000:	Length: 3967	Avg Compression: 0.210367
Article 161000:	Length: 4040	Avg Compression: 0.210346
Article 162000:	Length: 4121	Avg Compression: 0.211201
Article 163000:	Length: 4201	Avg Compression: 0.211453
Article 164000:	Length: 4285	Avg Compression: 0.211825
Article 165000:	Length: 4373	Avg Compression: 0.212595
Article 166000:	Length: 4462	Avg Compression: 0.213179
Article 167000:	Length: 4556	Avg Compression: 0.214475
Article 168000:	Length: 4646	Avg Compression: 0.215261
Article 169000:	Length: 4743	Avg Compression: 0.215590
Article 170000:	Length: 4851	Avg Compression: 0.215449
Article 171000:	Length: 4956	Avg Compression: 0.215686
Article 172000:	Length: 5075	Avg Compression: 0.215918
Article 17

In [12]:
model.train(256, 256, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
model.train(256, 256, epochs=3)

Epoch 1/3
Epoch 2/3
    1/16559 [..............................] - ETA: 4:59:41

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [8]:
model.train(256, 256, epochs=2)

Epoch 1/2
Epoch 2/2


In [9]:
model.train(256, 256, epochs=2)

Epoch 1/2
Epoch 2/2
    1/16559 [..............................] - ETA: 6:56:26

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [8]:
model.train(192, 256, epochs=2)

Epoch 1/2
Epoch 2/2
    1/21170 [..............................] - ETA: 4:36:36

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [8]:
model.train(192, 256, epochs=1)



## Повече слоеве

In [10]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._vocab_size, embeddings_initializer='identity', trainable=False, name='onehot'),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [14]:
model = Model(articles, './training_checkpoints-24', vocab_size=72, rnn_units=768)
model.training_model(192, 256).summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (192, 256)                0         
_________________________________________________________________
onehot (Embedding)           (192, 256, 72)            5184      
_________________________________________________________________
gru_4 (GRU)                  (192, 256, 768)           1939968   
_________________________________________________________________
gru_5 (GRU)                  (192, 256, 768)           3543552   
_________________________________________________________________
gru_6 (GRU)                  (192, 256, 768)           3543552   
_________________________________________________________________
gru_7 (GRU)                  (192, 256, 768)           3543552   
_________________________________________________________________
dense_1 (Dense)              (192, 256, 72)           

In [15]:
model.train(192, 256, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
    8/21170 [..............................] - ETA: 4:47:11 - loss: nan - average_final_batch_ratio: 1.0000

KeyboardInterrupt: 