In [1]:
import html
import re

import tensorflow_datasets as tfds

class SubalphabetSubwordTextEncoder:
    HTML_ESCAPE_PATTERN = re.compile('&(((#\d+)|([A-Za-z]+));&?)+')
    SPECIAL_CHAR = 'X'

    def __init__(self, subalphabet, vocab_list=None):
        self._subword_text_encoder = tfds.features.text.SubwordTextEncoder(vocab_list)
        self._subalphabet = subalphabet

    def encode(self, s):
        subword_max_code = len(self.subwords)
        codes = self._build_encoding_dict(self._subalphabet)
        preprocessed_string = self.preprocess_string(s)

        result = []
        unicode_buffer = bytearray()
        for id in self._subword_text_encoder.encode(preprocessed_string):
            if id <= subword_max_code:
                result.append(id)
            else:
                unicode_buffer.append(id - subword_max_code - 1)
                try:
                    id = ord(unicode_buffer.decode())
                    result.append(subword_max_code + codes[id] + 1)
                    unicode_buffer = bytearray()
                except UnicodeDecodeError:
                    pass
        return result

    def decode(self, ids):
        subword_max_code = len(self.subwords)

        processed_ids = []
        for id in ids:
            if id <= subword_max_code:
                processed_ids.append(id)
            else:
                char = self._subalphabet[id - subword_max_code - 1].encode()
                processed_ids += [codepoint + subword_max_code + 1 for codepoint in char]

        return self._subword_text_encoder.decode(processed_ids)
    
    @classmethod
    def build_from_corpus(cls, corpus_generator, subalphabet_size, target_vocab_size, max_subword_length=20, max_corpus_chars=None, reserved_tokens=None):
        simplified_strings = []
        char_counts = defaultdict(lambda: 0)

        for string in corpus_generator:
            simplified_string = cls._unescape_string(string.lower())
            simplified_strings.append(simplified_string)
            for char in simplified_string:
                char_counts[char] += 1

        sorted_char_counts = sorted(char_counts.items(), key = lambda item: item[1], reverse = True)
        chars = [char for char, char_count in sorted_char_counts[:subalphabet_size - 1]]
        chars = list(reversed(chars)) # We reverse the chars so that the most common ones are last
        chars = cls._ensure_special_character_presence(chars)

        def simplified_corpus():
            for string in simplified_strings:
                yield cls._compact_string(chars, string)

        subword_text_encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(simplified_corpus(), target_vocab_size + 256 - subalphabet_size)
        
        return cls(chars, subword_text_encoder.subwords)
    
    @classmethod
    def load_from_file(cls, filename_prefix):
        subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file(filename_prefix)
        with open(filename_prefix + '.subalphabet', 'rb') as text_file:
            subalphabet = ['\0', cls.SPECIAL_CHAR] + text_file.read().decode().split('\0')

        return cls(subalphabet, subword_text_encoder.subwords)
    
    def save_to_file(self, filename_prefix):
        self._subword_text_encoder.save_to_file(filename_prefix)
        with open(filename_prefix + '.subalphabet', 'wb') as text_file:
            text_file.write('\0'.join(self._subalphabet[2:]).encode())

    def preprocess_string(self, string):
        return self._compact_string(self._subalphabet, self._unescape_string(string.lower()))

    @property
    def vocab_size(self):
        return self._subword_text_encoder.vocab_size - 256 + len(self._subalphabet)
    
    @property
    def subwords(self):
        return self._subword_text_encoder.subwords

    @classmethod
    def _ensure_special_character_presence(cls, chars):
        if '\0' in chars:
            chars = [char for char in chars if char != '\0']
        else:
            chars = chars[1:]

        return ['\0', cls.SPECIAL_CHAR] + chars
    
    @classmethod
    def _unescape_html_symbol(cls, escaped):
        escaped_symbols = escaped.group(0).split(';')
        escaped_symbols = escaped_symbols[:-1] # strip the last string empty string
        escaped_symbols = (escaped_symbol + ';' for escaped_symbol in escaped_symbols)
        result = ''

        for escaped_symbol in escaped_symbols:
            code = escaped_symbol[1:] if escaped_symbol[0] == '&' else escaped_symbol

            if code[0] == '#':
                result += chr(int(code[1:-1]))
            elif code in html.entities.html5:
                result += html.entities.html5[code]
            else:
                result += escaped_symbol

        return result
    
    @classmethod
    def _unescape_string(cls, string):
        return cls.HTML_ESCAPE_PATTERN.sub(cls._unescape_html_symbol, string)
    
    @classmethod
    def _compact_string(cls, chars, string):
        if len(string) == 0:
            return string
        else:
            compacted_string = string[0]

            for char in string[1:]:
                if char not in chars or char == cls.SPECIAL_CHAR:
                    if compacted_string[-1] != cls.SPECIAL_CHAR:
                        compacted_string += cls.SPECIAL_CHAR
                else:
                    compacted_string += char

            return compacted_string
    
    @classmethod
    def _build_encoding_dict(cls, chars):
        return {ord(char): index for index, char in enumerate(chars)}

In [2]:
encoder = SubalphabetSubwordTextEncoder.load_from_file('72_1024')

In [3]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [4]:
TYPE=np.int16

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read().decode()

        self.articles = sorted(set(data.split('\0')[:20000]), key=len)
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            self._encoded_articles = [np.array(encoder.encode(article), dtype=TYPE) for article in self.articles]
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            if remaining.shape[1] == batch_length + 1:
                yield remaining
                yield np.zeros((batch_size, batch_length + 1), dtype=TYPE)
            else:
                yield np.hstack([remaining, np.zeros([batch_size, batch_length - remaining.shape[1] + 1])])

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length + 1) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, batch_length + 1))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [5]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_final_batch_ratio(true_labels, predictions):
    return 0 ** tf.math.abs(true_labels[-1, -1])

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        self.last_final_batch_count = 0
        
    def on_batch_end(self, batch, logs={}):
        average_final_batch_ratio = logs.get('average_final_batch_ratio', 0)
        final_batch_count = int(round(average_final_batch_ratio * (batch + 1)))
        is_final = final_batch_count - self.last_final_batch_count
        self.last_final_batch_count = final_batch_count
        
        if is_final:
            self.model.reset_states()

class Model:
    def __init__(self, articles, checkpoint_dir, vocab_size, embedding_dim, rnn_units):
        self._articles = articles
        self._batch_size = None
        self._batched_item_length = None
        self._training_model = None
        self._predicting_model = None
        self._vocab_size = vocab_size
        self._embedding_dim = embedding_dim
        self._rnn_units = rnn_units

        self._checkpoint_dir = checkpoint_dir
        self._checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

    def training_model(self, batch_size, batched_item_length):
        if self._training_model == None or batch_size != self._batch_size or batched_item_length != self._batched_item_length:
            self._batch_size = batch_size
            self._batched_item_length = batched_item_length
            self._training_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, batched_item_length]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.GRU(self._rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
                tf.keras.layers.Dense(self._vocab_size),
            ])

            if os.path.isdir(self._checkpoint_dir):
                self._training_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))

            self._training_model.compile(optimizer='adam', loss=loss, metrics=[average_final_batch_ratio])
            self._predicting_model = None
        
        return self._training_model

    @property
    def callbacks(self):
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=self._checkpoint_prefix, save_weights_only=True)
        model_state_resetter_callback = ModelStateResetter()
        
        return [checkpoint_callback, model_state_resetter_callback]
    
    def train(self, batch_size, batched_item_length, epochs=1):
        dataset = self._articles.dataset(batch_size, batched_item_length)

        model = self.training_model(batch_size, batched_item_length)

        model.fit(dataset, epochs=epochs, callbacks=self.callbacks)
    
    @property
    def predicting_model(self):
        if self._predicting_model == None:
            self._predicting_model = tf.keras.Sequential([
                tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
                tf.keras.layers.Embedding(self._vocab_size, self._embedding_dim),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.GRU(self._rnn_units, stateful=True, return_sequences=True),
                tf.keras.layers.Dense(self._vocab_size),
            ])
            
            self._predicting_model.load_weights(tf.train.latest_checkpoint(self._checkpoint_dir))
            self._training_model = None
        
        return self._predicting_model
    
    def predict(self, input_eval):
        return self.predicting_model(input_eval)

In [6]:
articles = Articles('page_revisions_text_with_title')

In [7]:
model = Model(articles, './training_checkpoints-25', vocab_size = encoder.vocab_size, embedding_dim=64, rnn_units=1024)

In [8]:
model.training_model(256, 192).summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (256, 192)                0         
_________________________________________________________________
embedding (Embedding)        (256, 192, 64)            64896     
_________________________________________________________________
gru (GRU)                    (256, 192, 1024)          3348480   
_________________________________________________________________
gru_1 (GRU)                  (256, 192, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (256, 192, 1014)          1039350   
Total params: 10,750,326
Trainable params: 10,750,326
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.category_count = category_count
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)
    
    def archive_size(self, model, text):
        archived_size = math.ceil(math.log2(self.category_count))
        input_eval = np.array([[text[0]]], dtype=TYPE)

        model.predicting_model.reset_states()

        for byte in text[1:]:
            predictions = model.predict(input_eval)
            predictions = tf.squeeze(predictions, 0) # remove the batch dimension

            weights = tf.nn.softmax(predictions[0]).numpy()
            self.load_weights(weights)
            archived_size += self.get_code_length(byte.item())

            input_eval = tf.expand_dims([byte], 0)

        return archived_size

In [15]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.572917
Article 100:	Length: 28	Avg Compression: 0.590625
Article 200:	Length: 30	Avg Compression: 0.580357
Article 300:	Length: 31	Avg Compression: 0.606436
Article 400:	Length: 32	Avg Compression: 0.612782
Article 500:	Length: 33	Avg Compression: 0.608434
Article 600:	Length: 34	Avg Compression: 0.613750
Article 700:	Length: 35	Avg Compression: 0.608511
Article 800:	Length: 36	Avg Compression: 0.613930
Article 900:	Length: 37	Avg Compression: 0.602679
Article 1000:	Length: 38	Avg Compression: 0.600072
Article 1100:	Length: 39	Avg Compression: 0.599026
Article 1200:	Length: 39	Avg Compression: 0.600236
Article 1300:	Length: 40	Avg Compression: 0.599946
Article 1400:	Length: 41	Avg Compression: 0.594802
Article 1500:	Length: 41	Avg Compression: 0.599359
Article 1600:	Length: 42	Avg Compression: 0.602466
Article 1700:	Length: 42	Avg Compression: 0.599405
Article 1800:	Length: 43	Avg Compression: 0.603269
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.520490
Article 15800:	Length: 12274	Avg Compression: 0.519436
Article 15900:	Length: 12522	Avg Compression: 0.519417
Article 16000:	Length: 12843	Avg Compression: 0.519216
Article 16100:	Length: 13022	Avg Compression: 0.519412
Article 16200:	Length: 13335	Avg Compression: 0.520001
Article 16300:	Length: 13582	Avg Compression: 0.520906
Article 16400:	Length: 13943	Avg Compression: 0.520872
Article 16500:	Length: 14159	Avg Compression: 0.521914
Article 16600:	Length: 14497	Avg Compression: 0.523003
Article 16700:	Length: 14766	Avg Compression: 0.522815
Article 16800:	Length: 15138	Avg Compression: 0.522954
Article 16900:	Length: 15589	Avg Compression: 0.523196
Article 17000:	Length: 15906	Avg Compression: 0.523986
Article 17100:	Length: 16145	Avg Compression: 0.523922
Article 17200:	Length: 16798	Avg Compression: 0.524581
Article 17300:	Length: 17120	Avg Compression: 0.524358
Article 17400:	Length: 17810	Avg Compression: 0.525381
Article 17

In [16]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
   1/1950 [..............................] - ETA: 29:51

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [11]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.395833
Article 100:	Length: 28	Avg Compression: 0.409375
Article 200:	Length: 30	Avg Compression: 0.398214
Article 300:	Length: 31	Avg Compression: 0.413366
Article 400:	Length: 32	Avg Compression: 0.403195
Article 500:	Length: 33	Avg Compression: 0.403614
Article 600:	Length: 34	Avg Compression: 0.413125
Article 700:	Length: 35	Avg Compression: 0.406915
Article 800:	Length: 36	Avg Compression: 0.412362
Article 900:	Length: 37	Avg Compression: 0.408685
Article 1000:	Length: 38	Avg Compression: 0.409321
Article 1100:	Length: 39	Avg Compression: 0.412013
Article 1200:	Length: 39	Avg Compression: 0.422759
Article 1300:	Length: 40	Avg Compression: 0.429957
Article 1400:	Length: 41	Avg Compression: 0.432673
Article 1500:	Length: 41	Avg Compression: 0.440705
Article 1600:	Length: 42	Avg Compression: 0.442815
Article 1700:	Length: 42	Avg Compression: 0.445238
Article 1800:	Length: 43	Avg Compression: 0.443165
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.345856
Article 15800:	Length: 12274	Avg Compression: 0.345071
Article 15900:	Length: 12522	Avg Compression: 0.345067
Article 16000:	Length: 12843	Avg Compression: 0.344023
Article 16100:	Length: 13022	Avg Compression: 0.343778
Article 16200:	Length: 13335	Avg Compression: 0.344051
Article 16300:	Length: 13582	Avg Compression: 0.343821
Article 16400:	Length: 13868	Avg Compression: 0.344475
Article 16500:	Length: 14159	Avg Compression: 0.344367
Article 16600:	Length: 14497	Avg Compression: 0.344150
Article 16700:	Length: 14766	Avg Compression: 0.344383
Article 16800:	Length: 15189	Avg Compression: 0.344072
Article 16900:	Length: 15589	Avg Compression: 0.344215
Article 17000:	Length: 15922	Avg Compression: 0.344016
Article 17100:	Length: 16145	Avg Compression: 0.343492
Article 17200:	Length: 16798	Avg Compression: 0.343322
Article 17300:	Length: 17120	Avg Compression: 0.342287
Article 17400:	Length: 17810	Avg Compression: 0.343090
Article 17

In [12]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.302083
Article 100:	Length: 28	Avg Compression: 0.253125
Article 200:	Length: 30	Avg Compression: 0.287500
Article 300:	Length: 31	Avg Compression: 0.295792
Article 400:	Length: 32	Avg Compression: 0.296992
Article 500:	Length: 33	Avg Compression: 0.292922
Article 600:	Length: 34	Avg Compression: 0.296875
Article 700:	Length: 35	Avg Compression: 0.296277
Article 800:	Length: 36	Avg Compression: 0.307196
Article 900:	Length: 37	Avg Compression: 0.302760
Article 1000:	Length: 38	Avg Compression: 0.305636
Article 1100:	Length: 39	Avg Compression: 0.302922
Article 1200:	Length: 39	Avg Compression: 0.317512
Article 1300:	Length: 40	Avg Compression: 0.323276
Article 1400:	Length: 41	Avg Compression: 0.325000
Article 1500:	Length: 41	Avg Compression: 0.336767
Article 1600:	Length: 42	Avg Compression: 0.342474
Article 1700:	Length: 42	Avg Compression: 0.343452
Article 1800:	Length: 43	Avg Compression: 0.341753
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.283097
Article 15800:	Length: 12274	Avg Compression: 0.282279
Article 15900:	Length: 12522	Avg Compression: 0.282002
Article 16000:	Length: 12843	Avg Compression: 0.281036
Article 16100:	Length: 13022	Avg Compression: 0.281048
Article 16200:	Length: 13335	Avg Compression: 0.281324
Article 16300:	Length: 13582	Avg Compression: 0.280787
Article 16400:	Length: 13868	Avg Compression: 0.281467
Article 16500:	Length: 14159	Avg Compression: 0.280902
Article 16600:	Length: 14497	Avg Compression: 0.280152
Article 16700:	Length: 14766	Avg Compression: 0.280525
Article 16800:	Length: 15189	Avg Compression: 0.280420
Article 16900:	Length: 15589	Avg Compression: 0.280653
Article 17000:	Length: 15922	Avg Compression: 0.280165
Article 17100:	Length: 16145	Avg Compression: 0.279694
Article 17200:	Length: 16798	Avg Compression: 0.279180
Article 17300:	Length: 17120	Avg Compression: 0.278282
Article 17400:	Length: 17810	Avg Compression: 0.279066
Article 17

In [14]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
   1/1950 [..............................] - ETA: 45:22

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [10]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        if len(article) == 0:
            continue
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.270833
Article 100:	Length: 28	Avg Compression: 0.337500
Article 200:	Length: 30	Avg Compression: 0.307143
Article 300:	Length: 31	Avg Compression: 0.299505
Article 400:	Length: 32	Avg Compression: 0.313910
Article 500:	Length: 33	Avg Compression: 0.317018
Article 600:	Length: 34	Avg Compression: 0.303125
Article 700:	Length: 35	Avg Compression: 0.307979
Article 800:	Length: 36	Avg Compression: 0.316882
Article 900:	Length: 37	Avg Compression: 0.311688
Article 1000:	Length: 38	Avg Compression: 0.312500
Article 1100:	Length: 39	Avg Compression: 0.307468
Article 1200:	Length: 39	Avg Compression: 0.299233
Article 1300:	Length: 40	Avg Compression: 0.307381
Article 1400:	Length: 41	Avg Compression: 0.300248
Article 1500:	Length: 41	Avg Compression: 0.305632
Article 1600:	Length: 42	Avg Compression: 0.302721
Article 1700:	Length: 42	Avg Compression: 0.295833
Article 1800:	Length: 43	Avg Compression: 0.298105
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.259960
Article 15800:	Length: 12274	Avg Compression: 0.259224
Article 15900:	Length: 12522	Avg Compression: 0.259023
Article 16000:	Length: 12843	Avg Compression: 0.258175
Article 16100:	Length: 13022	Avg Compression: 0.258373
Article 16200:	Length: 13335	Avg Compression: 0.258688
Article 16300:	Length: 13582	Avg Compression: 0.258092
Article 16400:	Length: 13943	Avg Compression: 0.258477
Article 16500:	Length: 14159	Avg Compression: 0.257893
Article 16600:	Length: 14497	Avg Compression: 0.257078
Article 16700:	Length: 14766	Avg Compression: 0.257635
Article 16800:	Length: 15138	Avg Compression: 0.258364
Article 16900:	Length: 15589	Avg Compression: 0.258718
Article 17000:	Length: 15922	Avg Compression: 0.258189
Article 17100:	Length: 16145	Avg Compression: 0.257737
Article 17200:	Length: 16798	Avg Compression: 0.257222
Article 17300:	Length: 17120	Avg Compression: 0.256367
Article 17400:	Length: 17810	Avg Compression: 0.257121
Article 17

In [11]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.260417
Article 100:	Length: 28	Avg Compression: 0.334375
Article 200:	Length: 30	Avg Compression: 0.301786
Article 300:	Length: 31	Avg Compression: 0.295792
Article 400:	Length: 32	Avg Compression: 0.307331
Article 500:	Length: 33	Avg Compression: 0.302711
Article 600:	Length: 34	Avg Compression: 0.286875
Article 700:	Length: 35	Avg Compression: 0.293085
Article 800:	Length: 36	Avg Compression: 0.302122
Article 900:	Length: 37	Avg Compression: 0.300325
Article 1000:	Length: 38	Avg Compression: 0.303468
Article 1100:	Length: 39	Avg Compression: 0.296429
Article 1200:	Length: 39	Avg Compression: 0.285672
Article 1300:	Length: 40	Avg Compression: 0.293912
Article 1400:	Length: 41	Avg Compression: 0.288366
Article 1500:	Length: 41	Avg Compression: 0.294185
Article 1600:	Length: 42	Avg Compression: 0.290391
Article 1700:	Length: 42	Avg Compression: 0.283135
Article 1800:	Length: 43	Avg Compression: 0.285290
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.251687
Article 15800:	Length: 12274	Avg Compression: 0.251073
Article 15900:	Length: 12522	Avg Compression: 0.250911
Article 16000:	Length: 12843	Avg Compression: 0.250114
Article 16100:	Length: 13022	Avg Compression: 0.250302
Article 16200:	Length: 13335	Avg Compression: 0.250629
Article 16300:	Length: 13582	Avg Compression: 0.249946
Article 16400:	Length: 13943	Avg Compression: 0.250405
Article 16500:	Length: 14159	Avg Compression: 0.249618
Article 16600:	Length: 14497	Avg Compression: 0.248725
Article 16700:	Length: 14766	Avg Compression: 0.249282
Article 16800:	Length: 15138	Avg Compression: 0.249918
Article 16900:	Length: 15589	Avg Compression: 0.250320
Article 17000:	Length: 15922	Avg Compression: 0.249785
Article 17100:	Length: 16145	Avg Compression: 0.249417
Article 17200:	Length: 16798	Avg Compression: 0.248803
Article 17300:	Length: 17120	Avg Compression: 0.247943
Article 17400:	Length: 17810	Avg Compression: 0.248783
Article 17

In [13]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.281250
Article 100:	Length: 28	Avg Compression: 0.337500
Article 200:	Length: 30	Avg Compression: 0.307143
Article 300:	Length: 31	Avg Compression: 0.299505
Article 400:	Length: 32	Avg Compression: 0.305451
Article 500:	Length: 33	Avg Compression: 0.304217
Article 600:	Length: 34	Avg Compression: 0.285000
Article 700:	Length: 35	Avg Compression: 0.288298
Article 800:	Length: 36	Avg Compression: 0.297970
Article 900:	Length: 37	Avg Compression: 0.297078
Article 1000:	Length: 38	Avg Compression: 0.300217
Article 1100:	Length: 39	Avg Compression: 0.294481
Article 1200:	Length: 39	Avg Compression: 0.283019
Article 1300:	Length: 40	Avg Compression: 0.287177
Article 1400:	Length: 41	Avg Compression: 0.281931
Article 1500:	Length: 41	Avg Compression: 0.288004
Article 1600:	Length: 42	Avg Compression: 0.284226
Article 1700:	Length: 42	Avg Compression: 0.277778
Article 1800:	Length: 43	Avg Compression: 0.280275
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.244421
Article 15800:	Length: 12274	Avg Compression: 0.243870
Article 15900:	Length: 12522	Avg Compression: 0.243772
Article 16000:	Length: 12843	Avg Compression: 0.242916
Article 16100:	Length: 13022	Avg Compression: 0.243183
Article 16200:	Length: 13335	Avg Compression: 0.243465
Article 16300:	Length: 13582	Avg Compression: 0.242864
Article 16400:	Length: 13943	Avg Compression: 0.243480
Article 16500:	Length: 14159	Avg Compression: 0.242656
Article 16600:	Length: 14497	Avg Compression: 0.241601
Article 16700:	Length: 14766	Avg Compression: 0.242115
Article 16800:	Length: 15138	Avg Compression: 0.242628
Article 16900:	Length: 15589	Avg Compression: 0.242975
Article 17000:	Length: 15922	Avg Compression: 0.242376
Article 17100:	Length: 16145	Avg Compression: 0.242022
Article 17200:	Length: 16798	Avg Compression: 0.241522
Article 17300:	Length: 17120	Avg Compression: 0.240824
Article 17400:	Length: 17810	Avg Compression: 0.241888
Article 17

In [15]:
model.train(256, 192, epochs=10)

Epoch 1/10


Epoch 2/10
Epoch 3/10
   1/1950 [..............................] - ETA: 29:10

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [14]:
model.train(256, 192, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [15]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.291667
Article 100:	Length: 28	Avg Compression: 0.228125
Article 200:	Length: 30	Avg Compression: 0.291071
Article 300:	Length: 31	Avg Compression: 0.295792
Article 400:	Length: 32	Avg Compression: 0.294173
Article 500:	Length: 33	Avg Compression: 0.279367
Article 600:	Length: 34	Avg Compression: 0.274375
Article 700:	Length: 35	Avg Compression: 0.269681
Article 800:	Length: 36	Avg Compression: 0.260148
Article 900:	Length: 37	Avg Compression: 0.264205
Article 1000:	Length: 38	Avg Compression: 0.271315
Article 1100:	Length: 39	Avg Compression: 0.270455
Article 1200:	Length: 39	Avg Compression: 0.266215
Article 1300:	Length: 40	Avg Compression: 0.255388
Article 1400:	Length: 41	Avg Compression: 0.256931
Article 1500:	Length: 41	Avg Compression: 0.261447
Article 1600:	Length: 42	Avg Compression: 0.264456
Article 1700:	Length: 42	Avg Compression: 0.263095
Article 1800:	Length: 43	Avg Compression: 0.260030
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.234219
Article 15800:	Length: 12274	Avg Compression: 0.233826
Article 15900:	Length: 12522	Avg Compression: 0.233875
Article 16000:	Length: 12843	Avg Compression: 0.233196
Article 16100:	Length: 13022	Avg Compression: 0.233402
Article 16200:	Length: 13324	Avg Compression: 0.234056
Article 16300:	Length: 13582	Avg Compression: 0.233434
Article 16400:	Length: 13868	Avg Compression: 0.234022
Article 16500:	Length: 14159	Avg Compression: 0.233238
Article 16600:	Length: 14497	Avg Compression: 0.232315
Article 16700:	Length: 14766	Avg Compression: 0.233031
Article 16800:	Length: 15189	Avg Compression: 0.233149
Article 16900:	Length: 15589	Avg Compression: 0.233600
Article 17000:	Length: 15906	Avg Compression: 0.232761
Article 17100:	Length: 16145	Avg Compression: 0.232501
Article 17200:	Length: 16798	Avg Compression: 0.231923
Article 17300:	Length: 17120	Avg Compression: 0.231234
Article 17400:	Length: 17810	Avg Compression: 0.232116
Article 17

In [16]:
model.train(256, 192, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
   1/1950 [..............................] - ETA: 43:51

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [10]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.270833
Article 100:	Length: 28	Avg Compression: 0.246875
Article 200:	Length: 30	Avg Compression: 0.298214
Article 300:	Length: 31	Avg Compression: 0.300743
Article 400:	Length: 32	Avg Compression: 0.263158
Article 500:	Length: 33	Avg Compression: 0.262801
Article 600:	Length: 34	Avg Compression: 0.276875
Article 700:	Length: 35	Avg Compression: 0.262234
Article 800:	Length: 36	Avg Compression: 0.257841
Article 900:	Length: 37	Avg Compression: 0.262175
Article 1000:	Length: 38	Avg Compression: 0.256503
Article 1100:	Length: 39	Avg Compression: 0.256169
Article 1200:	Length: 39	Avg Compression: 0.248526
Article 1300:	Length: 40	Avg Compression: 0.253233
Article 1400:	Length: 41	Avg Compression: 0.254455
Article 1500:	Length: 41	Avg Compression: 0.257097
Article 1600:	Length: 42	Avg Compression: 0.259779
Article 1700:	Length: 42	Avg Compression: 0.262698
Article 1800:	Length: 43	Avg Compression: 0.264302
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.234066
Article 15800:	Length: 12274	Avg Compression: 0.233555
Article 15900:	Length: 12522	Avg Compression: 0.233528
Article 16000:	Length: 12843	Avg Compression: 0.232837
Article 16100:	Length: 13022	Avg Compression: 0.233039
Article 16200:	Length: 13335	Avg Compression: 0.233213
Article 16300:	Length: 13582	Avg Compression: 0.232515
Article 16400:	Length: 13868	Avg Compression: 0.232852
Article 16500:	Length: 14159	Avg Compression: 0.232189
Article 16600:	Length: 14497	Avg Compression: 0.231291
Article 16700:	Length: 14766	Avg Compression: 0.231952
Article 16800:	Length: 15118	Avg Compression: 0.231429
Article 16900:	Length: 15589	Avg Compression: 0.231916
Article 17000:	Length: 15922	Avg Compression: 0.231370
Article 17100:	Length: 16145	Avg Compression: 0.231109
Article 17200:	Length: 16798	Avg Compression: 0.230609
Article 17300:	Length: 17360	Avg Compression: 0.230932
Article 17400:	Length: 17810	Avg Compression: 0.231818
Article 17

In [11]:
model.train(256, 192, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
   1/1950 [..............................] - ETA: 1:36:16

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [10]:
model.train(256, 192, epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [11]:
model.train(256, 192, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
   1/1950 [..............................] - ETA: 37:03

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

In [10]:
model.train(256, 192, epochs=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [14]:
model.train(256, 192, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
total_raw = 0
total_compressed = 0

huffman = Huffman(encoder.vocab_size)
for index, encoded_article in enumerate(articles.articles_generator(1)):
    if index % 100 == 0:
        article = encoder.decode(encoded_article)
        total_raw += len(article) * 8
        total_compressed += huffman.archive_size(model, encoded_article)
        print('Article %d:\tLength: %d\tAvg Compression: %f' % (index, len(article), total_compressed/total_raw))

Article 0:	Length: 12	Avg Compression: 0.250000
Article 100:	Length: 28	Avg Compression: 0.287500
Article 200:	Length: 30	Avg Compression: 0.251786
Article 300:	Length: 31	Avg Compression: 0.237624
Article 400:	Length: 32	Avg Compression: 0.219925
Article 500:	Length: 33	Avg Compression: 0.240211
Article 600:	Length: 34	Avg Compression: 0.258750
Article 700:	Length: 35	Avg Compression: 0.254787
Article 800:	Length: 36	Avg Compression: 0.250000
Article 900:	Length: 37	Avg Compression: 0.242289
Article 1000:	Length: 38	Avg Compression: 0.241691
Article 1100:	Length: 39	Avg Compression: 0.241558
Article 1200:	Length: 39	Avg Compression: 0.235849
Article 1300:	Length: 40	Avg Compression: 0.237608
Article 1400:	Length: 41	Avg Compression: 0.234406
Article 1500:	Length: 41	Avg Compression: 0.233516
Article 1600:	Length: 42	Avg Compression: 0.236820
Article 1700:	Length: 42	Avg Compression: 0.237302
Article 1800:	Length: 43	Avg Compression: 0.241828
Article 1900:	Length: 44	Avg Compression: 0

Article 15700:	Length: 11993	Avg Compression: 0.229267
Article 15800:	Length: 12274	Avg Compression: 0.228779
Article 15900:	Length: 12522	Avg Compression: 0.228677
Article 16000:	Length: 12843	Avg Compression: 0.228025
Article 16100:	Length: 13022	Avg Compression: 0.228200
Article 16200:	Length: 13324	Avg Compression: 0.228748
Article 16300:	Length: 13582	Avg Compression: 0.228051
Article 16400:	Length: 13868	Avg Compression: 0.228679
Article 16500:	Length: 14159	Avg Compression: 0.227888
Article 16600:	Length: 14497	Avg Compression: 0.226889
Article 16700:	Length: 14766	Avg Compression: 0.227600
Article 16800:	Length: 15189	Avg Compression: 0.227720
Article 16900:	Length: 15589	Avg Compression: 0.228260
Article 17000:	Length: 15922	Avg Compression: 0.227685
Article 17100:	Length: 16145	Avg Compression: 0.227485
Article 17200:	Length: 16798	Avg Compression: 0.226831
Article 17300:	Length: 17120	Avg Compression: 0.226112
Article 17400:	Length: 17810	Avg Compression: 0.226871
Article 17

## Нека запишем примерно предвиждане на данни; ще го използваме за тестване на кодиране на huffman и аритметично кодиране

In [21]:
text = articles.encoded_articles[18000]
input_eval = np.array([[text[0]]], dtype=TYPE)

model.predicting_model.reset_states()

char_actual = []
char_predictions = []

for byte in text[1:]:
    predictions = model.predict(input_eval)
    predictions = tf.squeeze(predictions, 0) # remove the batch dimension

    weights = tf.nn.softmax(predictions[0]).numpy()
    char_actual.append(byte)
    char_predictions.append(weights)

    input_eval = tf.expand_dims([byte], 0)

In [35]:
with open('char-actual', 'wb') as file:
    file.write(np.array(char_actual).tobytes())

In [36]:
with open('char-predictions', 'wb') as file:
    file.write(np.array(char_predictions).tobytes())

In [37]:
np.array(char_predictions).shape

(8680, 1014)

In [38]:
char_predictions[0]

array([1.7285488e-06, 3.6423372e-03, 1.1795628e-06, ..., 7.9423105e-03,
       2.4557270e-03, 2.2815516e-06], dtype=float32)