In [1]:
import itertools
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
#config = tf.compat.v1.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 2
#config.gpu_options.allow_growth = True
#
#tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

In [3]:
def articles():
    with open('page_revisions_text', 'rb') as text_file:
        pending_article_data = b''
        while True:
            data = text_file.read(1024 ** 2)
            if len(data) == 0:
                break

            articles = data.split(b'\0')
            articles[0] = pending_article_data + articles[0]
            for index, article in enumerate(articles):
                if index + 1 == len(articles):
                    pending_article_data = article
                else:
                    yield article

        print(pending_article_data)
        if len(pending_article_data) != 0:
            yield pending_article_data

In [4]:
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

In [5]:
BATCH_SIZE = 2048
BATCHED_ITEM_LENGTH = 24
TYPE=np.int16

def articles_generator():
    for index, article in enumerate(itertools.islice(articles(), 0, 25000)):
        yield np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    index += 1
    while index % BATCH_SIZE != 0:
        yield np.array([0], dtype=TYPE)
        index += 1

def subbatches():
    dataset = tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
    dataset = dataset.shuffle(1500)
    dataset = dataset.padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)
    dataset = dataset.shuffle(50)

    for batch in dataset.as_numpy_iterator():
        remaining = batch
        while remaining.shape[1] > 1:
            yield remaining[:, :BATCHED_ITEM_LENGTH + 1]
            remaining = remaining[:, BATCHED_ITEM_LENGTH:]

dataset = tf.data.Dataset.from_generator(subbatches, output_types=TYPE, output_shapes=(BATCH_SIZE, None))
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((2048, None), (2048, None)), types: (tf.int16, tf.int16)>

In [6]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

In [7]:
checkpoint_dir = './training_checkpoints-2' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=768, batch_size=BATCH_SIZE)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [None]:
total_epochs = 4

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/4
Epoch 2/4
  14715/Unknown - 10330s 702ms/step - loss: 0.2103 - average_batch_length: 23.9956

In [None]:
total_epochs = 4

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/4
Epoch 2/4
Epoch 3/4
    973/Unknown - 759s 780ms/step - loss: 0.3644 - average_batch_length: 24.0000

In [8]:
total_epochs = 2

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/2
Epoch 2/2
   3568/Unknown - 2612s 732ms/step - loss: 0.2435 - average_batch_length: 23.9980

KeyboardInterrupt: 

In [30]:
def build_predicting_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True),
        tf.keras.layers.GRU(rnn_units, return_sequences=True),
        tf.keras.layers.GRU(rnn_units, return_sequences=True),
        tf.keras.layers.Dense(vocab_size),
    ])

model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [31]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [32]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [33]:
total_raw = 0
total_compressed = 0

for index, article in enumerate(itertools.islice(articles(), 0, 512)):
    raw = (len(article) + 1) * 8
    encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, compressed/raw, total_compressed/total_raw))

Article 0:	Compression: 9.611111	Avg Compression: 9.611111
Article 1:	Compression: 4.333333	Avg Compression: 5.916667
Article 2:	Compression: 3.719388	Avg Compression: 4.928899
Article 3:	Compression: 5.833333	Avg Compression: 5.124101
Article 4:	Compression: 5.035714	Avg Compression: 5.106322
Article 5:	Compression: 4.442308	Avg Compression: 4.984742
Article 6:	Compression: 0.359880	Avg Compression: 0.376640
Article 7:	Compression: 4.722973	Avg Compression: 0.379374
Article 8:	Compression: 4.415625	Avg Compression: 0.382117
Article 9:	Compression: 4.172619	Avg Compression: 0.384820
Article 10:	Compression: 4.703947	Avg Compression: 0.387605
Article 11:	Compression: 4.000000	Avg Compression: 0.390300
Article 12:	Compression: 4.005682	Avg Compression: 0.392995
Article 13:	Compression: 4.641447	Avg Compression: 0.395728
Article 14:	Compression: 3.803191	Avg Compression: 0.398438
Article 15:	Compression: 7.291667	Avg Compression: 0.401235
Article 16:	Compression: 5.032143	Avg Compression:

Article 137:	Compression: 4.878472	Avg Compression: 0.421416
Article 138:	Compression: 2.953629	Avg Compression: 0.421798
Article 139:	Compression: 0.403534	Avg Compression: 0.420760
Article 140:	Compression: 0.370418	Avg Compression: 0.417437
Article 141:	Compression: 0.437974	Avg Compression: 0.417593
Article 142:	Compression: 0.407071	Avg Compression: 0.417415
Article 143:	Compression: 0.344310	Avg Compression: 0.416730
Article 144:	Compression: 4.570513	Avg Compression: 0.417066
Article 145:	Compression: 0.372025	Avg Compression: 0.415459
Article 146:	Compression: 0.369252	Avg Compression: 0.414700
Article 147:	Compression: 0.371935	Avg Compression: 0.413552
Article 148:	Compression: 0.339783	Avg Compression: 0.409262
Article 149:	Compression: 0.333956	Avg Compression: 0.404510
Article 150:	Compression: 5.121429	Avg Compression: 0.404789
Article 151:	Compression: 0.358488	Avg Compression: 0.404245
Article 152:	Compression: 0.359980	Avg Compression: 0.400195
Article 153:	Compression

Article 272:	Compression: 0.388442	Avg Compression: 0.398853
Article 273:	Compression: 0.397660	Avg Compression: 0.398853
Article 274:	Compression: 0.546896	Avg Compression: 0.399044
Article 275:	Compression: 0.555197	Avg Compression: 0.399157
Article 276:	Compression: 0.421503	Avg Compression: 0.399183
Article 277:	Compression: 0.342048	Avg Compression: 0.398853
Article 278:	Compression: 0.450414	Avg Compression: 0.399150
Article 279:	Compression: 0.390655	Avg Compression: 0.399096
Article 280:	Compression: 7.250000	Avg Compression: 0.399190
Article 281:	Compression: 0.377581	Avg Compression: 0.398740
Article 282:	Compression: 0.395033	Avg Compression: 0.398732
Article 283:	Compression: 0.373618	Avg Compression: 0.398498
Article 284:	Compression: 0.425301	Avg Compression: 0.398847
Article 285:	Compression: 3.831522	Avg Compression: 0.398933
Article 286:	Compression: 5.021429	Avg Compression: 0.399021
Article 287:	Compression: 0.524901	Avg Compression: 0.399211
Article 288:	Compression





Article 333:	Compression: 0.360780	Avg Compression: 0.394604
Article 334:	Compression: 0.371112	Avg Compression: 0.394536
Article 335:	Compression: 0.375629	Avg Compression: 0.394444
Article 336:	Compression: 0.502053	Avg Compression: 0.394654
Article 337:	Compression: 0.539279	Avg Compression: 0.394814
Article 338:	Compression: 0.629167	Avg Compression: 0.394923
Article 339:	Compression: 0.461779	Avg Compression: 0.395037
Article 340:	Compression: 0.371066	Avg Compression: 0.394753
Article 341:	Compression: 0.346277	Avg Compression: 0.394726
Article 342:	Compression: 0.374404	Avg Compression: 0.394707
Article 343:	Compression: 5.017361	Avg Compression: 0.394771
Article 344:	Compression: 0.510524	Avg Compression: 0.395250
Article 345:	Compression: 0.520832	Avg Compression: 0.395946
Article 346:	Compression: 0.366179	Avg Compression: 0.395642
Article 347:	Compression: 2.286164	Avg Compression: 0.395757
Article 348:	Compression: 0.407253	Avg Compression: 0.395844
Article 349:	Compression

Article 425:	Compression: 1.183333	Avg Compression: 0.396125
Article 426:	Compression: 0.375299	Avg Compression: 0.395976
Article 427:	Compression: 0.350201	Avg Compression: 0.395652
Article 428:	Compression: 0.417591	Avg Compression: 0.395736
Article 429:	Compression: 0.495609	Avg Compression: 0.395894
Article 430:	Compression: 0.397997	Avg Compression: 0.395900
Article 431:	Compression: 6.687500	Avg Compression: 0.395946
Article 432:	Compression: 0.919563	Avg Compression: 0.396030
Article 433:	Compression: 0.361158	Avg Compression: 0.395257
Article 434:	Compression: 0.382844	Avg Compression: 0.395202
Article 435:	Compression: 7.192708	Avg Compression: 0.395247
Article 436:	Compression: 0.463860	Avg Compression: 0.395407
Article 437:	Compression: 0.432903	Avg Compression: 0.395558
Article 438:	Compression: 0.448812	Avg Compression: 0.395717
Article 439:	Compression: 0.501854	Avg Compression: 0.395891
Article 440:	Compression: 0.487048	Avg Compression: 0.396081
Article 441:	Compression