In [1]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
BATCH_SIZE = 4
BUFFER_SIZE = 128

with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

articles = data.split(b'\0')[:2000]

def articles_generator():
    for article in articles:
        yield np.frombuffer(article, dtype=np.uint8)

def split_input_target(chunk):
    return chunk[:, :-1], chunk[:, 1:]

dataset = tf.data.Dataset.from_generator(articles_generator, tf.uint8)
dataset = dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)
dataset = dataset.map(split_input_target)

dataset

<MapDataset shapes: ((4, None), (4, None)), types: (tf.uint8, tf.uint8)>

In [3]:
vocab_size = 256
embedding_dim = 256
rnn_units = 384

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

In [4]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [5]:
model = build_model(vocab_size = vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)
model.compile(optimizer='adam', loss=loss)

In [6]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [7]:
history = model.fit(dataset, epochs=10, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
history = model.fit(dataset, epochs=3, callbacks=[checkpoint_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
history = model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


In [10]:
history = model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


In [11]:
history = model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


In [12]:
history = model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


In [14]:
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

articles = data.split(b'\0')[:2000]
del data

article = articles[120]
del articles

len(article)

25541

In [15]:
import huffman

def huffman_archive_size(model, text):
    archived_size = 0
    ones = 0
    input_eval = [s for s in b' ']
    input_eval = tf.expand_dims(input_eval, 0)
  
    # Empty string to store our results
    text_generated = []

    model.reset_states()

    for byte in text:
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
  
        probabilities = tf.nn.softmax(predictions[0])
        codebook = huffman.codebook([index, tensor.numpy()] for index, tensor in enumerate(probabilities))

        code = codebook[byte]
        ones += code.count('1')
        archived_size += len(code)

        input_eval = tf.expand_dims([byte], 0)
  
    return ones, archived_size

In [16]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [17]:
ones, archived_size = huffman_archive_size(model, article)
print('\nTotal length:', archived_size)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio:', compression_ratio)

k = (ones / archived_size)
compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)


Total length: 180886
Compression ratio: 0.8852726987980111
Potential compression ratio with arithmetic coding: 0.8852726987980111


Не работи добре. Това, което не се вижда в Jupyter тетрадката е, че GPU-то е натоварено на под 50%. Това контрастира с досегашните експерименти, където се използваха над 90%.