In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
def articles(path='page_revisions_text', chunk_size=1024 * 1024):
    """Lazily yield the NUL-separated articles stored in a binary file.

    The file is read in fixed-size chunks so it never has to fit in memory.
    An article that straddles a chunk boundary is stitched back together
    before it is yielded.

    Args:
        path: File to read; defaults to the corpus file used by this notebook.
        chunk_size: Number of bytes requested per read.

    Yields:
        bytes: Each article without its NUL separator. An empty article
        between two adjacent separators is yielded as an empty bytes object;
        an empty tail after a trailing separator is not yielded.
    """
    with open(path, 'rb') as text_file:
        # Bytes of the article that is still incomplete at the chunk boundary.
        pending = b''
        while True:
            chunk = text_file.read(chunk_size)
            if not chunk:
                break

            pieces = chunk.split(b'\0')
            # The first piece continues whatever was left over from the
            # previous chunk; the last piece may itself be incomplete.
            pieces[0] = pending + pieces[0]
            pending = pieces.pop()
            yield from pieces

        # Data after the final separator forms one last article.
        if pending:
            yield pending

In [35]:
# First attempt: build a subword vocabulary from the corpus with a target size of 260.
subword_text_encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(articles(), 260)

b''


<SubwordTextEncoder vocab_size=276>

In [38]:
# Inspect the subwords the encoder learned.
subword_text_encoder.subwords

['e_',
 'er',
 'in',
 'an',
 ']]',
 'd_',
 'on',
 ', ',
 'or',
 's_',
 'at',
 'en',
 'ar',
 ' [[',
 'es',
 'th',
 'al',
 'is',
 'the_']

In [47]:
# Number of learned subwords, to compare against the reported vocab_size.
len(subword_text_encoder.subwords)

19

Получихме ужасно малък речник от под-думи. За `vocab_size` получаваме 276, а реално имаме 19 под-думи в `subwords`.

In [45]:
# Vocabulary entries that are not learned subwords.
subword_text_encoder.vocab_size - len(subword_text_encoder.subwords)

257

Разликата в размерите е подозрително близо до 256. Изглежда очакваният размер, който зададохме (в случая - 260) включва в себе си и единичните символи.
# 🤦‍♂️

<br>

Смятането отне прилично много време, а на практика не доведе до нищо смислено. И документацията не беше споменала нищо по въпроса.

Изглежда ще трябва да минем през процедурата още веднъж, но с по-голям речник. Нека дори опитаме с **доста** по-голям.

In [48]:
# Second attempt: rebuild the vocabulary with a much larger target size (2048).
subword_text_encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(articles(), 2048)

b''


In [49]:
# Display the encoder to see the vocabulary size that was actually reached.
subword_text_encoder

<SubwordTextEncoder vocab_size=2046>

In [50]:
# Inspect the subwords learned with the larger vocabulary.
subword_text_encoder.subwords

['the_',
 ', ',
 ' [[',
 'of_',
 's_',
 'and_',
 ']] ',
 '. ',
 'a_',
 'in_',
 'to_',
 'd_',
 'is_',
 'e_',
 '[[',
 ' (',
 'ed_',
 'y_',
 ']], ',
 'the',
 'The_',
 't_',
 'on_',
 'ing_',
 ']]\n[[',
 'for_',
 'was_',
 'are_',
 'in',
 '.  ',
 '% ',
 ']]',
 'as_',
 'on',
 'with_',
 'n_',
 'r_',
 '.\n\n',
 'al_',
 'er',
 'an_',
 'and',
 ')|',
 'that_',
 'by_',
 'of',
 'l_',
 'or_',
 'from_',
 "''",
 'ly_',
 'ro',
 'es',
 'le',
 '" ',
 "'''",
 ']], [[',
 '\\&undsc',
 'es_',
 'an',
 'at_',
 ') ',
 'is',
 'to',
 'http',
 '://',
 'ng_',
 ' "',
 ': ',
 'Category',
 'as',
 '{{',
 'li',
 'or',
 '18',
 'us',
 'ne',
 'st_',
 'de',
 'it_',
 'er_',
 " ''",
 'al',
 '; ',
 'ra',
 'his_',
 'have_',
 'ol',
 'lo',
 'be_',
 'hi',
 'which_',
 ']]. ',
 'en',
 'ng',
 'mi',
 'ad',
 'na',
 'ri',
 'sup2',
 'ed',
 'www',
 'ge',
 'sa',
 'se',
 'The',
 'ga',
 'it',
 'ha',
 'ar',
 'Re',
 '</',
 'ing',
 'Se',
 'age_',
 "'' ",
 'ta',
 'fa',
 'la',
 'com',
 'Census',
 ']] (',
 'th',
 'go',
 'me',
 'be',
 'do',
 'pi',
 

Новият размер е:

In [6]:
# Total number of tokens produced by encoding every article in the corpus.
sum(len(subword_text_encoder.encode(a)) for a in articles())

b''


350030507

Нека запишем речника във файл:

In [51]:
# Persist the vocabulary so it can be reloaded without rebuilding it from the corpus.
subword_text_encoder.save_to_file('vocab_2046')

При изпълнение по-надолу тетрадката заби. Добре е, че имаме записан речника. Продължаваме оттук, като индексите на изпълнените редове ще са малко объркани.

In [5]:
# Restore the encoder from the vocabulary previously written with save_to_file('vocab_2046').
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_2046')

Да видим как би изглеждало обучение с кодираните статии...

In [3]:
import os

import numpy as np
import itertools

In [25]:
BATCH_SIZE = 64  # articles per padded batch; also passed to build_model as the batch dimension
BATCHED_ITEM_LENGTH = 128  # maximum number of tokens in one sub-batch window
BUFFER_SIZE = 256  # shuffle buffer size, counted in articles
TYPE=np.uint16  # dtype used for token ids in arrays and datasets

def articles_generator(max_articles=2000):
    """Yield encoded articles, then filler items up to a multiple of BATCH_SIZE.

    Each of the first ``max_articles`` articles gets a NUL byte appended, is
    encoded with ``subword_text_encoder`` and is yielded as a 1-D array of
    dtype ``TYPE``. Single-element ``[0]`` arrays are then yielded until the
    number of items is a multiple of ``BATCH_SIZE``, so that batching with
    ``drop_remainder=True`` does not discard any real article.

    Args:
        max_articles: Upper bound on the number of articles read from
            ``articles()``. Defaults to 2000.

    Yields:
        numpy.ndarray: 1-D array of token ids with dtype ``TYPE``.
    """
    # Count items explicitly instead of relying on the loop variable, which
    # would be unbound after the loop if the corpus produced no articles.
    produced = 0
    for article in itertools.islice(articles(), 0, max_articles):
        yield np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
        produced += 1

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    # NOTE(review): id 0 is presumably the padding id of the encoder - confirm.
    while produced % BATCH_SIZE != 0:
        yield np.array([0], dtype=TYPE)
        produced += 1

def subbatches():
    """Yield fixed-width token windows cut from shuffled, padded article batches.

    Encoded articles from ``articles_generator`` are shuffled and grouped
    into padded batches of ``BATCH_SIZE`` rows. Each padded batch is then cut
    along its second axis into windows of at most ``BATCHED_ITEM_LENGTH``
    columns. Consecutive windows overlap by exactly one column, so the last
    token of one window is the first token of the next.

    Yields:
        numpy.ndarray: Array with ``BATCH_SIZE`` rows and between 2 and
        ``BATCHED_ITEM_LENGTH`` columns.
    """
    # Stepping by one less than the window width produces the one-column overlap.
    stride = BATCHED_ITEM_LENGTH - 1

    padded_batches = (
        tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
        .shuffle(BUFFER_SIZE)
        .padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)
    )

    for padded in padded_batches.as_numpy_iterator():
        # Stop once fewer than two columns would remain: a one-column window
        # cannot be split into an input and a shifted target.
        for start in range(0, padded.shape[1] - 1, stride):
            yield padded[:, start:start + BATCHED_ITEM_LENGTH]

# Expose the sub-batch generator as a dataset whose elements have BATCH_SIZE rows
# and a variable number of columns.
dataset = tf.data.Dataset.from_generator(subbatches, output_types=TYPE, output_shapes=(BATCH_SIZE, None))
# Build (input, target) pairs: the input drops the last column and the target drops
# the first, so each target token is the one that follows its input token.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((64, None), (64, None)), types: (tf.uint16, tf.uint16)>

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the embedding -> stateful LSTM -> dense model.

    Args:
        vocab_size: Number of distinct token ids; used both as the embedding
            input size and as the width of the output layer.
        embedding_dim: Size of each token embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch dimension of the model input.

    Returns:
        tf.keras.Sequential: The uncompiled model. Its output layer has no
        activation, so it produces one raw score per vocabulary entry.
    """
    token_embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    vocabulary_scores = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([token_embedding, recurrent, vocabulary_scores])

In [27]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw (unnormalized) scores.

    Args:
        labels: Integer class ids.
        logits: Unnormalized scores for every class.

    Returns:
        The per-element loss tensor produced by Keras.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    """Pseudo-metric that reports the size of the labels' second axis.

    ``predictions`` is ignored. The logged value is read back by
    ``ModelStateResetter`` under the key 'average_batch_length'.

    Args:
        true_labels: Label tensor with at least two axes.
        predictions: Unused; present to satisfy the metric signature.

    Returns:
        Scalar tensor holding ``tf.shape(true_labels)[1]``.
    """
    label_shape = tf.shape(true_labels)
    return label_shape[1]

# Training model: its batch dimension matches the BATCH_SIZE rows produced by the dataset.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=256, rnn_units=1024, batch_size=BATCH_SIZE)
# average_batch_length is registered as a metric so the callback can read it from the logs.
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [28]:
checkpoint_dir = './training_checkpoints' # Directory where the checkpoints will be saved
# The {epoch} placeholder is filled in by the checkpoint callback when it saves.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

# save_weights_only=True stores just the weights rather than the whole model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [29]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    """Reset the model's recurrent state when a short sub-batch is seen.

    The logs carry the running mean of the ``average_batch_length`` metric.
    Multiplying that mean by the number of batches seen so far recovers the
    total length, and the difference from the previous total is the length
    of the batch that just finished. A batch shorter than
    ``BATCHED_ITEM_LENGTH - 1`` is treated as the last window of its group
    of articles, and the model state is reset after it.

    NOTE(review): a final window that happens to be exactly full-length is
    not detected by this heuristic - confirm that is acceptable.
    """

    def __init__(self):
        # Let the Keras base class set up the attributes it expects.
        super().__init__()
        self.last_total_length = 0

    def on_epoch_begin(self, epoch, logs=None):
        # The batch index and the running mean both restart with every epoch
        # (and with every fit() call), so the reconstructed total must restart
        # too. Otherwise the first batch yields a negative length and a
        # spurious state reset.
        self.last_total_length = 0

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        average_batch_length = logs.get('average_batch_length', 0)
        # mean * count reconstructs the summed length of all batches so far.
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length

        if current_batch_length < BATCHED_ITEM_LENGTH - 1:
            self.model.reset_states()

model_state_resetter_callback = ModelStateResetter()

In [30]:
total_epochs = 12

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    # Pass the real epoch index to Keras. A bare fit() call always runs
    # "epoch 1", so the "ckpt_{epoch}" checkpoint name would resolve to the
    # same file every time and each epoch would overwrite the previous one.
    model.fit(
        dataset,
        initial_epoch=epoch,
        epochs=epoch + 1,
        callbacks=[checkpoint_callback, model_state_resetter_callback],
    )

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [9]:
# Stream the corpus up to the article at index 120 instead of loading the
# whole file into memory just to pick a single article.
article_stream = articles()
article = next(itertools.islice(article_stream, 120, 121))
# Closing the generator releases the file handle it still holds open.
article_stream.close()

# Encode exactly as during training: NUL terminator appended, ids stored as TYPE.
encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

print('Raw:', len(article))
print('Encded:', len(encoded_article))

Raw: 25541
Encded: 9379


In [17]:
import huffman

def huffman_archive_size(model, text, progress_every=50):
    """Measure how many bits a model-driven Huffman coder needs for ``text``.

    For every token the model's output is turned into a probability
    distribution over the vocabulary, a Huffman codebook is built from that
    distribution, and the length of the code assigned to the actual token is
    added to the total. The token is then fed back as the next model input.

    Args:
        model: Stateful model called with a (1, 1) array of token ids; its
            recurrent state is reset before the first token.
        text: Sequence of token ids to encode.
        progress_every: Print a progress line every this many tokens;
            a falsy value disables the output. Defaults to 50.

    Returns:
        tuple: ``(ones, archived_size)`` - the number of '1' bits in the
        emitted codes and the total number of emitted bits.
    """
    archived_size = 0
    ones = 0
    # The first prediction is conditioned on token id 0.
    input_eval = np.array([[0]], dtype=TYPE)

    model.reset_states()

    for index, token in enumerate(text):
        if progress_every and index % progress_every == 0:
            print('%d/%d' % (index, len(text)))

        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Convert the whole distribution to NumPy in one call rather than
        # converting every vocabulary entry separately.
        probabilities = tf.nn.softmax(predictions[0]).numpy()
        codebook = huffman.codebook(
            [symbol, weight] for symbol, weight in enumerate(probabilities)
        )

        code = codebook[token]
        ones += code.count('1')
        archived_size += len(code)

        # Feed the true token back in so the next prediction is conditioned on it.
        input_eval = tf.expand_dims([token], 0)

    return ones, archived_size

In [18]:
# Look the checkpoint up once and fail clearly if there is none, instead of
# handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

# Same architecture as in training, rebuilt with batch size 1 for token-by-token evaluation.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=256, rnn_units=1024, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [19]:
ones, archived_size = huffman_archive_size(model, encoded_article)
print('\nCompressed length:', archived_size)

# Token ids are stored as TYPE (np.uint16), so the encoded article takes
# nbytes * 8 bits; counting 8 bits per token would halve its real size.
compression_ratio = archived_size / (encoded_article.nbytes * 8)
print('Compression ratio for encoded:', compression_ratio)

# The raw article is a bytes object: 8 bits per element.
compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio for raw:', compression_ratio)

# k is the fraction of '1' bits in the Huffman output. The raw ratio is scaled
# by the binary entropy of that bit stream.
k = (ones / archived_size)
compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

0/9379
50/9379
100/9379
150/9379
200/9379
250/9379
300/9379
350/9379
400/9379
450/9379
500/9379
550/9379
600/9379
650/9379
700/9379
750/9379
800/9379
850/9379
900/9379
950/9379
1000/9379
1050/9379
1100/9379
1150/9379
1200/9379
1250/9379
1300/9379
1350/9379
1400/9379
1450/9379
1500/9379
1550/9379
1600/9379
1650/9379
1700/9379
1750/9379
1800/9379
1850/9379
1900/9379
1950/9379
2000/9379
2050/9379
2100/9379
2150/9379
2200/9379
2250/9379
2300/9379
2350/9379
2400/9379
2450/9379
2500/9379
2550/9379
2600/9379
2650/9379
2700/9379
2750/9379
2800/9379
2850/9379
2900/9379
2950/9379
3000/9379
3050/9379
3100/9379
3150/9379
3200/9379
3250/9379
3300/9379
3350/9379
3400/9379
3450/9379
3500/9379
3550/9379
3600/9379
3650/9379
3700/9379
3750/9379
3800/9379
3850/9379
3900/9379
3950/9379
4000/9379
4050/9379
4100/9379
4150/9379
4200/9379
4250/9379
4300/9379
4350/9379
4400/9379
4450/9379
4500/9379
4550/9379
4600/9379
4650/9379
4700/9379
4750/9379
4800/9379
4850/9379
4900/9379
4950/9379
5000/9379
5050/9379
510

Това са най-добрите резултати до момента. Вероятно с по-голям речник ще постигнем и повече. Но нека преди това да видим дали можем да натренираме невронната мрежа малко повече...

In [21]:
# Rebuild the model with the training batch size and resume from the latest checkpoint.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=256, rnn_units=1024, batch_size=BATCH_SIZE)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# The model object is new, so it has to be compiled again before further training.
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [26]:
total_epochs = 5

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    # Pass the real epoch index to Keras. A bare fit() call always runs
    # "epoch 1", so the "ckpt_{epoch}" checkpoint name would resolve to the
    # same file every time and each epoch would overwrite the previous one.
    model.fit(
        dataset,
        initial_epoch=epoch,
        epochs=epoch + 1,
        callbacks=[checkpoint_callback, model_state_resetter_callback],
    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# Look the checkpoint up once and fail clearly if there is none, instead of
# handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in %s' % checkpoint_dir)

# Same architecture as in training, rebuilt with batch size 1 for token-by-token evaluation.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=256, rnn_units=1024, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [28]:
ones, archived_size = huffman_archive_size(model, encoded_article)
print('\nCompressed length:', archived_size)

# Token ids are stored as TYPE (np.uint16), so the encoded article takes
# nbytes * 8 bits; counting 8 bits per token would halve its real size.
compression_ratio = archived_size / (encoded_article.nbytes * 8)
print('Compression ratio for encoded:', compression_ratio)

# The raw article is a bytes object: 8 bits per element.
compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio for raw:', compression_ratio)

# k is the fraction of '1' bits in the Huffman output. The raw ratio is scaled
# by the binary entropy of that bit stream.
k = (ones / archived_size)
compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

0/9379
50/9379
100/9379
150/9379
200/9379
250/9379
300/9379
350/9379
400/9379
450/9379
500/9379
550/9379
600/9379
650/9379
700/9379
750/9379
800/9379
850/9379
900/9379
950/9379
1000/9379
1050/9379
1100/9379
1150/9379
1200/9379
1250/9379
1300/9379
1350/9379
1400/9379
1450/9379
1500/9379
1550/9379
1600/9379
1650/9379
1700/9379
1750/9379
1800/9379
1850/9379
1900/9379
1950/9379
2000/9379
2050/9379
2100/9379
2150/9379
2200/9379
2250/9379
2300/9379
2350/9379
2400/9379
2450/9379
2500/9379
2550/9379
2600/9379
2650/9379
2700/9379
2750/9379
2800/9379
2850/9379
2900/9379
2950/9379
3000/9379
3050/9379
3100/9379
3150/9379
3200/9379
3250/9379
3300/9379
3350/9379
3400/9379
3450/9379
3500/9379
3550/9379
3600/9379
3650/9379
3700/9379
3750/9379
3800/9379
3850/9379
3900/9379
3950/9379
4000/9379
4050/9379
4100/9379
4150/9379
4200/9379
4250/9379
4300/9379
4350/9379
4400/9379
4450/9379
4500/9379
4550/9379
4600/9379
4650/9379
4700/9379
4750/9379
4800/9379
4850/9379
4900/9379
4950/9379
5000/9379
5050/9379
510