В предишната тетрадка получихме доста високи стойности за `loss`. Преди това да се случи променихме две неща - започнахме да експериментираме с обучаване на целият набор данни. И също така направихме промени в реда на обхождане на статиите при обучение. Да проверим първото би отнело значително време. Затова можем да проверим второто.

За целта ще вземем класа `Article` от предишната тетардка и ще го променим да прилича повече на предишни експерименти. След това ще преминем към обучение...

In [1]:
import itertools
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        articles = set(data.split(b'\0'))
        articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in articles]
        self.articles = sorted(articles, key=len)

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            yield remaining
            if remaining.shape[1] == batch_length + 1:
                yield np.zeros((batch_size, 2), dtype=TYPE)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
articles = Articles('page_revisions_text')

In [4]:
import math

def batch_count(articles, batch_size, batched_item_length):
    return sum(math.ceil(len(x) / batched_item_length) for i, x in enumerate(articles.articles_generator(batch_size, batched_item_length)) if (i + 1) % batch_size == 0)

In [5]:
batch_count(articles, 4096, 12)

31302

In [6]:
steps = 12

for i in range(steps):
    batch_size = 4 * 2**i
    batch_item_length = 4 * 2**(steps - i - 1)
    count = batch_count(articles, batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      4	 batch item length: 8192	steps per epoch:  54883
batch size:      8	 batch item length: 4096	steps per epoch:  30600
batch size:     16	 batch item length: 2048	steps per epoch:  19009
batch size:     32	 batch item length: 1024	steps per epoch:  13899
batch size:     64	 batch item length:  512	steps per epoch:  12037
batch size:    128	 batch item length:  256	steps per epoch:  11533
batch size:    256	 batch item length:  128	steps per epoch:  12121
batch size:    512	 batch item length:   64	steps per epoch:  14172
batch size:   1024	 batch item length:   32	steps per epoch:  18709
batch size:   2048	 batch item length:   16	steps per epoch:  28057
batch size:   4096	 batch item length:    8	steps per epoch:  46942
batch size:   8192	 batch item length:    4	steps per epoch:  84947


In [13]:
BATCH_SIZE = 1536
BATCHED_ITEM_LENGTH = 32

def build_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[BATCH_SIZE, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

checkpoint_dir = './training_checkpoints-3' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

In [14]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024)
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (1536, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (1536, None, 512)         2072576   
_________________________________________________________________
gru_4 (GRU)                  (1536, None, 1024)        4724736   
_________________________________________________________________
gru_5 (GRU)                  (1536, None, 1024)        6297600   
_________________________________________________________________
dense_2 (Dense)              (1536, None, 4048)        4149200   
Total params: 17,244,112
Trainable params: 17,244,112
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), epochs=5, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
   3892/Unknown - 3041s 781ms/step - loss: 3.4269 - average_batch_length: 31.5626

KeyboardInterrupt: 

До тук добре. Числото изглежда много подобно на това, което сме виждали в предишни тетрадки с различен код, но при идентични параметри.

Да видим до каква компресия ще доведе този `loss`.

In [12]:
def build_predicting_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        #tf.keras.layers.GRU(rnn_units, return_sequences=True),
        #tf.keras.layers.GRU(rnn_units, return_sequences=True),
        #tf.keras.layers.GRU(rnn_units, return_sequences=True),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True),
        tf.keras.layers.Dense(vocab_size),
    ])

model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [13]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [14]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [15]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    raw = len(encoded_article) * 8
    if raw == 0:
        continue
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed

    if index % 100 == 0:
        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 0:	Length: 56	Compression: 1.035714	Avg Compression: 1.035714
Article 100:	Length: 104	Compression: 0.932692	Avg Compression: 0.808464
Article 200:	Length: 72	Compression: 1.069444	Avg Compression: 0.794248
Article 300:	Length: 78112	Compression: 0.786038	Avg Compression: 0.792500
Article 400:	Length: 40	Compression: 1.450000	Avg Compression: 0.798137
Article 500:	Length: 3600	Compression: 0.831667	Avg Compression: 0.797820
Article 600:	Length: 88	Compression: 0.818182	Avg Compression: 0.796234
Article 700:	Length: 78144	Compression: 0.794815	Avg Compression: 0.796330
Article 800:	Length: 38392	Compression: 0.731038	Avg Compression: 0.797411
Article 900:	Length: 2840	Compression: 0.779225	Avg Compression: 0.798297
Article 1000:	Length: 5960	Compression: 0.774832	Avg Compression: 0.797415
Article 1100:	Length: 104	Compression: 0.990385	Avg Compression: 0.797880
Article 1200:	Length: 120	Compression: 1.100000	Avg Compression: 0.798434
Article 1300:	Length: 11192	Compression: 0.74

Да разгледаме друг вариант за обхождане на статиите в dataset-а.

Заради по-малко похабено място при padding, групирането на статии с еднакъв размер заедно води до по-бързи итерации. Което би било добре дошло.

Също така прави броят на итерации предвидим. Което поради чисто имплементационни детайли работи по-добре с Keras.

Това, обаче, ограничава възможността ни да разбъркваме входните данни. Нека разгледаме какво би било отражението на всичко това върху `loss` функцията и върху размерът на компресираните данни...

In [16]:
class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in data.split(b'\0')[:10000]]
        self.articles = sorted(articles, key=len)

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [17]:
articles = Articles('page_revisions_text')

In [18]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024)
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [19]:
model.fit(articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH), callbacks=[checkpoint_callback, model_state_resetter_callback])



<tensorflow.python.keras.callbacks.History at 0x1f58037d1c8>

In [None]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    raw = len(encoded_article) * 8
    if raw == 0:
        continue
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed

    if index % 100 == 0:
        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 100:	Length: 48	Compression: 1.229167	Avg Compression: 1.286173
Article 200:	Length: 56	Compression: 1.250000	Avg Compression: 1.264769
Article 300:	Length: 56	Compression: 1.303571	Avg Compression: 1.263903
Article 400:	Length: 64	Compression: 1.265625	Avg Compression: 1.254929
Article 500:	Length: 64	Compression: 1.390625	Avg Compression: 1.254209
Article 600:	Length: 64	Compression: 1.250000	Avg Compression: 1.252549
Article 700:	Length: 64	Compression: 0.937500	Avg Compression: 1.242738
Article 800:	Length: 72	Compression: 1.138889	Avg Compression: 1.232653
Article 900:	Length: 72	Compression: 1.208333	Avg Compression: 1.232040
Article 1000:	Length: 72	Compression: 1.194444	Avg Compression: 1.233190
Article 1100:	Length: 72	Compression: 1.402778	Avg Compression: 1.234418
Article 1200:	Length: 80	Compression: 1.137500	Avg Compression: 1.232741
Article 1300:	Length: 80	Compression: 1.075000	Avg Compression: 1.232191
Article 1400:	Length: 80	Compression: 1.125000	Avg Compressi

In [16]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(itertools.islice(articles.articles_generator(1), 7601, 10000)):
    raw = len(encoded_article) * 8
    if raw == 0:
        continue
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed

    if (7601 + index) % 100 == 0:
        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 99:	Length: 31016	Compression: 1.159724	Avg Compression: 1.160828
Article 199:	Length: 32480	Compression: 1.164070	Avg Compression: 1.157721
Article 299:	Length: 34064	Compression: 1.218853	Avg Compression: 1.157919
Article 399:	Length: 35456	Compression: 1.174272	Avg Compression: 1.158292
Article 499:	Length: 37096	Compression: 1.180370	Avg Compression: 1.159346
Article 599:	Length: 38712	Compression: 1.095268	Avg Compression: 1.159092
Article 699:	Length: 40576	Compression: 1.146712	Avg Compression: 1.158533
Article 799:	Length: 42376	Compression: 1.173424	Avg Compression: 1.158731
Article 899:	Length: 44472	Compression: 1.187466	Avg Compression: 1.158734
Article 999:	Length: 46896	Compression: 1.138263	Avg Compression: 1.158351
Article 1099:	Length: 49352	Compression: 1.139549	Avg Compression: 1.159018
Article 1199:	Length: 51952	Compression: 1.154239	Avg Compression: 1.159643
Article 1299:	Length: 55392	Compression: 1.160041	Avg Compression: 1.160231
Article 1399:	Length: 5

In [17]:
print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 2398:	Length: 606424	Compression: 1.209142	Avg Compression: 1.168509


In [18]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        articles = set(data.split(b'\0')[:10000])
        articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in articles]
        self.articles = sorted(articles, key=len)

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [19]:
articles = Articles('page_revisions_text')

In [20]:
BATCH_SIZE = 192
BATCHED_ITEM_LENGTH = 256

def build_model(vocab_size, embedding_dim, rnn_units):
    return tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[BATCH_SIZE, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        # tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

checkpoint_dir = './training_checkpoints-3' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

In [21]:
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [None]:
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
model.fit(dataset, epochs=6, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/6
    730/Unknown - 689s 944ms/step - loss: 3.6365 - average_batch_length: 246.9233

In [None]:
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
model.fit(dataset, epochs=5, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
 75/812 [=>............................] - ETA: 11:21 - loss: 3.5320 - average_batch_length: 237.5200

In [None]:
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

    121/Unknown - 116s 957ms/step - loss: 3.5313 - average_batch_length: 242.2562

In [None]:
total_raw = 0
total_compressed = 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    raw = len(encoded_article) * 8
    if raw == 0:
        continue
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed

    if index % 100 == 0:
        print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % (index, raw, compressed/raw, total_compressed/total_raw))

Article 100:	Length: 56	Compression: 1.178571	Avg Compression: 1.145208
Article 200:	Length: 56	Compression: 1.178571	Avg Compression: 1.114327
Article 300:	Length: 64	Compression: 1.031250	Avg Compression: 1.094505
Article 400:	Length: 64	Compression: 1.015625	Avg Compression: 1.071536
Article 500:	Length: 64	Compression: 0.937500	Avg Compression: 1.055387
Article 600:	Length: 64	Compression: 0.937500	Avg Compression: 1.047612
Article 700:	Length: 72	Compression: 0.916667	Avg Compression: 1.038866
Article 800:	Length: 72	Compression: 0.944444	Avg Compression: 1.033067
Article 900:	Length: 72	Compression: 1.138889	Avg Compression: 1.026084
Article 1000:	Length: 80	Compression: 0.950000	Avg Compression: 1.021565
Article 1100:	Length: 80	Compression: 1.025000	Avg Compression: 1.014067
Article 1200:	Length: 80	Compression: 1.125000	Avg Compression: 1.007953
Article 1300:	Length: 80	Compression: 0.925000	Avg Compression: 1.001440
Article 1400:	Length: 88	Compression: 0.829545	Avg Compressi