In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
def articles(path='page_revisions_text', chunk_size=1024 * 1024):
    """Stream the NUL-separated articles stored in a binary file.

    The file is read in fixed-size chunks, so it never has to fit in memory.
    Data after the last ``b'\\0'`` of a chunk is carried over and prepended
    to the next chunk, which means articles spanning chunk boundaries are
    reassembled correctly.

    Args:
        path: File to read. Defaults to the corpus file used by this notebook.
        chunk_size: Number of bytes requested per read; must be positive.

    Yields:
        bytes: One article at a time, without its ``b'\\0'`` separator.
        Empty articles in the middle of the file are yielded as ``b''``;
        an empty tail after the final separator is not yielded.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be positive, got %r' % (chunk_size,))

    pending_article_data = b''
    with open(path, 'rb') as text_file:
        while True:
            data = text_file.read(chunk_size)
            if not data:
                break

            pieces = data.split(b'\0')
            # The first piece continues the unfinished article of the previous chunk.
            pieces[0] = pending_article_data + pieces[0]
            # The last piece may be cut off by the chunk boundary: hold it back.
            pending_article_data = pieces.pop()
            yield from pieces

        # Whatever is left after the final separator is the last article.
        if pending_article_data:
            yield pending_article_data

In [3]:
# Load the subword text encoder from the 'vocab_4096' file prefix; it is used below
# both to encode articles into token ids and to size the model's vocabulary.
# NOTE(review): newer tensorflow_datasets releases expose this class under
# `tfds.deprecated.text` instead of `tfds.features.text` — confirm against the installed version.
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

Да видим как би изглеждало обучение с кодираните статии...

In [4]:
import os

import numpy as np
import itertools

In [5]:
# Number of articles grouped into one padded batch (also the model's fixed batch size).
BATCH_SIZE = 128
# Step, in token positions, between consecutive sub-batches cut from a padded batch;
# each sub-batch carries one extra column so inputs and targets can be shifted by one.
BATCHED_ITEM_LENGTH = 256
# Shuffle buffer size handed to Dataset.shuffle.
BUFFER_SIZE = 256
# dtype used for every array of token ids.
TYPE=np.uint16

def articles_generator():
    """Yield the first 2000 articles as arrays of token ids, padded to a full batch.

    Each article gets a trailing ``b'\\0'`` appended before it is encoded with
    ``subword_text_encoder``. After the real articles, single-element ``[0]``
    arrays are yielded until the total count is a multiple of ``BATCH_SIZE``,
    so that batching with ``drop_remainder=True`` does not discard real data.

    Yields:
        numpy.ndarray: 1-D array of dtype ``TYPE``.
    """
    # Counted explicitly (instead of reusing the enumerate index) so that an
    # empty corpus does not leave the counter unbound.
    article_count = 0
    for article in itertools.islice(articles(), 0, 2000):
        yield np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
        article_count += 1

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    while article_count % BATCH_SIZE != 0:
        yield np.array([0], dtype=TYPE)
        article_count += 1

def subbatches():
    """Cut shuffled, padded article batches into overlapping column windows.

    Articles from ``articles_generator`` are shuffled and padded into batches
    of ``BATCH_SIZE`` rows. Every batch is then walked left to right in steps
    of ``BATCHED_ITEM_LENGTH`` columns; each yielded window is at most
    ``BATCHED_ITEM_LENGTH + 1`` columns wide, so neighbouring windows share
    one column. Windows are only produced while more than one column remains.

    Yields:
        numpy.ndarray: 2-D slice (rows x columns) of the current padded batch.
    """
    article_batches = (
        tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
        .shuffle(BUFFER_SIZE)
        .padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)
    )

    for padded in article_batches.as_numpy_iterator():
        width = padded.shape[1]
        # A window starts wherever at least two columns are still available.
        for start in range(0, width - 1, BATCHED_ITEM_LENGTH):
            yield padded[:, start:start + BATCHED_ITEM_LENGTH + 1]

def _split_input_target(batch):
    """Pair every column window with itself shifted one position to the left."""
    return batch[:, :-1], batch[:, 1:]


# Training dataset: (input, target) pairs built from the windows of `subbatches`.
dataset = tf.data.Dataset.from_generator(
    subbatches,
    output_types=TYPE,
    output_shapes=(BATCH_SIZE, None),
).map(_split_input_target)

dataset

<MapDataset shapes: ((128, None), (128, None)), types: (tf.uint16, tf.uint16)>

In [6]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the token-level language model.

    The network is an embedding layer, two stacked stateful LSTM layers that
    return full sequences, and a dense layer with one output per vocabulary
    entry (no activation is applied to that output).

    Args:
        vocab_size: Number of distinct token ids (embedding rows and outputs).
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of units in each LSTM layer.
        batch_size: Fixed batch size baked into the input shape.

    Returns:
        tf.keras.Sequential: The assembled, uncompiled model.
    """
    def recurrent_layer():
        return tf.keras.layers.LSTM(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )

    network = tf.keras.Sequential()
    network.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]))
    network.add(recurrent_layer())
    network.add(recurrent_layer())
    network.add(tf.keras.layers.Dense(vocab_size))
    return network

In [7]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

def average_batch_length(true_labels, predictions):
    """Metric function returning the size of axis 1 of ``true_labels``.

    ``predictions`` is accepted only to satisfy the metric signature and is
    not used.
    """
    label_shape = tf.shape(true_labels)
    return label_shape[1]

# Training model, sized to the encoder's vocabulary and the training batch size.
model = build_model(
    vocab_size=subword_text_encoder.vocab_size,
    embedding_dim=512,
    rnn_units=1024,
    batch_size=BATCH_SIZE,
)
# `average_batch_length` is registered as a metric so callbacks can read it from `logs`.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=[average_batch_length],
)

In [8]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# NOTE(review): the training cells call fit() for a single epoch at a time, so
# `{epoch}` presumably formats to the same value on every save and each
# checkpoint overwrites the previous one — confirm.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [9]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    """Reset the model's recurrent state whenever a short batch has been processed.

    The sequence length of the batch that just finished is not passed to the
    callback directly. It is reconstructed from the ``average_batch_length``
    metric in ``logs``: this relies on Keras reporting the metric as its
    running mean over the batches seen so far, so ``mean * batches_seen`` is
    the total length so far and the difference to the previous total is the
    length of the current batch.

    A batch shorter than ``BATCHED_ITEM_LENGTH`` is treated as the last window
    of a group of articles, after which the state is reset.

    NOTE(review): a final window that is exactly ``BATCHED_ITEM_LENGTH`` long
    is indistinguishable from a middle window and does not trigger a reset;
    the reconstruction also goes through a rounded float mean — confirm both
    are acceptable.
    """

    def __init__(self):
        super().__init__()
        # Sum of the sequence lengths of all batches seen in the current epoch.
        self.last_total_length = 0

    def on_epoch_begin(self, epoch, logs=None):
        # The batch index and the running metric restart with every epoch (and
        # every fit() call). Without clearing the total here, the first batch
        # of a later epoch would yield a negative length and a spurious reset.
        self.last_total_length = 0
        # A new epoch starts on fresh articles, so stale state must not leak in.
        self.model.reset_states()

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length

        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
# One callback instance, reused by every model.fit() call in the training cells.
model_state_resetter_callback = ModelStateResetter()

In [10]:
# First training run: 17 passes over the dataset, one fit() call per pass.
total_epochs = 17
training_callbacks = [checkpoint_callback, model_state_resetter_callback]

for epoch_number in range(1, total_epochs + 1):
    print('Epoch %d/%d' % (epoch_number, total_epochs))
    model.fit(dataset, callbacks=training_callbacks)

Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17


In [12]:
# Continue training the same model for 8 more passes, one fit() call per pass.
total_epochs = 8
training_callbacks = [checkpoint_callback, model_state_resetter_callback]

for epoch_number in range(1, total_epochs + 1):
    print('Epoch %d/%d' % (epoch_number, total_epochs))
    model.fit(dataset, callbacks=training_callbacks)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [13]:
# Continue training the same model for 5 more passes, one fit() call per pass.
total_epochs = 5
training_callbacks = [checkpoint_callback, model_state_resetter_callback]

for epoch_number in range(1, total_epochs + 1):
    print('Epoch %d/%d' % (epoch_number, total_epochs))
    model.fit(dataset, callbacks=training_callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# Another 5 passes over the dataset, one fit() call per pass.
total_epochs = 5
training_callbacks = [checkpoint_callback, model_state_resetter_callback]

for epoch_number in range(1, total_epochs + 1):
    print('Epoch %d/%d' % (epoch_number, total_epochs))
    model.fit(dataset, callbacks=training_callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Final 3 passes over the dataset, one fit() call per pass.
total_epochs = 3
training_callbacks = [checkpoint_callback, model_state_resetter_callback]

for epoch_number in range(1, total_epochs + 1):
    print('Epoch %d/%d' % (epoch_number, total_epochs))
    model.fit(dataset, callbacks=training_callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
# Pick one article (index 120) as a sample for the compression measurement.
# It is taken from the streaming `articles()` reader, which stops as soon as
# the article has been found, instead of loading and splitting the whole
# corpus file in memory.
article = next(itertools.islice(articles(), 120, None))

# Encode it exactly as the training data is encoded: with a trailing NUL byte.
encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

print('Raw:', len(article))
print('Encoded:', len(encoded_article))

Raw: 25541
Encoded: 8222


In [17]:
import ctypes

class Huffman:
    """ctypes wrapper around a tree object of the native ``huffman`` library.

    The library is loaded once, when the class body is executed. Each instance
    owns one tree created by ``create_tree`` and releases it with
    ``destroy_tree`` when the instance is garbage collected.

    NOTE(review): the C signatures are not visible here; integer arguments are
    passed with ctypes' default conversion — confirm they match the library.
    """
    huffman = ctypes.CDLL('x64/Release/huffman')

    # Return types that differ from ctypes' default of C int.
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p

    def __init__(self, category_count):
        """Create a native tree for ``category_count`` categories."""
        # Defined before the native call so __del__ stays safe if it raises.
        self.tree = None
        self._weights = None
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        # The attribute is missing or None when __init__ did not complete.
        tree = getattr(self, 'tree', None)
        if tree is not None:
            self.tree = None
            self.huffman.destroy_tree(tree)

    def load_weights(self, weights):
        """Hand an array of per-category weights to the native tree.

        ``weights`` is converted to a C-contiguous float32 array first, since
        the library receives it as a bare ``float*``. An array that already has
        that layout is passed through without copying.
        """
        buffer = np.ascontiguousarray(weights, dtype=np.float32)
        # Keep the buffer alive in case the library retains the pointer.
        self._weights = buffer
        self.huffman.load_weights(self.tree, buffer.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))

    def get_code_length(self, category):
        """Return the library's ``get_code_length`` result for ``category``."""
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        """Return the library's ``get_code_zero_count`` result for ``category``."""
        return self.huffman.get_code_zero_count(self.tree, category)

In [18]:
def huffman_archive_size(model, text):
    """Measure the Huffman-coded size of a token sequence under the model.

    Before each token, the model's output for the previous token is turned
    into a probability distribution with softmax and loaded into a Huffman
    tree as weights. The code length of the actual token in that tree is
    added to the total.

    Args:
        model: Stateful model built for batch size 1; its state is reset first.
        text: Iterable of token ids (NumPy scalars or plain ints).

    Returns:
        tuple: ``(archived_size, zeros)`` — the summed code lengths and the
        summed ``get_code_zero_count`` values over all tokens.

    NOTE(review): the first prediction is conditioned on token id 0 with
    freshly reset state, whereas the training inputs never begin a sequence
    with an extra leading 0. The cost of the first token may therefore be
    inflated, which would weigh most on short articles — confirm.
    """
    archived_size = 0
    zeros = 0
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    model.reset_states()

    # Input shape is (batch=1, time=1): one token is fed per step.
    input_eval = np.array([[0]], dtype=TYPE)
    for token in text:
        token = int(token)

        predictions = tf.squeeze(model(input_eval), 0)  # remove the batch dimension
        weights = tf.nn.softmax(predictions[0]).numpy()

        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(token)
        archived_size += huffman_tree.get_code_length(token)

        # The token just coded becomes the input for the next prediction.
        input_eval = np.array([[token]], dtype=TYPE)

    return archived_size, zeros

In [19]:
# Rebuild the network with batch size 1 for token-by-token evaluation and
# restore the most recently saved training weights into it.
# Note: this rebinds `model`, replacing the training model defined earlier.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint returns None when the directory holds no checkpoint.
    raise FileNotFoundError('No checkpoint found in %r' % (checkpoint_dir,))

model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [20]:
# Code the sample article and relate the result to two uncompressed sizes.
archived_size, zeros = huffman_archive_size(model, encoded_article)
print('Compressed length:', archived_size)

# NOTE(review): the encoded baseline counts 8 bits per token although token
# ids are stored as 16-bit values — confirm this is the intended reference.
encoded_bits = len(encoded_article) * 8
print('Compression ratio for encoded:', archived_size / encoded_bits)

raw_bits = len(article) * 8
raw_ratio = archived_size / raw_bits
print('Compression ratio for raw:', raw_ratio)

# Scale the raw ratio by the binary entropy of the share of zero bits in the
# emitted codes.
k = (zeros / archived_size)
bit_entropy = (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', raw_ratio * bit_entropy)

Compressed length: 34985
Compression ratio for encoded: 0.5318809292143031
Compression ratio for raw: 0.1712198034532712
Potential compression ratio with arithmetic coding: 0.171157297572381


Вече имаме сравнително бърза имплементация на Хъфман код. Можем да си позволим да видим как изглеждат резултатите за повече от една статия.

In [25]:
# Measure the compression of the first 2000 articles, printing the ratio of
# each one next to the running ratio over everything processed so far.
total_raw = 0
total_compressed = 0

evaluation_articles = itertools.islice(articles(), 2000)
for index, article in enumerate(evaluation_articles):
    # The NUL terminator is part of what gets encoded, so it counts as raw data too.
    terminated_article = article + b'\0'
    raw = len(terminated_article) * 8
    encoded_article = np.array(subword_text_encoder.encode(terminated_article), dtype=TYPE)
    compressed = huffman_archive_size(model, encoded_article)[0]

    total_raw += raw
    total_compressed += compressed

    article_ratio = compressed / raw
    running_ratio = total_compressed / total_raw
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, article_ratio, running_ratio))

Article 0:	Compression: 23.722222	Avg Compression: 23.722222
Article 1:	Compression: 10.196429	Avg Compression: 14.254167
Article 2:	Compression: 8.719388	Avg Compression: 11.766055
Article 3:	Compression: 14.320833	Avg Compression: 12.317446
Article 4:	Compression: 12.214286	Avg Compression: 12.296695
Article 5:	Compression: 10.948718	Avg Compression: 12.049883
Article 6:	Compression: 0.158998	Avg Compression: 0.202087
Article 7:	Compression: 11.543919	Avg Compression: 0.209222
Article 8:	Compression: 10.728125	Avg Compression: 0.216371
Article 9:	Compression: 10.181548	Avg Compression: 0.223477
Article 10:	Compression: 11.286184	Avg Compression: 0.230610
Article 11:	Compression: 9.710227	Avg Compression: 0.237682
Article 12:	Compression: 9.727273	Avg Compression: 0.244756
Article 13:	Compression: 11.223684	Avg Compression: 0.251820
Article 14:	Compression: 9.095745	Avg Compression: 0.258852
Article 15:	Compression: 17.781250	Avg Compression: 0.265964
Article 16:	Compression: 12.23571

Article 136:	Compression: 9.304348	Avg Compression: 0.283898
Article 137:	Compression: 11.920139	Avg Compression: 0.284917
Article 138:	Compression: 6.925403	Avg Compression: 0.285919
Article 139:	Compression: 0.163121	Avg Compression: 0.278942
Article 140:	Compression: 0.160984	Avg Compression: 0.271156
Article 141:	Compression: 0.282908	Avg Compression: 0.271245
Article 142:	Compression: 0.176447	Avg Compression: 0.269639
Article 143:	Compression: 0.188564	Avg Compression: 0.268880
Article 144:	Compression: 10.942308	Avg Compression: 0.269742
Article 145:	Compression: 0.150426	Avg Compression: 0.265487
Article 146:	Compression: 0.181304	Avg Compression: 0.264103
Article 147:	Compression: 0.183549	Avg Compression: 0.261942
Article 148:	Compression: 0.155690	Avg Compression: 0.255762
Article 149:	Compression: 0.153408	Avg Compression: 0.249304
Article 150:	Compression: 12.217857	Avg Compression: 0.250011
Article 151:	Compression: 0.191597	Avg Compression: 0.249324
Article 152:	Compress

Article 270:	Compression: 0.196649	Avg Compression: 0.207944
Article 271:	Compression: 0.224436	Avg Compression: 0.207987
Article 272:	Compression: 0.224284	Avg Compression: 0.208036
Article 273:	Compression: 0.438238	Avg Compression: 0.208195
Article 274:	Compression: 0.346210	Avg Compression: 0.208373
Article 275:	Compression: 0.452256	Avg Compression: 0.208549
Article 276:	Compression: 0.306796	Avg Compression: 0.208665
Article 277:	Compression: 0.166321	Avg Compression: 0.208420
Article 278:	Compression: 0.190454	Avg Compression: 0.208316
Article 279:	Compression: 0.197697	Avg Compression: 0.208249
Article 280:	Compression: 17.739583	Avg Compression: 0.208490
Article 281:	Compression: 0.154257	Avg Compression: 0.207360
Article 282:	Compression: 0.247833	Avg Compression: 0.207450
Article 283:	Compression: 0.181123	Avg Compression: 0.207205
Article 284:	Compression: 0.182806	Avg Compression: 0.206889
Article 285:	Compression: 9.323370	Avg Compression: 0.207117
Article 286:	Compressio

Article 404:	Compression: 0.145860	Avg Compression: 0.197700
Article 405:	Compression: 0.176913	Avg Compression: 0.197479
Article 406:	Compression: 0.208535	Avg Compression: 0.197512
Article 407:	Compression: 0.205052	Avg Compression: 0.197529
Article 408:	Compression: 0.183880	Avg Compression: 0.197497
Article 409:	Compression: 0.213114	Avg Compression: 0.197507
Article 410:	Compression: 0.191618	Avg Compression: 0.197494
Article 411:	Compression: 15.828704	Avg Compression: 0.197625
Article 412:	Compression: 0.177438	Avg Compression: 0.197532
Article 413:	Compression: 14.245833	Avg Compression: 0.197663
Article 414:	Compression: 0.150652	Avg Compression: 0.197150
Article 415:	Compression: 0.729013	Avg Compression: 0.197283
Article 416:	Compression: 0.183345	Avg Compression: 0.197202
Article 417:	Compression: 0.233138	Avg Compression: 0.197252
Article 418:	Compression: 0.204460	Avg Compression: 0.197271
Article 419:	Compression: 0.144336	Avg Compression: 0.196046
Article 420:	Compressi

Article 538:	Compression: 0.176134	Avg Compression: 0.195839
Article 539:	Compression: 0.146086	Avg Compression: 0.195796
Article 540:	Compression: 0.139604	Avg Compression: 0.195610
Article 541:	Compression: 0.185198	Avg Compression: 0.195562
Article 542:	Compression: 0.489031	Avg Compression: 0.195595
Article 543:	Compression: 0.190241	Avg Compression: 0.195571
Article 544:	Compression: 0.146032	Avg Compression: 0.194679
Article 545:	Compression: 12.621324	Avg Compression: 0.194775
Article 546:	Compression: 0.138179	Avg Compression: 0.194609
Article 547:	Compression: 0.216015	Avg Compression: 0.194633
Article 548:	Compression: 0.157731	Avg Compression: 0.194510
Article 549:	Compression: 0.166232	Avg Compression: 0.194395
Article 550:	Compression: 0.247064	Avg Compression: 0.194439
Article 551:	Compression: 0.176787	Avg Compression: 0.194395
Article 552:	Compression: 0.186696	Avg Compression: 0.194378
Article 553:	Compression: 23.722222	Avg Compression: 0.194472
Article 554:	Compressi

Article 672:	Compression: 0.188512	Avg Compression: 0.197697
Article 673:	Compression: 0.178632	Avg Compression: 0.197598
Article 674:	Compression: 0.169492	Avg Compression: 0.197480
Article 675:	Compression: 15.819444	Avg Compression: 0.197562
Article 676:	Compression: 0.224370	Avg Compression: 0.197588
Article 677:	Compression: 0.197115	Avg Compression: 0.197586
Article 678:	Compression: 0.198671	Avg Compression: 0.197589
Article 679:	Compression: 19.426136	Avg Compression: 0.197671
Article 680:	Compression: 15.833333	Avg Compression: 0.197752
Article 681:	Compression: 0.326557	Avg Compression: 0.197812
Article 682:	Compression: 0.328859	Avg Compression: 0.197870
Article 683:	Compression: 0.315473	Avg Compression: 0.197919
Article 684:	Compression: 8.966146	Avg Compression: 0.198000
Article 685:	Compression: 0.238229	Avg Compression: 0.198020
Article 686:	Compression: 0.269618	Avg Compression: 0.198067
Article 687:	Compression: 22.453947	Avg Compression: 0.198149
Article 688:	Compres

Article 806:	Compression: 0.132715	Avg Compression: 0.197436
Article 807:	Compression: 0.187476	Avg Compression: 0.197423
Article 808:	Compression: 0.175645	Avg Compression: 0.197404
Article 809:	Compression: 7.820455	Avg Compression: 0.197472
Article 810:	Compression: 0.185345	Avg Compression: 0.197458
Article 811:	Compression: 7.914352	Avg Compression: 0.197526
Article 812:	Compression: 7.914352	Avg Compression: 0.197594
Article 813:	Compression: 7.914352	Avg Compression: 0.197662
Article 814:	Compression: 16.427885	Avg Compression: 0.197731
Article 815:	Compression: 0.444510	Avg Compression: 0.197791
Article 816:	Compression: 0.201631	Avg Compression: 0.197793
Article 817:	Compression: 12.920455	Avg Compression: 0.197862
Article 818:	Compression: 0.217406	Avg Compression: 0.197879
Article 819:	Compression: 0.183642	Avg Compression: 0.197856
Article 820:	Compression: 0.183660	Avg Compression: 0.197809
Article 821:	Compression: 0.175027	Avg Compression: 0.197779
Article 822:	Compressi

Article 940:	Compression: 0.252243	Avg Compression: 0.197031
Article 941:	Compression: 0.816638	Avg Compression: 0.197083
Article 942:	Compression: 0.942308	Avg Compression: 0.197134
Article 943:	Compression: 1.235354	Avg Compression: 0.197188
Article 944:	Compression: 0.263418	Avg Compression: 0.197221
Article 945:	Compression: 0.201868	Avg Compression: 0.197225
Article 946:	Compression: 8.525000	Avg Compression: 0.197284
Article 947:	Compression: 12.518382	Avg Compression: 0.197343
Article 948:	Compression: 12.153571	Avg Compression: 0.197402
Article 949:	Compression: 0.239889	Avg Compression: 0.197425
Article 950:	Compression: 0.181783	Avg Compression: 0.197404
Article 951:	Compression: 11.641892	Avg Compression: 0.197464
Article 952:	Compression: 9.172872	Avg Compression: 0.197523
Article 953:	Compression: 15.218750	Avg Compression: 0.197583
Article 954:	Compression: 0.124150	Avg Compression: 0.197443
Article 955:	Compression: 0.180143	Avg Compression: 0.197414
Article 956:	Compres

Article 1074:	Compression: 0.195435	Avg Compression: 0.197879
Article 1075:	Compression: 0.179723	Avg Compression: 0.197844
Article 1076:	Compression: 12.639706	Avg Compression: 0.197897
Article 1077:	Compression: 0.369913	Avg Compression: 0.197942
Article 1078:	Compression: 0.135202	Avg Compression: 0.197678
Article 1079:	Compression: 0.462725	Avg Compression: 0.197715
Article 1080:	Compression: 0.344888	Avg Compression: 0.197756
Article 1081:	Compression: 0.318846	Avg Compression: 0.197793
Article 1082:	Compression: 0.152474	Avg Compression: 0.197722
Article 1083:	Compression: 0.193099	Avg Compression: 0.197719
Article 1084:	Compression: 0.167522	Avg Compression: 0.197693
Article 1085:	Compression: 0.176443	Avg Compression: 0.197675
Article 1086:	Compression: 0.174624	Avg Compression: 0.197653
Article 1087:	Compression: 0.198285	Avg Compression: 0.197653
Article 1088:	Compression: 0.293802	Avg Compression: 0.197661
Article 1089:	Compression: 0.429118	Avg Compression: 0.197696
Article

Article 1206:	Compression: 0.215468	Avg Compression: 0.199559
Article 1207:	Compression: 0.163242	Avg Compression: 0.199531
Article 1208:	Compression: 0.154220	Avg Compression: 0.199402
Article 1209:	Compression: 0.137692	Avg Compression: 0.199305
Article 1210:	Compression: 0.131290	Avg Compression: 0.199215
Article 1211:	Compression: 0.235802	Avg Compression: 0.199233
Article 1212:	Compression: 15.847222	Avg Compression: 0.199282
Article 1213:	Compression: 0.373860	Avg Compression: 0.199318
Article 1214:	Compression: 0.197632	Avg Compression: 0.199317
Article 1215:	Compression: 19.551136	Avg Compression: 0.199367
Article 1216:	Compression: 0.388829	Avg Compression: 0.199408
Article 1217:	Compression: 12.950758	Avg Compression: 0.199457
Article 1218:	Compression: 0.108453	Avg Compression: 0.199179
Article 1219:	Compression: 0.149447	Avg Compression: 0.198803
Article 1220:	Compression: 0.412720	Avg Compression: 0.198839
Article 1221:	Compression: 11.547297	Avg Compression: 0.198888
Arti

Article 1338:	Compression: 0.147172	Avg Compression: 0.197640
Article 1339:	Compression: 0.157141	Avg Compression: 0.197507
Article 1340:	Compression: 14.741379	Avg Compression: 0.197551
Article 1341:	Compression: 0.396873	Avg Compression: 0.197580
Article 1342:	Compression: 0.134479	Avg Compression: 0.197349
Article 1343:	Compression: 0.162640	Avg Compression: 0.197311
Article 1344:	Compression: 0.156399	Avg Compression: 0.197272
Article 1345:	Compression: 0.146156	Avg Compression: 0.197185
Article 1346:	Compression: 0.144110	Avg Compression: 0.197145
Article 1347:	Compression: 0.143615	Avg Compression: 0.197064
Article 1348:	Compression: 0.144146	Avg Compression: 0.197004
Article 1349:	Compression: 0.136224	Avg Compression: 0.196836
Article 1350:	Compression: 0.128749	Avg Compression: 0.196770
Article 1351:	Compression: 0.139989	Avg Compression: 0.196699
Article 1352:	Compression: 18.505435	Avg Compression: 0.196742
Article 1353:	Compression: 0.129159	Avg Compression: 0.196494
Articl

Article 1470:	Compression: 0.154947	Avg Compression: 0.195244
Article 1471:	Compression: 0.165948	Avg Compression: 0.195207
Article 1472:	Compression: 0.743972	Avg Compression: 0.195243
Article 1473:	Compression: 0.267434	Avg Compression: 0.195259
Article 1474:	Compression: 0.172902	Avg Compression: 0.195235
Article 1475:	Compression: 0.173472	Avg Compression: 0.195202
Article 1476:	Compression: 17.781250	Avg Compression: 0.195241
Article 1477:	Compression: 0.155758	Avg Compression: 0.195126
Article 1478:	Compression: 0.187530	Avg Compression: 0.195120
Article 1479:	Compression: 0.179022	Avg Compression: 0.195112
Article 1480:	Compression: 7.822727	Avg Compression: 0.195150
Article 1481:	Compression: 0.143649	Avg Compression: 0.195061
Article 1482:	Compression: 0.158970	Avg Compression: 0.194994
Article 1483:	Compression: 0.154178	Avg Compression: 0.194836
Article 1484:	Compression: 0.231511	Avg Compression: 0.194847
Article 1485:	Compression: 0.134094	Avg Compression: 0.194715
Article

Article 1602:	Compression: 0.171989	Avg Compression: 0.193342
Article 1603:	Compression: 0.200591	Avg Compression: 0.193345
Article 1604:	Compression: 0.304043	Avg Compression: 0.193366
Article 1605:	Compression: 0.197060	Avg Compression: 0.193367
Article 1606:	Compression: 13.053030	Avg Compression: 0.193403
Article 1607:	Compression: 0.160733	Avg Compression: 0.193364
Article 1608:	Compression: 21.312500	Avg Compression: 0.193400
Article 1609:	Compression: 0.172342	Avg Compression: 0.193382
Article 1610:	Compression: 13.015152	Avg Compression: 0.193418
Article 1611:	Compression: 17.120000	Avg Compression: 0.193453
Article 1612:	Compression: 0.521857	Avg Compression: 0.193483
Article 1613:	Compression: 0.157102	Avg Compression: 0.193435
Article 1614:	Compression: 13.394531	Avg Compression: 0.193471
Article 1615:	Compression: 0.121289	Avg Compression: 0.193176
Article 1616:	Compression: 0.180249	Avg Compression: 0.193173
Article 1617:	Compression: 0.235471	Avg Compression: 0.193186
Art

Article 1736:	Compression: 12.954545	Avg Compression: 0.192284
Article 1737:	Compression: 0.200007	Avg Compression: 0.192288
Article 1738:	Compression: 0.334523	Avg Compression: 0.192306
Article 1739:	Compression: 0.193545	Avg Compression: 0.192307
Article 1740:	Compression: 16.456731	Avg Compression: 0.192340
Article 1741:	Compression: 8.767857	Avg Compression: 0.192372
Article 1742:	Compression: 0.777460	Avg Compression: 0.192398
Article 1743:	Compression: 0.217253	Avg Compression: 0.192403
Article 1744:	Compression: 0.171646	Avg Compression: 0.192389
Article 1745:	Compression: 0.288086	Avg Compression: 0.192408
Article 1746:	Compression: 0.473104	Avg Compression: 0.192431
Article 1747:	Compression: 0.673593	Avg Compression: 0.192460
Article 1748:	Compression: 0.416317	Avg Compression: 0.192489
Article 1749:	Compression: 0.222399	Avg Compression: 0.192493
Article 1750:	Compression: 0.476643	Avg Compression: 0.192523
Article 1751:	Compression: 20.363095	Avg Compression: 0.192555
Artic

Article 1868:	Compression: 0.984600	Avg Compression: 0.193357
Article 1869:	Compression: 19.477273	Avg Compression: 0.193388
Article 1870:	Compression: 0.263189	Avg Compression: 0.193401
Article 1871:	Compression: 0.544388	Avg Compression: 0.193408
Article 1872:	Compression: 0.309439	Avg Compression: 0.193429
Article 1873:	Compression: 0.469479	Avg Compression: 0.193444
Article 1874:	Compression: 15.819444	Avg Compression: 0.193475
Article 1875:	Compression: 0.207149	Avg Compression: 0.193480
Article 1876:	Compression: 22.453947	Avg Compression: 0.193511
Article 1877:	Compression: 0.171238	Avg Compression: 0.193499
Article 1878:	Compression: 0.236545	Avg Compression: 0.193511
Article 1879:	Compression: 0.165124	Avg Compression: 0.193496
Article 1880:	Compression: 0.171967	Avg Compression: 0.193464
Article 1881:	Compression: 14.200000	Avg Compression: 0.193495
Article 1882:	Compression: 14.685345	Avg Compression: 0.193526
Article 1883:	Compression: 0.232997	Avg Compression: 0.193542
Art

Това е неприятен резултат. Някои статии въобще не се компресират добре. Огромната част от тях дори стават десетки пъти по-големи.

След проверка на поредния номер в данните се вижда, че лошите резултати идват от най-кратките статии. Изглежда RNN мрежата се държи зле, преди да е събрала достатъчно вътрешен state. При нормално обучение няма причина това да се случва.

На ум идва една възможна причина – подравняването на статиите. TODO: да обясня проблема с подравняването вдясно/вляво.