In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
def articles():
    with open('page_revisions_text', 'rb') as text_file:
        pending_article_data = b''
        while True:
            data = text_file.read(1024 * 1024)
            if len(data) == 0:
                break

            articles = data.split(b'\0')
            articles[0] = pending_article_data + articles[0]
            for index, article in enumerate(articles):
                if index + 1 == len(articles):
                    pending_article_data = article
                else:
                    yield article

        print(pending_article_data)
        if len(pending_article_data) != 0:
            yield pending_article_data

In [3]:
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

Да видим как би изглеждало обучение с кодираните статии...

In [4]:
import os

import numpy as np
import itertools

In [5]:
BATCH_SIZE = 128
BATCHED_ITEM_LENGTH = 256
BUFFER_SIZE = 256
TYPE=np.uint16

def articles_generator():
    for index, article in enumerate(itertools.islice(articles(), 0, 2000)):
        yield np.array(subword_text_encoder.encode(article + b'\0')[::-1], dtype=TYPE)

    # Pad the article count to the batch size
    # We do this to ensure that no data is dropped
    index += 1
    while index % BATCH_SIZE != 0:
        yield np.array([0], dtype=TYPE)
        index += 1

def subbatches():
    dataset = tf.data.Dataset.from_generator(articles_generator, output_types=TYPE)
    dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.padded_batch(BATCH_SIZE, padded_shapes=([None]), drop_remainder=True)

    for batch in dataset.as_numpy_iterator():
        remaining = batch[::-1]
        while remaining.shape[1] > 1:
            yield remaining[:, :BATCHED_ITEM_LENGTH + 1]
            remaining = remaining[:, BATCHED_ITEM_LENGTH:]

dataset = tf.data.Dataset.from_generator(subbatches, output_types=TYPE, output_shapes=(BATCH_SIZE, None))
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

dataset

<MapDataset shapes: ((128, None), (128, None)), types: (tf.uint16, tf.uint16)>

In [6]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

In [7]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    return tf.shape(true_labels)[1]

model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=BATCH_SIZE)
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])

In [8]:
checkpoint_dir = './training_checkpoints' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [9]:
class ModelStateResetter(tf.keras.callbacks.Callback):
    def __init__(self):
        self.last_total_length = 0

    def on_batch_end(self, batch, logs={}):
        average_batch_length = logs.get('average_batch_length', 0)
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length
        
        if current_batch_length < BATCHED_ITEM_LENGTH:
            self.model.reset_states()
        
model_state_resetter_callback = ModelStateResetter()

In [10]:
total_epochs = 35

for epoch in range(total_epochs):
    print('Epoch %d/%d' % (epoch + 1, total_epochs))
    model.fit(dataset, callbacks=[checkpoint_callback, model_state_resetter_callback])

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


За сега не се вижда въществена разлика когато подравняваме текстовете в batch-а в дясно и в ляво.

In [11]:
with open('page_revisions_text', 'rb') as text_file:
    data = text_file.read()

article = data.split(b'\0')[120]
del data

encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)

print('Raw:', len(article))
print('Encoded:', len(encoded_article))

Raw: 25541
Encoded: 8222


In [12]:
import ctypes

class Huffman:
    huffman = ctypes.CDLL('x64/Release/huffman')
    
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p
    
    def __init__(self, category_count):
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        self.huffman.destroy_tree(self.tree)
        
    def load_weights(self, weights):
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
    
    def get_code_length(self, category):
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        return self.huffman.get_code_zero_count(self.tree, category)

In [13]:
def huffman_archive_size(model, text):
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    text_generated = []

    model.reset_states()

    for index, byte in enumerate(text):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(byte.item())
        archived_size += huffman_tree.get_code_length(byte.item())

        input_eval = tf.expand_dims([byte], 0)
  
    return archived_size, zeros

In [14]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=512, rnn_units=1024, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [15]:
archived_size, zeros = huffman_archive_size(model, encoded_article)
print('Compressed length:', archived_size)

compression_ratio = archived_size / (len(encoded_article) * 8)
print('Compression ratio for encoded:', compression_ratio)

compression_ratio = archived_size / (len(article) * 8)
print('Compression ratio for raw:', compression_ratio)

k = (zeros / archived_size)
compression_ratio = compression_ratio * (-k * np.log2(k) - (1-k) * np.log2(1-k))
print('Potential compression ratio with arithmetic coding:', compression_ratio)

Compressed length: 170843
Compression ratio for encoded: 2.5973455363658475
Compression ratio for raw: 0.8361213343252026
Potential compression ratio with arithmetic coding: 0.836075518621219


Доста любопитно. Нека видим какъв е резултатът за първите 2000 статии.

In [16]:
total_raw = 0
total_compressed = 0

for index, article in enumerate(itertools.islice(articles(), 0, 2000)):
    raw = (len(article) + 1) * 8
    encoded_article = np.array(subword_text_encoder.encode(article + b'\0'), dtype=TYPE)
    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw += raw
    total_compressed += compressed
    print('Article %d:\tCompression: %f\tAvg Compression: %f' % (index, compressed/raw, total_compressed/total_raw))

Article 0:	Compression: 1.013889	Avg Compression: 1.013889
Article 1:	Compression: 1.130952	Avg Compression: 1.095833
Article 2:	Compression: 1.130102	Avg Compression: 1.111239
Article 3:	Compression: 1.050000	Avg Compression: 1.098022
Article 4:	Compression: 1.014286	Avg Compression: 1.081178
Article 5:	Compression: 0.660256	Avg Compression: 1.004108
Article 6:	Compression: 0.879414	Avg Compression: 0.879866
Article 7:	Compression: 0.837838	Avg Compression: 0.879839
Article 8:	Compression: 0.940625	Avg Compression: 0.879881
Article 9:	Compression: 0.833333	Avg Compression: 0.879847
Article 10:	Compression: 1.085526	Avg Compression: 0.879980
Article 11:	Compression: 0.872159	Avg Compression: 0.879974
Article 12:	Compression: 0.897727	Avg Compression: 0.879987
Article 13:	Compression: 1.036184	Avg Compression: 0.880088
Article 14:	Compression: 0.968085	Avg Compression: 0.880158
Article 15:	Compression: 1.145833	Avg Compression: 0.880266
Article 16:	Compression: 0.932143	Avg Compression:

Article 139:	Compression: 0.927141	Avg Compression: 0.893333
Article 140:	Compression: 0.907405	Avg Compression: 0.894262
Article 141:	Compression: 0.887981	Avg Compression: 0.894214
Article 142:	Compression: 0.987535	Avg Compression: 0.895795
Article 143:	Compression: 0.879258	Avg Compression: 0.895640
Article 144:	Compression: 1.057692	Avg Compression: 0.895653
Article 145:	Compression: 0.950409	Avg Compression: 0.897606
Article 146:	Compression: 0.883257	Avg Compression: 0.897370
Article 147:	Compression: 0.903036	Avg Compression: 0.897522
Article 148:	Compression: 0.832202	Avg Compression: 0.893723
Article 149:	Compression: 0.824668	Avg Compression: 0.889366
Article 150:	Compression: 0.996429	Avg Compression: 0.889372
Article 151:	Compression: 0.819949	Avg Compression: 0.888557
Article 152:	Compression: 0.869929	Avg Compression: 0.886852
Article 153:	Compression: 0.823015	Avg Compression: 0.885389
Article 154:	Compression: 1.038043	Avg Compression: 0.885394
Article 155:	Compression

Article 274:	Compression: 0.975315	Avg Compression: 0.892458
Article 275:	Compression: 0.893634	Avg Compression: 0.892459
Article 276:	Compression: 0.781870	Avg Compression: 0.892329
Article 277:	Compression: 0.893887	Avg Compression: 0.892338
Article 278:	Compression: 0.966756	Avg Compression: 0.892767
Article 279:	Compression: 0.896379	Avg Compression: 0.892790
Article 280:	Compression: 0.895833	Avg Compression: 0.892790
Article 281:	Compression: 0.898364	Avg Compression: 0.892906
Article 282:	Compression: 0.944926	Avg Compression: 0.893021
Article 283:	Compression: 0.853467	Avg Compression: 0.892653
Article 284:	Compression: 0.986020	Avg Compression: 0.893866
Article 285:	Compression: 0.766304	Avg Compression: 0.893863
Article 286:	Compression: 0.800000	Avg Compression: 0.893861
Article 287:	Compression: 1.251083	Avg Compression: 0.894400
Article 288:	Compression: 0.889046	Avg Compression: 0.894395
Article 289:	Compression: 0.800000	Avg Compression: 0.894393
Article 290:	Compression

Article 409:	Compression: 0.899925	Avg Compression: 0.891945
Article 410:	Compression: 0.849856	Avg Compression: 0.891851
Article 411:	Compression: 0.935185	Avg Compression: 0.891851
Article 412:	Compression: 0.923562	Avg Compression: 0.891998
Article 413:	Compression: 1.054167	Avg Compression: 0.891999
Article 414:	Compression: 0.953726	Avg Compression: 0.892672
Article 415:	Compression: 1.012255	Avg Compression: 0.892702
Article 416:	Compression: 0.870019	Avg Compression: 0.892569
Article 417:	Compression: 0.906196	Avg Compression: 0.892588
Article 418:	Compression: 0.914407	Avg Compression: 0.892647
Article 419:	Compression: 0.905414	Avg Compression: 0.892943
Article 420:	Compression: 0.864673	Avg Compression: 0.892684
Article 421:	Compression: 0.845446	Avg Compression: 0.892516
Article 422:	Compression: 0.818790	Avg Compression: 0.892475
Article 423:	Compression: 0.932190	Avg Compression: 0.892484
Article 424:	Compression: 0.856160	Avg Compression: 0.892447
Article 425:	Compression

Article 544:	Compression: 0.858171	Avg Compression: 0.896503
Article 545:	Compression: 0.900735	Avg Compression: 0.896503
Article 546:	Compression: 0.970158	Avg Compression: 0.896718
Article 547:	Compression: 1.033227	Avg Compression: 0.896870
Article 548:	Compression: 0.891841	Avg Compression: 0.896854
Article 549:	Compression: 0.901462	Avg Compression: 0.896872
Article 550:	Compression: 0.873153	Avg Compression: 0.896852
Article 551:	Compression: 0.884939	Avg Compression: 0.896822
Article 552:	Compression: 0.846248	Avg Compression: 0.896711
Article 553:	Compression: 1.062500	Avg Compression: 0.896712
Article 554:	Compression: 0.917445	Avg Compression: 0.896752
Article 555:	Compression: 0.956493	Avg Compression: 0.896855
Article 556:	Compression: 1.015625	Avg Compression: 0.896856
Article 557:	Compression: 0.936401	Avg Compression: 0.896865
Article 558:	Compression: 0.822531	Avg Compression: 0.896751
Article 559:	Compression: 0.853823	Avg Compression: 0.896716
Article 560:	Compression

Article 681:	Compression: 0.941942	Avg Compression: 0.895816
Article 682:	Compression: 0.919362	Avg Compression: 0.895826
Article 683:	Compression: 0.785856	Avg Compression: 0.895781
Article 684:	Compression: 0.942708	Avg Compression: 0.895781
Article 685:	Compression: 0.817868	Avg Compression: 0.895743
Article 686:	Compression: 0.842328	Avg Compression: 0.895708
Article 687:	Compression: 0.934211	Avg Compression: 0.895708
Article 688:	Compression: 1.010000	Avg Compression: 0.895708
Article 689:	Compression: 0.909614	Avg Compression: 0.895740
Article 690:	Compression: 1.037291	Avg Compression: 0.895758
Article 691:	Compression: 0.895652	Avg Compression: 0.895758
Article 692:	Compression: 1.035714	Avg Compression: 0.895759
Article 693:	Compression: 0.894894	Avg Compression: 0.895758
Article 694:	Compression: 0.813747	Avg Compression: 0.895438
Article 695:	Compression: 0.826435	Avg Compression: 0.895399
Article 696:	Compression: 0.830009	Avg Compression: 0.894699
Article 697:	Compression

Article 816:	Compression: 0.895766	Avg Compression: 0.895049
Article 817:	Compression: 1.071970	Avg Compression: 0.895050
Article 818:	Compression: 0.913583	Avg Compression: 0.895066
Article 819:	Compression: 0.907420	Avg Compression: 0.895086
Article 820:	Compression: 0.893410	Avg Compression: 0.895081
Article 821:	Compression: 0.839576	Avg Compression: 0.895006
Article 822:	Compression: 1.221875	Avg Compression: 0.895009
Article 823:	Compression: 0.819592	Avg Compression: 0.894991
Article 824:	Compression: 1.136785	Avg Compression: 0.895026
Article 825:	Compression: 0.949306	Avg Compression: 0.895123
Article 826:	Compression: 1.241667	Avg Compression: 0.895125
Article 827:	Compression: 0.982833	Avg Compression: 0.895135
Article 828:	Compression: 0.876530	Avg Compression: 0.895123
Article 829:	Compression: 0.830473	Avg Compression: 0.894835
Article 830:	Compression: 0.878203	Avg Compression: 0.894832
Article 831:	Compression: 0.880238	Avg Compression: 0.894826
Article 832:	Compression

Article 953:	Compression: 1.089286	Avg Compression: 0.895630
Article 954:	Compression: 0.976170	Avg Compression: 0.895783
Article 955:	Compression: 0.963002	Avg Compression: 0.895894
Article 956:	Compression: 1.039062	Avg Compression: 0.895895
Article 957:	Compression: 0.876614	Avg Compression: 0.895872
Article 958:	Compression: 0.895876	Avg Compression: 0.895872
Article 959:	Compression: 0.890173	Avg Compression: 0.895870
Article 960:	Compression: 0.970091	Avg Compression: 0.895894
Article 961:	Compression: 1.016198	Avg Compression: 0.895913
Article 962:	Compression: 0.903091	Avg Compression: 0.895919
Article 963:	Compression: 0.854531	Avg Compression: 0.895830
Article 964:	Compression: 0.993056	Avg Compression: 0.895830
Article 965:	Compression: 1.028571	Avg Compression: 0.895831
Article 966:	Compression: 1.028409	Avg Compression: 0.895832
Article 967:	Compression: 1.031250	Avg Compression: 0.895832
Article 968:	Compression: 0.963291	Avg Compression: 0.895853
Article 969:	Compression

Article 1086:	Compression: 0.891000	Avg Compression: 0.896755
Article 1087:	Compression: 0.931913	Avg Compression: 0.896770
Article 1088:	Compression: 1.011777	Avg Compression: 0.896779
Article 1089:	Compression: 1.065811	Avg Compression: 0.896805
Article 1090:	Compression: 1.442857	Avg Compression: 0.896807
Article 1091:	Compression: 0.939394	Avg Compression: 0.896807
Article 1092:	Compression: 1.430147	Avg Compression: 0.896809
Article 1093:	Compression: 1.170455	Avg Compression: 0.896811
Article 1094:	Compression: 0.914868	Avg Compression: 0.896814
Article 1095:	Compression: 1.131944	Avg Compression: 0.896815
Article 1096:	Compression: 1.128378	Avg Compression: 0.896816
Article 1097:	Compression: 1.069079	Avg Compression: 0.896817
Article 1098:	Compression: 1.189394	Avg Compression: 0.896818
Article 1099:	Compression: 1.110714	Avg Compression: 0.896819
Article 1100:	Compression: 1.114583	Avg Compression: 0.896820
Article 1101:	Compression: 0.893742	Avg Compression: 0.896817
Article 

Article 1219:	Compression: 0.874754	Avg Compression: 0.897568
Article 1220:	Compression: 0.993572	Avg Compression: 0.897584
Article 1221:	Compression: 0.993243	Avg Compression: 0.897585
Article 1222:	Compression: 0.969828	Avg Compression: 0.897585
Article 1223:	Compression: 0.975864	Avg Compression: 0.897601
Article 1224:	Compression: 0.917683	Avg Compression: 0.897603
Article 1225:	Compression: 0.835897	Avg Compression: 0.897464
Article 1226:	Compression: 0.883983	Avg Compression: 0.897431
Article 1227:	Compression: 1.098485	Avg Compression: 0.897432
Article 1228:	Compression: 1.098485	Avg Compression: 0.897432
Article 1229:	Compression: 0.885237	Avg Compression: 0.897409
Article 1230:	Compression: 0.912809	Avg Compression: 0.897415
Article 1231:	Compression: 1.142045	Avg Compression: 0.897416
Article 1232:	Compression: 0.923323	Avg Compression: 0.897422
Article 1233:	Compression: 0.990972	Avg Compression: 0.897561
Article 1234:	Compression: 0.979546	Avg Compression: 0.897705
Article 

Article 1353:	Compression: 0.841571	Avg Compression: 0.897135
Article 1354:	Compression: 0.971816	Avg Compression: 0.897230
Article 1355:	Compression: 0.953929	Avg Compression: 0.897269
Article 1356:	Compression: 1.245968	Avg Compression: 0.897271
Article 1357:	Compression: 0.855136	Avg Compression: 0.897254
Article 1358:	Compression: 1.137500	Avg Compression: 0.897255
Article 1359:	Compression: 0.911621	Avg Compression: 0.897281
Article 1360:	Compression: 0.875403	Avg Compression: 0.897209
Article 1361:	Compression: 0.866771	Avg Compression: 0.897130
Article 1362:	Compression: 1.068750	Avg Compression: 0.897130
Article 1363:	Compression: 0.790000	Avg Compression: 0.897130
Article 1364:	Compression: 1.003600	Avg Compression: 0.897145
Article 1365:	Compression: 0.846184	Avg Compression: 0.897130
Article 1366:	Compression: 0.962413	Avg Compression: 0.897161
Article 1367:	Compression: 0.974928	Avg Compression: 0.897259
Article 1368:	Compression: 1.555556	Avg Compression: 0.897260
Article 

Article 1486:	Compression: 0.851218	Avg Compression: 0.895463
Article 1487:	Compression: 1.121622	Avg Compression: 0.895464
Article 1488:	Compression: 0.847721	Avg Compression: 0.895404
Article 1489:	Compression: 0.882073	Avg Compression: 0.895394
Article 1490:	Compression: 1.130682	Avg Compression: 0.895394
Article 1491:	Compression: 0.955000	Avg Compression: 0.895394
Article 1492:	Compression: 0.932034	Avg Compression: 0.895401
Article 1493:	Compression: 0.863840	Avg Compression: 0.895374
Article 1494:	Compression: 0.892585	Avg Compression: 0.895372
Article 1495:	Compression: 0.787037	Avg Compression: 0.895108
Article 1496:	Compression: 1.045455	Avg Compression: 0.895109
Article 1497:	Compression: 0.871559	Avg Compression: 0.895001
Article 1498:	Compression: 1.000000	Avg Compression: 0.895001
Article 1499:	Compression: 0.823683	Avg Compression: 0.894933
Article 1500:	Compression: 0.901746	Avg Compression: 0.894934
Article 1501:	Compression: 0.866028	Avg Compression: 0.894876
Article 

Article 1619:	Compression: 1.009839	Avg Compression: 0.896005
Article 1620:	Compression: 0.941240	Avg Compression: 0.896031
Article 1621:	Compression: 0.877074	Avg Compression: 0.895988
Article 1622:	Compression: 0.880741	Avg Compression: 0.895984
Article 1623:	Compression: 0.859993	Avg Compression: 0.895977
Article 1624:	Compression: 0.971570	Avg Compression: 0.896063
Article 1625:	Compression: 0.848609	Avg Compression: 0.896037
Article 1626:	Compression: 0.818299	Avg Compression: 0.895943
Article 1627:	Compression: 1.118452	Avg Compression: 0.895947
Article 1628:	Compression: 0.967593	Avg Compression: 0.895947
Article 1629:	Compression: 0.883318	Avg Compression: 0.895915
Article 1630:	Compression: 0.796674	Avg Compression: 0.895902
Article 1631:	Compression: 0.899497	Avg Compression: 0.895903
Article 1632:	Compression: 1.048077	Avg Compression: 0.895904
Article 1633:	Compression: 0.973158	Avg Compression: 0.895982
Article 1634:	Compression: 0.890091	Avg Compression: 0.895973
Article 

Article 1752:	Compression: 0.935276	Avg Compression: 0.895927
Article 1753:	Compression: 0.980957	Avg Compression: 0.896013
Article 1754:	Compression: 0.903597	Avg Compression: 0.896014
Article 1755:	Compression: 1.096154	Avg Compression: 0.896014
Article 1756:	Compression: 0.902362	Avg Compression: 0.896015
Article 1757:	Compression: 0.975962	Avg Compression: 0.896015
Article 1758:	Compression: 1.030828	Avg Compression: 0.896024
Article 1759:	Compression: 0.913043	Avg Compression: 0.896024
Article 1760:	Compression: 0.800653	Avg Compression: 0.895794
Article 1761:	Compression: 0.898338	Avg Compression: 0.895795
Article 1762:	Compression: 0.961957	Avg Compression: 0.895795
Article 1763:	Compression: 1.109375	Avg Compression: 0.895796
Article 1764:	Compression: 0.814188	Avg Compression: 0.895647
Article 1765:	Compression: 0.899117	Avg Compression: 0.895648
Article 1766:	Compression: 1.049235	Avg Compression: 0.895654
Article 1767:	Compression: 0.851837	Avg Compression: 0.895648
Article 

Article 1888:	Compression: 1.055556	Avg Compression: 0.895276
Article 1889:	Compression: 1.055556	Avg Compression: 0.895276
Article 1890:	Compression: 1.025000	Avg Compression: 0.895276
Article 1891:	Compression: 1.025000	Avg Compression: 0.895277
Article 1892:	Compression: 0.869953	Avg Compression: 0.895275
Article 1893:	Compression: 1.025000	Avg Compression: 0.895275
Article 1894:	Compression: 1.025000	Avg Compression: 0.895275
Article 1895:	Compression: 1.025000	Avg Compression: 0.895275
Article 1896:	Compression: 1.028571	Avg Compression: 0.895276
Article 1897:	Compression: 0.788690	Avg Compression: 0.895275
Article 1898:	Compression: 0.818091	Avg Compression: 0.895246
Article 1899:	Compression: 0.817389	Avg Compression: 0.895219
Article 1900:	Compression: 0.977830	Avg Compression: 0.895223
Article 1901:	Compression: 0.878740	Avg Compression: 0.895219
Article 1902:	Compression: 0.848395	Avg Compression: 0.895213
Article 1903:	Compression: 0.776937	Avg Compression: 0.894839
Article 

Драстична разлика спряко предишната терадка. Единствената разлика е подравняването. Изглежда наистина е проблем за обучението. Keras има механизъм за справяне с това - маскиращ слой. Ще го опитаме в следващата терадка...