In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Turn on memory growth for every GPU that TensorFlow lists.
physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    # The setting is applied per device.
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16  # dtype of the encoded-token arrays built below

# Subword tokenizer loaded from 'vocab_4096'; used for encoding, decoding and vocab_size.
subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    """Corpus of NUL-separated articles exposed as token arrays and tf.data pipelines.

    The file is read as bytes and split on NUL bytes; duplicate chunks are dropped.
    Articles are encoded lazily with the module-level ``subword_text_encoder`` and
    kept sorted by token count (shortest first), so articles that land in the same
    batch have similar lengths.
    """

    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding

    def __init__(self, path):
        """Read ``path`` and keep its unique NUL-separated chunks in ``self.articles``."""
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = set(data.split(b'\0'))
        self._encoded_articles = None  # filled on first access to encoded_articles

    @property
    def encoded_articles(self):
        """List of token arrays (dtype ``TYPE``) sorted by length; built on first access."""
        if self._encoded_articles is None:
            # Set iteration order differs between interpreter runs. Encoding in sorted
            # order, followed by the stable sort by length, makes article indices
            # reproducible across kernel restarts.
            articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in sorted(self.articles)]
            self._encoded_articles = sorted(articles, key=len)

        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        """Yield encoded articles ``start:end``, preceded by empty padding articles.

        ``EMPTY_ARTICLE`` is yielded first as many times as needed for the number of
        yielded items to be a multiple of ``batch_size`` (for a range that lies
        inside the corpus). A falsy ``end`` (``None`` or 0) means "up to the last
        article".
        """
        end = end or len(self.encoded_articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        yield from itertools.islice(self.encoded_articles, start, end)

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        """Yield 2-D numpy sub-batches that cut each padded article batch along time.

        Articles are grouped ``batch_size`` at a time and zero-padded to the longest
        one. Whole batches are shuffled (buffer of 100), then each batch is emitted as
        consecutive slices of at most ``batch_length + 1`` columns. Neighbouring
        slices overlap by one column, so the last token of one slice is the first
        input of the next.
        """
        end = end or len(self.encoded_articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            yield remaining
            if remaining.shape[1] == batch_length + 1:
                # The final slice is full-width, so emit one extra short all-zero
                # slice. ModelStateResetter resets the RNN state on any slice shorter
                # than the configured length, so every batch must end with one.
                yield np.zeros((batch_size, 2), dtype=TYPE)

    def steps(self, batch_size, batch_length, start = 0, end = None):
        """Estimate how many sub-batches ``subbatch_generator`` yields for these arguments.

        For every batch, the last (longest, since articles are sorted by length)
        article contributes ``ceil(len / batch_length)`` steps.

        ``start`` and ``end`` select the article range, as in ``articles_generator``.
        The previous implementation passed ``batch_length`` in the ``start`` position
        and so skipped the first ``batch_length`` articles.
        """
        articles = self.articles_generator(batch_size, start, end)
        return sum(math.ceil(len(article) / batch_length) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)

    def dataset(self, batch_size, batch_length, start = 0, end = None):
        """Return a tf.data.Dataset of ``(inputs, targets)`` pairs for training.

        Each sub-batch from ``subbatch_generator`` is split into ``batch[:, :-1]``
        (inputs) and ``batch[:, 1:]`` (targets, i.e. the inputs shifted by one token).
        """
        end = end or len(self.encoded_articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
# Load the corpus; tokenization is deferred until encoded_articles is first accessed.
articles = Articles('page_revisions_text')

In [4]:
BATCH_SIZE = 2048  # articles per batch; also the fixed batch dimension of the stateful training model
BATCHED_ITEM_LENGTH = 32  # number of time steps in a full training sub-batch

def build_model(vocab_size, embedding_dim, rnn_units, batch_size=None):
    """Build the stateful two-layer GRU language model used for training.

    Args:
        vocab_size: number of token ids; sizes the embedding table and the output logits.
        embedding_dim: width of the token embedding.
        rnn_units: number of units in each of the two GRU layers.
        batch_size: fixed batch dimension required by the stateful layers.
            Defaults to the module-level ``BATCH_SIZE``, read at call time.

    Returns:
        An uncompiled ``tf.keras.Sequential`` mapping ``(batch_size, time)`` token ids
        to ``(batch_size, time, vocab_size)`` logits.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    return tf.keras.Sequential([
        # NOTE(review): the Embedding below is created without mask_zero=True, so it is
        # unclear whether this Masking layer has any effect on padded (0) steps - confirm.
        # The layer is kept so the layer stack stays the same as in saved checkpoints.
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

checkpoint_dir = './training_checkpoints-4' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

# Saves only the model weights, under a file name built from the epoch number.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    """Reset the model's recurrent state after the last sub-batch of an article batch.

    The callback does not see the batch itself, so it derives the length of the batch
    that just finished from the running mean of the ``average_batch_length`` metric:
    ``mean * batches_seen`` is the total length so far, and the difference from the
    previous total is the current batch's length. A batch shorter than the configured
    length is treated as the end of a group of articles.

    Args:
        batch_length: length of a full sub-batch. Defaults to the module-level
            ``BATCHED_ITEM_LENGTH``, read each time a batch ends.
    """

    def __init__(self, batch_length=None):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.batch_length = batch_length
        self.last_total_length = 0

    def on_epoch_begin(self, epoch, logs=None):
        # The metric's running mean and the batch index restart with every epoch.
        self.last_total_length = 0

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        average_batch_length = logs.get('average_batch_length', 0)
        # `batch` is zero-based, so batch + 1 batches contributed to the mean.
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length

        full_length = BATCHED_ITEM_LENGTH if self.batch_length is None else self.batch_length
        if current_batch_length < full_length:
            self.model.reset_states()

model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and unnormalised logits."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    """Metric that reports the size of axis 1 of the labels; ``predictions`` is ignored."""
    label_shape = tf.shape(true_labels)
    return label_shape[1]

In [7]:
# Build the training model from scratch and compile it with the custom loss and length metric.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
# Uncomment to resume from the most recent checkpoint instead of training from scratch:
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (2048, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (2048, None, 64)          259072    
_________________________________________________________________
gru_2 (GRU)                  (2048, None, 768)         1921536   
_________________________________________________________________
gru_3 (GRU)                  (2048, None, 768)         3543552   
_________________________________________________________________
dense_1 (Dense)              (2048, None, 4048)        3112912   
Total params: 8,837,072
Trainable params: 8,837,072
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for 5 epochs; the callbacks save weights and reset the RNN state between article batches.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=5, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x12c45dfd7c8>

In [None]:
# Another 5-epoch training run on the current `model` (the recorded output stops during epoch 2).
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=5, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
  626/14061 [>.............................] - ETA: 2:44:04 - loss: 2.4925 - average_batch_length: 31.8291

In [6]:
# Continue training the current `model` for 4 epochs with the same callbacks.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=4, callbacks=callbacks)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x272c2ee5908>

In [None]:
# Continue training the current `model` for 2 epochs with the same callbacks.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2

In [6]:
# A further 2-epoch training run; this one completed and returned a History object.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2419ecad4c8>

Дотук добре. Числото изглежда много подобно на това, което сме виждали в предишни тетрадки с различен код, но при идентични параметри.

Да видим до каква компресия ще доведе този `loss`.

In [8]:
def build_predicting_model(vocab_size, embedding_dim, rnn_units):
    """Build the stateful GRU model for token-by-token prediction.

    The layer stack is Masking, Embedding, two GRU layers and a Dense layer, with the
    input fixed to one sequence of one token (``batch_input_shape=[1, 1]``).
    """
    layers = tf.keras.layers

    stack = [
        layers.Masking(mask_value=0, batch_input_shape=[1, 1]),
        layers.Embedding(vocab_size, embedding_dim),
    ]
    # Two identical recurrent layers that keep their state between calls.
    stack.extend(layers.GRU(rnn_units, stateful=True, return_sequences=True) for _ in range(2))
    stack.append(layers.Dense(vocab_size))

    return tf.keras.Sequential(stack)

# Rebind `model` to the one-token-at-a-time network and restore the newest checkpoint
# found in checkpoint_dir. The training model is no longer reachable through `model`.
model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [9]:
import ctypes

class Huffman:
    """Thin ctypes wrapper around a tree object owned by the native ``huffman`` library.

    The shared library is loaded once, when the class is defined. Each instance owns
    one native tree, created in ``__init__`` and released in ``__del__``.
    """

    huffman = ctypes.CDLL('x64/Release/huffman')

    # Declare the return types that differ from ctypes' default of C int.
    huffman.create_tree.restype = ctypes.c_void_p
    huffman.destroy_tree.restype = None
    huffman.load_weights.restype = None
    huffman.create_code_string.restype = ctypes.c_char_p

    def __init__(self, category_count):
        """Create a native tree for ``category_count`` categories."""
        # Defined before the native call so __del__ stays safe if create_tree raises.
        self.tree = None
        self.tree = ctypes.c_void_p(self.huffman.create_tree(category_count))

    def __del__(self):
        # Release the native tree at most once, and only if it was ever created.
        tree = getattr(self, 'tree', None)
        if tree is not None:
            self.tree = None
            self.huffman.destroy_tree(tree)

    def load_weights(self, weights):
        """Pass per-category weights to the native tree.

        ``weights`` is converted to a C-contiguous float32 array first (no copy if it
        already is one), because the native side receives a raw ``float*``.
        Presumably one weight per category is expected - confirm against the library.
        """
        weights = np.ascontiguousarray(weights, dtype=np.float32)
        self.huffman.load_weights(self.tree, weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))

    def get_code_length(self, category):
        """Return the native library's code length for ``category``."""
        return self.huffman.get_code_length(self.tree, category)

    def get_code_zero_count(self, category):
        """Return the native library's count of zeros in the code for ``category``."""
        return self.huffman.get_code_zero_count(self.tree, category)

In [10]:
def huffman_archive_size(model, text):
    """Sum the Huffman code lengths of ``text`` when each token is coded from the model's prediction.

    For every token, the model's softmax output for the next token is loaded as the
    Huffman weights. The code length and zero count of the actual token are added to
    the totals, and the token is then fed back as the next model input. The first
    input is token id 0.

    Args:
        model: stateful model accepting a ``(1, 1)`` batch of token ids; its state is
            reset before the first token.
        text: iterable of integer token ids (numpy scalars or plain ints).

    Returns:
        ``(archived_size, zeros)``: total code length and total number of zeros in
        the codes, as reported by the ``Huffman`` wrapper.
    """
    archived_size = 0
    zeros = 0
    input_eval = np.array([[0]], dtype=TYPE)
    huffman_tree = Huffman(subword_text_encoder.vocab_size)

    model.reset_states()

    for token in text:
        token = int(token)  # accept numpy scalars as well as plain ints

        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) # remove the batch dimension

        weights = tf.nn.softmax(predictions[0]).numpy()
        huffman_tree.load_weights(weights)
        zeros += huffman_tree.get_code_zero_count(token)
        archived_size += huffman_tree.get_code_length(token)

        # Same shape and dtype as the initial input, so every model call sees one input type.
        input_eval = np.array([[token]], dtype=TYPE)

    return archived_size, zeros

In [11]:
# Running totals over the sampled articles.
total_raw, total_compressed = 0, 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    # Only every 1000th article is measured.
    if index % 1000 != 0:
        continue

    article = subword_text_encoder.decode(encoded_article)
    raw = 8 * len(article)  # 8 bits per character of the decoded text
    if raw == 0:
        continue  # an empty article has nothing to compress

    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw = total_raw + raw
    total_compressed = total_compressed + compressed

    report = (index, raw, compressed/raw, total_compressed/total_raw)
    print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % report)

Article 1000:	Length: 136	Compression: 0.610294	Avg Compression: 0.610294
Article 2000:	Length: 152	Compression: 0.513158	Avg Compression: 0.559028
Article 3000:	Length: 160	Compression: 0.543750	Avg Compression: 0.553571
Article 4000:	Length: 216	Compression: 0.407407	Avg Compression: 0.506024
Article 5000:	Length: 160	Compression: 0.493750	Avg Compression: 0.503641
Article 6000:	Length: 192	Compression: 0.515625	Avg Compression: 0.505906
Article 7000:	Length: 184	Compression: 0.445652	Avg Compression: 0.496667
Article 8000:	Length: 216	Compression: 0.398148	Avg Compression: 0.481638
Article 9000:	Length: 152	Compression: 0.598684	Avg Compression: 0.492985
Article 10000:	Length: 184	Compression: 0.467391	Avg Compression: 0.490297
Article 11000:	Length: 200	Compression: 0.460000	Avg Compression: 0.487193
Article 12000:	Length: 176	Compression: 0.500000	Avg Compression: 0.488252
Article 13000:	Length: 224	Compression: 0.361607	Avg Compression: 0.476190
Article 14000:	Length: 256	Compres

Article 110000:	Length: 19760	Compression: 0.063765	Avg Compression: 0.180044
Article 111000:	Length: 20480	Compression: 0.064795	Avg Compression: 0.175246
Article 112000:	Length: 14104	Compression: 0.138968	Avg Compression: 0.174234
Article 113000:	Length: 20296	Compression: 0.066860	Avg Compression: 0.170093
Article 114000:	Length: 20736	Compression: 0.067998	Avg Compression: 0.166223
Article 115000:	Length: 20920	Compression: 0.077151	Avg Compression: 0.162942
Article 116000:	Length: 20864	Compression: 0.072326	Avg Compression: 0.159731
Article 117000:	Length: 21208	Compression: 0.067239	Avg Compression: 0.156515
Article 118000:	Length: 20832	Compression: 0.074837	Avg Compression: 0.153818
Article 119000:	Length: 21152	Compression: 0.071861	Avg Compression: 0.151159
Article 120000:	Length: 21136	Compression: 0.071253	Avg Compression: 0.148650
Article 121000:	Length: 21240	Compression: 0.075518	Avg Compression: 0.146413
Article 122000:	Length: 20864	Compression: 0.072278	Avg Compress

Article 215000:	Length: 191240	Compression: 0.202860	Avg Compression: 0.183924
Article 216000:	Length: 168272	Compression: 0.131400	Avg Compression: 0.182393
Article 217000:	Length: 240448	Compression: 0.195389	Avg Compression: 0.182913
Article 218000:	Length: 283136	Compression: 0.199554	Avg Compression: 0.183661
Article 219000:	Length: 347320	Compression: 0.189739	Avg Compression: 0.183979
Article 220000:	Length: 461312	Compression: 0.239456	Avg Compression: 0.187582


In [12]:
# Rebuild the training model and resume from the newest checkpoint in checkpoint_dir.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (2048, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (2048, None, 64)          259072    
_________________________________________________________________
gru_6 (GRU)                  (2048, None, 768)         1921536   
_________________________________________________________________
gru_7 (GRU)                  (2048, None, 768)         3543552   
_________________________________________________________________
dense_3 (Dense)              (2048, None, 4048)        3112912   
Total params: 8,837,072
Trainable params: 8,837,072
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the resumed model for 10 epochs (the recorded run failed while saving a checkpoint in epoch 2).
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10

UnknownError: Failed to rename: ./training_checkpoints-4\ckpt_2_temp_18191358c68149299289797b0f08c481/part-00000-of-00002.data-00000-of-00001 to: ./training_checkpoints-4\ckpt_2.data-00000-of-00002 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

In [16]:
# Retry with 8 epochs (the recorded run failed again with the same checkpoint rename error).
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=8, callbacks=callbacks)

Epoch 1/8
Epoch 2/8

UnknownError: Failed to rename: ./training_checkpoints-4\ckpt_2_temp_81291d2a6dbe41c1bb6b88bf9a06653c/part-00000-of-00002.data-00000-of-00001 to: ./training_checkpoints-4\ckpt_2.data-00000-of-00002 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

In [None]:
# Switch checkpointing to a new directory and rebuild the callback, presumably to get
# around the "Access is denied" rename failures seen when writing into
# './training_checkpoints-4' - confirm.
checkpoint_dir = './training_checkpoints-5' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

# Train for 4 epochs, now saving through the new callback.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=4, callbacks=callbacks)

Epoch 1/4
Epoch 2/4
Epoch 3/4

In [6]:
# Train the current `model` for 4 epochs with whichever checkpoint callback is currently defined.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=4, callbacks=callbacks)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x1e1192a1b08>

In [None]:
# Continue training the current `model` for 3 epochs with the same callbacks.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=3, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [8]:
# Continue training the current `model` for 2 more epochs with the same callbacks.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1cdd2ae8bc8>

In [9]:
# One more training run; no `epochs` argument is passed, so fit() uses its default.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x1cdd0a39408>

In [13]:
# Running totals over the sampled articles.
total_raw, total_compressed = 0, 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    # Only every 1000th article is measured.
    if index % 1000 != 0:
        continue

    article = subword_text_encoder.decode(encoded_article)
    raw = 8 * len(article)  # 8 bits per character of the decoded text
    if raw == 0:
        continue  # an empty article has nothing to compress

    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw = total_raw + raw
    total_compressed = total_compressed + compressed

    report = (index, raw, compressed/raw, total_compressed/total_raw)
    print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % report)

Article 1000:	Length: 144	Compression: 18.659722	Avg Compression: 18.659722
Article 2000:	Length: 160	Compression: 16.750000	Avg Compression: 17.654605
Article 3000:	Length: 176	Compression: 15.369318	Avg Compression: 16.816667
Article 4000:	Length: 152	Compression: 17.723684	Avg Compression: 17.034810
Article 5000:	Length: 192	Compression: 13.979167	Avg Compression: 16.322816
Article 6000:	Length: 216	Compression: 12.472222	Avg Compression: 15.523077
Article 7000:	Length: 264	Compression: 10.223485	Avg Compression: 14.450153
Article 8000:	Length: 208	Compression: 13.033654	Avg Compression: 14.255291
Article 9000:	Length: 216	Compression: 12.435185	Avg Compression: 14.027778
Article 10000:	Length: 264	Compression: 10.295455	Avg Compression: 13.533133
Article 11000:	Length: 232	Compression: 11.629310	Avg Compression: 13.334532
Article 12000:	Length: 192	Compression: 14.119792	Avg Compression: 13.396937
Article 13000:	Length: 288	Compression: 9.329861	Avg Compression: 12.963757
Article 1

Article 109000:	Length: 20168	Compression: 0.135413	Avg Compression: 0.591006
Article 110000:	Length: 16776	Compression: 0.289878	Avg Compression: 0.580490
Article 111000:	Length: 20168	Compression: 0.137445	Avg Compression: 0.562639
Article 112000:	Length: 15768	Compression: 0.173072	Avg Compression: 0.550742
Article 113000:	Length: 20312	Compression: 0.136520	Avg Compression: 0.535063
Article 114000:	Length: 14880	Compression: 0.144153	Avg Compression: 0.524516
Article 115000:	Length: 17224	Compression: 0.215571	Avg Compression: 0.515160
Article 116000:	Length: 18632	Compression: 0.208995	Avg Compression: 0.505448
Article 117000:	Length: 20888	Compression: 0.148219	Avg Compression: 0.493181
Article 118000:	Length: 21680	Compression: 0.136808	Avg Compression: 0.480916
Article 119000:	Length: 21080	Compression: 0.142837	Avg Compression: 0.469968
Article 120000:	Length: 18984	Compression: 0.280973	Avg Compression: 0.464613
Article 121000:	Length: 21096	Compression: 0.142065	Avg Compress

Article 214000:	Length: 163000	Compression: 0.247331	Avg Compression: 0.232207
Article 215000:	Length: 191240	Compression: 0.197176	Avg Compression: 0.230970
Article 216000:	Length: 210688	Compression: 0.183712	Avg Compression: 0.229199
Article 217000:	Length: 242768	Compression: 0.215399	Avg Compression: 0.228628
Article 218000:	Length: 278320	Compression: 0.181934	Avg Compression: 0.226514
Article 219000:	Length: 341104	Compression: 0.202396	Avg Compression: 0.225246
Article 220000:	Length: 461312	Compression: 0.237505	Avg Compression: 0.226060


In [15]:
BATCH_SIZE = 8192  # articles per batch; also the fixed batch dimension of the stateful training model
BATCHED_ITEM_LENGTH = 8  # number of time steps in a full training sub-batch

def build_model(vocab_size, embedding_dim, rnn_units, batch_size=None):
    """Build the stateful two-layer GRU language model used for training.

    Args:
        vocab_size: number of token ids; sizes the embedding table and the output logits.
        embedding_dim: width of the token embedding.
        rnn_units: number of units in each of the two GRU layers.
        batch_size: fixed batch dimension required by the stateful layers.
            Defaults to the module-level ``BATCH_SIZE``, read at call time.

    Returns:
        An uncompiled ``tf.keras.Sequential`` mapping ``(batch_size, time)`` token ids
        to ``(batch_size, time, vocab_size)`` logits.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    return tf.keras.Sequential([
        # NOTE(review): the Embedding below is created without mask_zero=True, so it is
        # unclear whether this Masking layer has any effect on padded (0) steps - confirm.
        # The layer is kept so the layer stack stays the same as in saved checkpoints.
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size),
    ])

# NOTE(review): this points back at './training_checkpoints-4', although an earlier cell
# switched checkpointing to './training_checkpoints-5' - confirm which directory is intended.
checkpoint_dir = './training_checkpoints-4' # Directory where the checkpoints will be saved
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") # Name of the checkpoint files

# Saves only the model weights, under a file name built from the epoch number.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

class ModelStateResetter(tf.keras.callbacks.Callback):
    """Reset the model's recurrent state after the last sub-batch of an article batch.

    The callback does not see the batch itself, so it derives the length of the batch
    that just finished from the running mean of the ``average_batch_length`` metric:
    ``mean * batches_seen`` is the total length so far, and the difference from the
    previous total is the current batch's length. A batch shorter than the configured
    length is treated as the end of a group of articles.

    Args:
        batch_length: length of a full sub-batch. Defaults to the module-level
            ``BATCHED_ITEM_LENGTH``, read each time a batch ends.
    """

    def __init__(self, batch_length=None):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.batch_length = batch_length
        self.last_total_length = 0

    def on_epoch_begin(self, epoch, logs=None):
        # The metric's running mean and the batch index restart with every epoch.
        self.last_total_length = 0

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        average_batch_length = logs.get('average_batch_length', 0)
        # `batch` is zero-based, so batch + 1 batches contributed to the mean.
        total_length = int(round(average_batch_length * (batch + 1)))
        current_batch_length = total_length - self.last_total_length
        self.last_total_length = total_length

        full_length = BATCHED_ITEM_LENGTH if self.batch_length is None else self.batch_length
        if current_batch_length < full_length:
            self.model.reset_states()

model_state_resetter_callback = ModelStateResetter()

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and unnormalised logits."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

def average_batch_length(true_labels, predictions):
    """Metric that reports the size of axis 1 of the labels; ``predictions`` is ignored."""
    label_shape = tf.shape(true_labels)
    return label_shape[1]

In [None]:
# Rebind `model` to the one-token-at-a-time network with the newest weights from checkpoint_dir.
model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [16]:
# Rebuild the training model with the current BATCH_SIZE and resume from the newest checkpoint.
model = build_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='adam', loss=loss, metrics=[average_batch_length])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_4 (Masking)          (8192, None)              0         
_________________________________________________________________
embedding_4 (Embedding)      (8192, None, 64)          259072    
_________________________________________________________________
gru_8 (GRU)                  (8192, None, 768)         1921536   
_________________________________________________________________
gru_9 (GRU)                  (8192, None, 768)         3543552   
_________________________________________________________________
dense_4 (Dense)              (8192, None, 4048)        3112912   
Total params: 8,837,072
Trainable params: 8,837,072
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train with the current BATCH_SIZE / BATCHED_ITEM_LENGTH; no `epochs` is passed, so fit() uses its default.
dataset = articles.dataset(BATCH_SIZE, BATCHED_ITEM_LENGTH)
callbacks = [checkpoint_callback, model_state_resetter_callback]
model.fit(dataset, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x1cdc22c3b48>

In [18]:
# Rebind `model` to the one-token-at-a-time network with the newest weights from checkpoint_dir.
model = build_predicting_model(vocab_size = subword_text_encoder.vocab_size, embedding_dim=64, rnn_units=768)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, 1]))

In [19]:
# Running totals over the sampled articles.
total_raw, total_compressed = 0, 0

for index, encoded_article in enumerate(articles.articles_generator(1)):
    # Only every 1000th article is measured.
    if index % 1000 != 0:
        continue

    article = subword_text_encoder.decode(encoded_article)
    raw = 8 * len(article)  # 8 bits per character of the decoded text
    if raw == 0:
        continue  # an empty article has nothing to compress

    compressed, _ = huffman_archive_size(model, encoded_article)
    total_raw = total_raw + raw
    total_compressed = total_compressed + compressed

    report = (index, raw, compressed/raw, total_compressed/total_raw)
    print('Article %d:\tLength: %d\tCompression: %f\tAvg Compression: %f' % report)

Article 1000:	Length: 144	Compression: 26.055556	Avg Compression: 26.055556
Article 2000:	Length: 160	Compression: 23.375000	Avg Compression: 24.644737
Article 3000:	Length: 176	Compression: 21.375000	Avg Compression: 23.445833
Article 4000:	Length: 152	Compression: 24.730263	Avg Compression: 23.754747
Article 5000:	Length: 192	Compression: 19.500000	Avg Compression: 22.763350
Article 6000:	Length: 216	Compression: 17.416667	Avg Compression: 21.652885
Article 7000:	Length: 264	Compression: 14.265152	Avg Compression: 20.157209
Article 8000:	Length: 208	Compression: 18.105769	Avg Compression: 19.875000
Article 9000:	Length: 216	Compression: 17.398148	Avg Compression: 19.565394
Article 10000:	Length: 264	Compression: 14.318182	Avg Compression: 18.869980
Article 11000:	Length: 232	Compression: 16.206897	Avg Compression: 18.592176
Article 12000:	Length: 192	Compression: 19.635417	Avg Compression: 18.675083
Article 13000:	Length: 288	Compression: 5.256944	Avg Compression: 17.245932
Article 1

Article 109000:	Length: 20168	Compression: 0.138834	Avg Compression: 0.804328
Article 110000:	Length: 16776	Compression: 0.327432	Avg Compression: 0.787674
Article 111000:	Length: 20168	Compression: 0.151478	Avg Compression: 0.762041
Article 112000:	Length: 15768	Compression: 0.343798	Avg Compression: 0.749268
Article 113000:	Length: 20312	Compression: 0.097381	Avg Compression: 0.724593
Article 114000:	Length: 14880	Compression: 0.428898	Avg Compression: 0.716615
Article 115000:	Length: 17224	Compression: 0.448212	Avg Compression: 0.708487
Article 116000:	Length: 18632	Compression: 0.355195	Avg Compression: 0.697280
Article 117000:	Length: 20888	Compression: 0.225440	Avg Compression: 0.681077
Article 118000:	Length: 21680	Compression: 0.149908	Avg Compression: 0.662796
Article 119000:	Length: 21080	Compression: 0.135911	Avg Compression: 0.645735
Article 120000:	Length: 18984	Compression: 0.304625	Avg Compression: 0.636070
Article 121000:	Length: 21096	Compression: 0.171834	Avg Compress

Article 214000:	Length: 163000	Compression: 0.382730	Avg Compression: 0.350507
Article 215000:	Length: 191240	Compression: 0.319766	Avg Compression: 0.349421
Article 216000:	Length: 210688	Compression: 0.320004	Avg Compression: 0.348319
Article 217000:	Length: 242768	Compression: 0.309707	Avg Compression: 0.346721
Article 218000:	Length: 278320	Compression: 0.275277	Avg Compression: 0.343486
Article 219000:	Length: 341104	Compression: 0.298539	Avg Compression: 0.341122
Article 220000:	Length: 461312	Compression: 0.324115	Avg Compression: 0.339993
