## Обработка на най-дългите статии

В dataset-а имаме известно количество много дълги статии. Да видим дали можем да обработим нещо в тях. Възможна идея е да ги разцепим на секции.

In [1]:
import itertools
import math
import os

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

physical_devices = tf.config.list_physical_devices('GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(physical_device, enable=True)

In [2]:
TYPE=np.int16

subword_text_encoder = tfds.features.text.SubwordTextEncoder.load_from_file('vocab_4096')

class Articles:
    EMPTY_ARTICLE = np.array([], dtype=TYPE) # used for padding
    
    def __init__(self, path):
        with open(path, 'rb') as text_file:
            data = text_file.read()

        self.articles = set(data.split(b'\0'))
        self._encoded_articles = None

    @property
    def encoded_articles(self):
        if self._encoded_articles == None:
            articles = [np.array(subword_text_encoder.encode(article), dtype=TYPE) for article in self.articles]
            self._encoded_articles = sorted(articles, key=len)
        
        return self._encoded_articles

    def articles_generator(self, batch_size = 1, start = 0, end = None):
        end = end or len(self.encoded_articles)

        for _ in range(batch_size - ((end - start - 1) % batch_size + 1)):
            yield self.EMPTY_ARTICLE

        for article in itertools.islice(self.encoded_articles, start, end):
            yield article

    def subbatch_generator(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.encoded_articles)

        dataset = tf.data.Dataset.from_generator(self.articles_generator, args=(batch_size, start, end), output_types=TYPE)
        dataset = dataset.padded_batch(batch_size, padded_shapes=([None]), drop_remainder=True)
        dataset = dataset.shuffle(100)

        for batch in dataset.as_numpy_iterator():
            remaining = batch
            while remaining.shape[1] > batch_length + 1:
                yield remaining[:, :batch_length + 1]
                remaining = remaining[:, batch_length:]

            yield remaining
            if remaining.shape[1] == batch_length + 1:
                yield np.zeros((batch_size, 2), dtype=TYPE)

    def steps(self, batch_size, batch_length):
        articles = self.articles_generator(batch_size, batch_length)
        return sum(math.ceil(len(article) / batch_length) for i, article in enumerate(articles) if (i + 1) % batch_size == 0)
                
    def dataset(self, batch_size, batch_length, start = 0, end = None):
        end = end or len(self.encoded_articles)

        dataset = tf.data.Dataset.from_generator(self.subbatch_generator, args=(batch_size, batch_length, start, end), output_types=TYPE, output_shapes=(batch_size, None))
        return dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [3]:
articles = Articles('page_revisions_text')

In [4]:
articles.steps(2048, 32)

14061

In [5]:
steps = 12

for i in range(steps):
    batch_size = 8 * 2**i
    batch_item_length = 4 * 2**(steps - i - 1)
    count = articles.steps(batch_size, batch_item_length)
    print("batch size: %6d\t batch item length: %4d\tsteps per epoch: %6d" % (batch_size, batch_item_length, count))

batch size:      8	 batch item length: 8192	steps per epoch:  27455
batch size:     16	 batch item length: 4096	steps per epoch:  15332
batch size:     32	 batch item length: 2048	steps per epoch:   9572
batch size:     64	 batch item length: 1024	steps per epoch:   7081
batch size:    128	 batch item length:  512	steps per epoch:   6303
batch size:    256	 batch item length:  256	steps per epoch:   6340
batch size:    512	 batch item length:  128	steps per epoch:   7222
batch size:   1024	 batch item length:   64	steps per epoch:   9423
batch size:   2048	 batch item length:   32	steps per epoch:  14061
batch size:   4096	 batch item length:   16	steps per epoch:  23482
batch size:   8192	 batch item length:    8	steps per epoch:  42479
batch size:  16384	 batch item length:    4	steps per epoch:  80683


В Wikitext формата има поддръжка на секции. Анотират се със последователсности от знака `=` преди и след името на секцията. Могат да бъдат влагани чрез повече `=`.

In [7]:
import re

section_pattern = re.compile(b'==+[^=]+==+')
sections = []

for article in articles.articles:
    sections += section_pattern.findall(article)

sections

[b'==External links==',
 b'== History ==',
 b'== Trophies and titles ==',
 b'== 2005/2006 squad ==',
 b'===First team squad===',
 b'===Players on loan===',
 b'== Hall of Fame ==',
 b'==See also==',
 b'== Other sports ==',
 b'==External links==',
 b'== Ancient ruins ==',
 b'==See also==',
 b'== Geography ==',
 b'== Demographics ==',
 b'== External links ==',
 b'==Logographic systems==',
 b'==Ideographic and phonetic dimensions==',
 b'==Chinese characters==',
 b'==Advantages and disadvantages==',
 b'==See also==',
 b'==External links==',
 b'==References==',
 b'==Timeline==',
 b'==External links==',
 b'==Biography==',
 b'==Controversies==',
 b'==Awards==',
 b'==Clubs==',
 b'==Honours==',
 b'===As a player===',
 b'===As a manager===',
 b'==References==',
 b'== Geography ==',
 b'== Demographics ==',
 b'== External links ==',
 b'== Geometric introduction ==',
 b'=== Stereographic projection ===',
 b'==== Alternate stereographic projection ====',
 b'==== Geometric features of note ====',
 b'=

Броят на всички секции е:

In [8]:
len(sections)

657844

Дължината на текста дефиниращ всички секции е:

In [9]:
sum(len(s) for s in sections)

13595913

Броят на уникалните секции е:

In [10]:
len(set(sections))

232660

Дължината на текста дефиниращ уникалните секции е:

In [11]:
sum(len(s) for s in set(sections))

6798552

In [72]:
x = []
for article in articles.articles:
    x += map(len, section_pattern.split(article))

In [75]:
sorted(x, reverse=True)

[534207,
 429626,
 405308,
 312923,
 286620,
 225882,
 220513,
 199784,
 198211,
 186840,
 184934,
 184108,
 175179,
 164796,
 151103,
 150412,
 147413,
 129560,
 126372,
 116206,
 93809,
 91557,
 84708,
 83911,
 82402,
 77976,
 76453,
 75651,
 68982,
 67341,
 63557,
 59553,
 55361,
 54970,
 54840,
 54582,
 53896,
 53319,
 52356,
 51870,
 50261,
 49519,
 45581,
 44064,
 43338,
 42922,
 42404,
 42002,
 40823,
 40214,
 39682,
 39579,
 38324,
 37833,
 37249,
 37159,
 35995,
 35966,
 35178,
 34693,
 34543,
 34501,
 34250,
 34218,
 33499,
 33411,
 33238,
 32892,
 32822,
 32642,
 32536,
 32369,
 32228,
 32166,
 32132,
 31910,
 31840,
 31610,
 31511,
 31397,
 31229,
 31205,
 31205,
 31075,
 30723,
 30519,
 30317,
 30269,
 30149,
 30061,
 29998,
 29967,
 29943,
 29903,
 29658,
 29500,
 29484,
 29446,
 29401,
 29253,
 29176,
 28932,
 28881,
 28594,
 28387,
 28239,
 28123,
 27992,
 27986,
 27931,
 27914,
 27754,
 27657,
 27509,
 27476,
 27454,
 27396,
 27276,
 27131,
 27115,
 27089,
 27069,
 270

In [78]:
len(x)

878301

In [77]:
sorted(articles.articles, key=len, reverse=True)[0]



In [90]:
user_pattern = re.compile(b'\[\[User:[^\]]+\]\]')

users = []

for article in articles.articles:
    users += user_pattern.findall(article)

users

[b'[[User:Wshun|wshun]]',
 b'[[User:ed_g2s|ed g2s]]',
 b'[[User:Mateo SA]]',
 b'[[User:Montrealais|Montr&eacute;alais]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Rasmus Faber|Rasmus Faber]]',
 b'[[User:Linus|Linus]]',
 b'[[User:Rasmus Faber|Rasmus Faber]]',
 b'[[User:Tannin|Tannin]]',
 b'[[User:Ancheta Wis|Ancheta Wis]]',
 b'[[User:Big iron|Big iron]]',
 b'[[User:Fabiform|Fabiform]]',
 b'[[User:Fabiform|Fabiform]]',
 b'[[User:Fabiform|Fabiform]]',
 b'[[User:Fabiform|Fabiform]]',
 b'[[User:Steve nova|Steve nova]]',
 b'[[User:Mrdice|Mrdice]]',
 b'[[User:Ancheta Wis|Ancheta Wis]]',
 b'[[User:Hadal|Hadal]]',
 b'[[User:Doctormuffin|Docto

In [94]:
set(users)

{b'[[User:TreyHarris|Trey Harris]]',
 b'[[User:kallemax|Casper B. Henriksen]]',
 b'[[User:LarryMac|LarryMac]]',
 b'[[User:BrianReading|Brian Reading]]',
 b'[[User:Llywrch|Llywrch]]',
 b'[[User:bobby2nag|Nagarjuna Kommineni]]',
 b'[[User:Hall Monitor|Hall Monitor]]',
 b'[[User:Icundell|Ian Cundell]]',
 b'[[User:ihatepotsmokinghippies|Ihatepotsmokinghippies]]',
 b'[[User:Venugopal|Venugopal]]',
 b'[[User:Enirac Sum|Enirac Sum]]',
 b'[[User:Tom Peters|Tom Peters]]',
 b'[[User:Sintonak.X|Sintonak.X]]',
 b'[[User:Taxman/Copyrights]]',
 b'[[User:Diberri|Diberri]]',
 b'[[User:D33j4y|D33j4y]]',
 b'[[User:Prz|Prz]]',
 b'[[User:Agent Smith|Agent Smith]]',
 b'[[User:AleatoricConsonance|AleatoricConsonance]]',
 b'[[User:Saintswithin|Saint]]',
 b'[[User:Kingturtle|Half shark-alligator, half man]]',
 b'[[User:Sparr|Sparr]]',
 b'[[User:ClaudineChionh|ClaudineChionh]]',
 b'[[User:Seav|seav]]',
 b'[[User:Wayland|Wayland]]',
 b'[[User:Black Widow|Black Widow]]',
 b'[[User:Nikai]]',
 b'[[User:Nachiket Go

In [93]:
print('Total count:', len(users))
print('Total length:', sum(len(u) for u in users))
print('Unique count:', len(set(users)))
print('Unique length:', sum(len(u) for u in set(users)))

Total count: 54413
Total length: 1493801
Unique count: 8404
Unique length: 236667


In [96]:
len(max(articles.articles, key=len))

738998

In [103]:
len(user_pattern.sub(b'xxx', max(articles.articles, key=len)))

619903

In [104]:
user_pattern.sub(b'xxx', max(articles.articles, key=len))

