In [2]:
#!wget https://dumps.wikimedia.org/plwiki/20240801/plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2

--2024-08-17 11:57:56--  https://dumps.wikimedia.org/plwiki/20240801/plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:3:208:80:154:71, 208.80.154.71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:3:208:80:154:71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 239025740 (228M) [application/octet-stream]
Saving to: ‘plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2’


2024-08-17 11:58:49 (4.39 MB/s) - ‘plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2’ saved [239025740/239025740]



In [11]:
FILENAMES = ['plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2']

# Putting everything together

In [12]:
import re
import sys
import bz2
from xml import sax
import mwparserfromhell as mw
from collections import Counter

In [13]:
# Marker string: strip_wikicode wraps the contents of <ref> tags in it and
# substitutes it for every URL, so that both can later be turned into breaks.
SEP_STR = ";;;"
# The same marker as a mwparserfromhell Text node, for splicing into a node list.
SEP = mw.nodes.text.Text(SEP_STR)
# Matches an http/https/ftp URL; the final character must be a word character or "/".
re_URL = re.compile(r"(?:http|https|ftp)://[A-Za-z0-9:._\/~%+&#?!=()@-]+[\w/]")
# Matches a run of three or more semicolons, i.e. one marker or several adjacent ones.
re_SEP = re.compile(r";{3,}")

In [14]:
def strip_wikicode(text):
    """Convert wikitext markup to plain text.

    The contents of every ``<ref>`` tag are surrounded with the ``SEP``
    marker before the markup is stripped, and every URL in the stripped
    text is replaced with ``SEP_STR``. Finally each run of three or more
    semicolons (the markers, but also any such run already present in the
    text) becomes a blank line.

    :param text: raw wikitext of one page
    :return: the stripped text
    """
    wikicode = mw.parse(text)
    ref_tags = wikicode.filter_tags(matches=lambda node: node.tag == 'ref')
    for ref in ref_tags:
        if not hasattr(ref.contents, 'nodes'):
            # Fallback for tags whose contents expose no node list.
            ref.value = SEP_STR + ref.value + SEP_STR
        else:
            ref.contents.nodes = [SEP] + ref.contents.nodes + [SEP]
    plain = wikicode.strip_code()
    without_urls = re_URL.sub(SEP_STR, plain)
    return re_SEP.sub("\n\n", without_urls)

In [15]:
class WikipediaHandler(sax.handler.ContentHandler):
    """SAX handler that extracts the text of each ``<page>`` of a wiki dump.

    Character data found inside a ``<text>`` element that is nested in a
    ``<page>`` element is collected in ``last_text``. When the page ends the
    collected wikitext is passed through ``strip_wikicode`` and the result is
    handed to the ``handle_text`` callback.
    """

    # Class-level defaults; __init__ sets the same values per instance.
    in_page = False
    in_text = False
    last_text = ""

    def __init__(self, handle_text):
        """
        :param handle_text: callable invoked with the stripped text of each page
        """
        # Let the base class set up its own state (e.g. the document locator).
        super().__init__()
        self.handle_text = handle_text
        self.in_page = False
        self.in_text = False
        self.last_text = ""

    def startElement(self, name, attrs):
        """Track entry into ``<page>`` and ``<text>``; any other tag ends text capture."""
        if name == 'page':
            self.in_page = True
            self.in_text = False
        elif self.in_page and name == 'text':
            self.in_text = True
            self.last_text = ""
        else:
            self.in_text = False

    def endElement(self, name):
        """Stop capturing at ``</text>``; flush the collected text at ``</page>``."""
        if name == 'text':
            self.in_text = False
        elif name == 'page':
            self.in_text = False
            self.in_page = False
            self.parse_text()

    def characters(self, content):
        """Append character data to ``last_text`` while inside ``<text>``."""
        if self.in_text:
            self.last_text += content

    def parse_text(self):
        """Strip and emit the collected text, if any.

        The buffer is cleared *before* processing, so that an exception raised
        by ``strip_wikicode`` or by the callback does not leave the same text
        behind to be processed (and fail) again on the next flush.
        """
        raw, self.last_text = self.last_text, ""
        if raw:
            self.handle_text(strip_wikicode(raw))

In [16]:
def read_chunks(file_name, callback, chunk_size, max_chunks=None):
    """Read a UTF-8 text file piece by piece and pass each piece to ``callback``.

    Files whose name ends in ``.bz2`` are decompressed on the fly. A progress
    line is written to stderr for every read, including the final empty one.

    :param file_name: path of the file to read
    :param callback: called with each non-empty chunk; it may raise
        ``StopIteration`` to end the reading early
    :param chunk_size: size passed to ``read()`` for every chunk
    :param max_chunks: maximum number of reads, or ``None`` for no limit
    """
    opener = bz2.open if file_name.endswith(".bz2") else open
    with opener(file_name, "rt", encoding="utf-8") as stream:
        reads_done = 0
        while True:
            if max_chunks is not None and reads_done >= max_chunks:
                return
            data = stream.read(chunk_size)
            reads_done += 1
            print("Chunk", reads_done, "of", len(data), "bytes", type(data), file=sys.stderr)
            if not data:
                return
            try:
                callback(data)
            except StopIteration:
                return

In [30]:
class MultiDocumentParser:
    """Incrementally parse a stream made of several concatenated XML documents.

    A SAX parser stops with "junk after document element" when a second
    root element follows the first one. This class catches that error,
    starts a fresh parser with a fresh handler, and continues from the exact
    position where the next document begins.

    Can be used as a context manager; leaving the block flushes the text still
    pending in the current handler.
    """

    # expat error code XML_ERROR_JUNK_AFTER_DOC_ELEMENT
    _JUNK_AFTER_DOC_ELEMENT = 9
    # How many already-fed bytes are kept between feed() calls. The first token
    # of the next document may start in an earlier chunk than the one in which
    # the parser reports it, so a short look-behind is needed to replay it.
    _LOOKBEHIND_BYTES = 4096

    def __init__(self, callback, handler_class=None):
        """
        :param callback: passed to every handler instance that is created
        :param handler_class: content handler class, called as
            ``handler_class(callback)``; instances must provide ``parse_text()``.
            Defaults to ``WikipediaHandler`` (resolved when the parser is built).
        """
        if handler_class is None:
            handler_class = WikipediaHandler
        self.callback = callback
        self.handler = None
        self.handler_class = handler_class
        self.debug_context = ""
        self.parser = None
        # Most recent bytes given to the current parser, and the offset of
        # _window[0] counted from the start of that parser's input.
        self._window = b""
        self._window_start = 0

    def reset(self):
        """Flush the current handler and start a new parser/handler pair."""
        print("New XML document - parser reset", file=sys.stderr)
        if self.handler is not None:
            self.handler.parse_text()
        self.parser = sax.make_parser()
        self.handler = self.handler_class(self.callback)
        self.parser.setContentHandler(self.handler)
        self._window = b""
        self._window_start = 0

    def feed(self, buffer):
        """Parse the next chunk (``str`` or bytes-like) of the stream.

        :raises xml.sax.SAXParseException: for any parse error other than the
            start of a further document, or when that start lies further back
            than the retained look-behind
        """
        if self.parser is None:
            self.reset()
        is_text = isinstance(buffer, str)
        while True:
            # expat receives str input encoded as UTF-8 and reports positions as
            # byte offsets, so the bookkeeping is done on the encoded form.
            self._window += buffer.encode("utf-8") if is_text else bytes(buffer)
            try:
                self.parser.feed(buffer)
            except sax.SAXParseException as e:
                # CurrentByteIndex counts from the start of everything fed to
                # this parser, not from the start of the current chunk.
                offset = self.parser._parser.CurrentByteIndex - self._window_start
                is_junk = getattr(e.getException(), "code", None) == self._JUNK_AFTER_DOC_ELEMENT
                if not is_junk or not 0 <= offset <= len(self._window):
                    at = min(max(offset, 0), len(self._window))
                    before = self._window[max(at - 10, 0):at].decode("utf-8", "replace")
                    after = self._window[at:at + 10].decode("utf-8", "replace")
                    print("Exception context:", self.debug_context, "###", before, "###", after, file=sys.stderr)
                    raise
                # Everything from the error position on belongs to the next document.
                remainder = self._window[offset:]
                buffer = remainder.decode("utf-8") if is_text else remainder
                self.reset()
            else:
                break
        if len(self._window) > self._LOOKBEHIND_BYTES:
            dropped = len(self._window) - self._LOOKBEHIND_BYTES
            self._window = self._window[dropped:]
            self._window_start += dropped
        self.debug_context = buffer[-10:]

    def __enter__(self, *args, **kwargs):
        return self

    def __exit__(self, *args, **kwargs):
        """Flush pending text; exceptions from the ``with`` body propagate."""
        if self.handler is not None:
            self.handler.parse_text()
        # A truthy return value would silently swallow exceptions.
        return False

In [18]:
def handle_article_text(text):
    """Print the length of ``text`` and the repr of its first and last ten characters."""
    head = text[:10]
    tail = text[-10:]
    print(len(text), repr([head, tail]))

In [19]:
# Size handed to read() for each chunk. read_chunks opens the files in text
# mode, so this counts characters (8 Mi of them), not bytes.
chunk_size = 1024 * 1024 * 8  # 8 Mi characters
# chunk_size = 1024 * 1024 * 128  # 128 Mi characters
# One parser instance is shared by all files; each extracted article is
# summarised by handle_article_text.
parser = MultiDocumentParser(handle_article_text, WikipediaHandler)
for file_name in FILENAMES:
    # Only the first 3 chunks of each file are read (max_chunks=3).
    read_chunks(file_name, parser.feed, chunk_size, 3)

Chunk 1 of 8388608 bytes <class 'str'>
Parser reset


13604 ['AWK – inte', ' skryptowe']
491 ['Alergologi', ' lekarskie']
136 ['Andrzej To', 'as.--Kpjas']
14540 ['thumb|94 z', 'nia znaków']
47570 ['thumb|Atom', ' \n \n \n \n\n ']
8169 ['Aksjomat, ', 'a \n\n \n\n \n ']
6233 ['mały|Rycin', '05-10].\n\n ']
4461 ['Link\n\nPopr', '2 sty 2015']
3865 ['Alkeny – o', 'afia \n \n\n ']
2477 ['ActiveX – ', 't Explorer']
3736 ['Interfejs ', 'zypisy \n\n ']
5633 ['AmigaOS – ', 'operacyjne']
4793 ['Associatio', 'ozarządowe']
2964 ['Alternatyw', 'i logiczne']
2291 ['Aksjomat i', 'tematyczne']
23 ['PATRZ Cesa', 'wo Etiopii']
715 ['thumb|150p', 'ia:Polieny']
1433 ['thumb|200p', 'zypisy \n\n ']
9142 ['Alkany (pa', ' \n\n\n \n\n\n\n ']
4869 ['thumb|140p', 'zypisy \n\n ']
71064 ['thumb|„A w', '05-12].\n\n ']
610 ['Anarchia (', 'zypisy \n\n*']
20098 ['To juz prz', 'i nadal są']
1983 ['ASN.1 (skr', 'puterowych']
21297 ['Algorytm –', 'anych\n \n\n ']
750 ['Przylądek ', ' Zachodnia']
3682 ['thumb|100p', 'zypisy \n\n ']
19245 ['Agatha Chr', 'rli w 1976']
3273 ['Al

Chunk 2 of 8388608 bytes <class 'str'>


662 ['Bohemia – ', 'j Zelandii']
1208 ['Bebop (rów', 'le jazzowe']
65062 ['thumb|Wład', ' Wikipedii']
22863 ['Bolesław V', 'niowiecze)']
14170 ['Bitwa pod ', ' Gorlicami']
3996 ['thumb|350p', ' \n \n \n \n\n ']
19656 ['Bitwa pod ', 'pod Lenino']
940 ['Bison – ge', 'egoria:GNU']
14558 ['Bronisław ', 'nografowie']
295 ['Babaszka, ', ':Zecerstwo']
5287 ['thumb|Synt', 'oaktywne\n ']
2048 ['thumb|Abdo', ' lekarskie']
27 ['REDIRECT W', 'an foremny']
4774 ['Baran (łac', ' Guide \n\n ']
11633 ['Bruksela (', 'm Brukseli']
1078 ['Biopolimer', ', s. 53\n\n ']
1655 ['Biotyna ( ', 'icykliczne']
17 ['REDIRECT B', 'T Bakterie']
8394 ['PYTANIE:\nP', '2017 (CET)']
4603 ['Burrhus Fr', 'of Science']
4789 ['Bitwa na P', ' Psim Polu']
2229 ['Bromowodór', 'ązki bromu']
4653 ['thumb|200p', 'szczalniki']
3366 ['\\n\nZastana', '012 (CEST)']
11998 ['thumb|280p', 'y:Хирургия']
29395 ['Chemia org', 'n. \n \n \n\n ']
2356 ['Chemia fiz', 'zypisy \n\n ']
8244 ['thumb|200p', 'Typografia']
2179 ['mały|Przyk', '12-03].

Chunk 3 of 8388608 bytes <class 'str'>


5054 ['Free Pasca', 'ji GNU GPL']
4155 ['Festiwal (', 'zypisy \n\n ']
8352 ['Wyjaśnieni', '010 (CEST)']
28 ['REDIRECT T', 'z I Wielki']
21 ['REDIRECT F', 'lip I Arab']
267 ['Femto (f) ', 'sy \n\nfemto']
624 ['F (minusku', 'Spółgłoski']
15 ['REDIRECT Ż', 'ECT Żelazo']
215 ['Flawiusz W', 'w IV wieku']
8279 ['thumb|300p', 'e world\n\n ']
1068 ['Flaga Arab', 'audyjskiej']
1992 ['Flaga Libi', 'dowe Libii']
19508 ['Flaga Pols', ' Wikipedii']
4002 ['Flaga Irla', 'e Irlandii']
824 ['thumb|250p', 'dowe Nauru']
4469 ['thumb|Szta', 'i\nBrazylii']
1006 ['thumb|Pols', 'zypisy \n\n ']
2611 ['Flaga olim', 'limpijskie']
71 ['REDIRECT M', 'ółksiężyca']
2122 ['thumb|Flor', ' Florencji']
1434 ['thumb|240p', 'ria:Monety']
497 [' fm – jedn', ' Przypisy ']
11 ['PATRZ Fran', 'ATRZ Frans']
2929 ['thumb|Prze', 'e sygnałów']
1287 ['mały|Holen', 'polityczna']
21 ['REDIRECT F', 'le krótkie']
1654 ['Fale średn', 'agnetyczne']
21 ['REDIRECT f', 'le średnie']
2187 ['Fale długi', 'agnetyczne']
20 ['REDIRECT f', 'ale 

## Split into words

In [24]:
# A word: two or more word characters between word boundaries, not directly
# followed by a period, an optional space and a digit.
re_word = re.compile(r"\b(\w{2,})\b(?!\. ?\d)", re.I | re.U | re.M)
# Any digit or underscore; words containing one are discarded.
re_digit = re.compile(r"[\d_]")

def split_words(text):
    """Lazily yield the lower-cased words of ``text``.

    A word is a match of ``re_word``; matches containing a digit or an
    underscore are skipped.
    """
    candidates = (match.group(0) for match in re_word.finditer(text))
    yield from (token.lower() for token in candidates if re_digit.search(token) is None)

In [28]:
list(split_words("Lorem ipsum abc123 dolor sit amet?"))

['lorem', 'ipsum', 'dolor', 'sit', 'amet']

## Count most common words

In [29]:
from collections import Counter

# Running tally of word frequencies over all processed articles.
word_counter = Counter()

def count_words_in_text(text):
    """Add every word of ``text`` (as produced by ``split_words``) to ``word_counter``."""
    for word in split_words(text):
        word_counter[word] += 1

# Count the words of the first 3 chunks of every dump file.
parser = MultiDocumentParser(count_words_in_text, WikipediaHandler)
for file_name in FILENAMES:
    read_chunks(file_name, parser.feed, chunk_size, 3)

Chunk 1 of 8388608 bytes <class 'str'>
Parser reset
Chunk 2 of 8388608 bytes <class 'str'>
Chunk 3 of 8388608 bytes <class 'str'>


In [31]:
print("Most common words in Polish Wikipedia:")
print(word_counter.most_common(50))

Most common words in Polish Wikipedia:
[('na', 26592), ('się', 22263), ('do', 19682), ('jest', 12148), ('nie', 9211), ('przez', 8263), ('od', 7875), ('roku', 7496), ('to', 7398), ('oraz', 6285), ('że', 6142), ('po', 5724), ('kategoria', 5509), ('jako', 4697), ('za', 4570), ('są', 4462), ('dla', 4354), ('jego', 4195), ('lub', 3804), ('jak', 3546), ('tym', 3425), ('był', 3099), ('pod', 3072), ('także', 3028), ('ze', 2931), ('co', 2888), ('też', 2781), ('thumb', 2696), ('jednak', 2623), ('który', 2558), ('które', 2415), ('wieku', 2397), ('ich', 2331), ('tego', 2291), ('jej', 2269), ('również', 2268), ('np', 2175), ('in', 2151), ('został', 2115), ('przy', 2096), ('the', 2001), ('czy', 1986), ('ii', 1950), ('była', 1884), ('latach', 1868), ('ma', 1851), ('miasta', 1783), ('ale', 1777), ('lat', 1696), ('być', 1655)]


## Compare it with the NLTK built-in corpus of the Polish language of the 1960s

In [39]:
import nltk
from nltk.corpus.reader import pl196x

def pl196x_words():
    """Return ``words()`` of the NLTK ``pl196x`` corpus reader for all of its files.

    The corpus directory is located with ``nltk.data.find`` and read with a
    ``Pl196xCorpusReader`` over every ``*.xml`` file in it.
    """
    pl196x_dir = nltk.data.find("corpora/pl196x")
    pl = pl196x.Pl196xCorpusReader(pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt")
    # NOTE(review): ``categories`` is given the category *file name* although
    # ``fileids`` already selects every file -- presumably this only satisfies
    # the reader's argument checking; confirm against the NLTK reader.
    return pl.words(fileids=pl.fileids(), categories="cats.txt")

def pl196x_words_filtered():
    """Lazily yield the lower-cased corpus words accepted by the word filters.

    A word is kept when ``re_word`` matches at its start and it contains no
    character matched by ``re_digit``.
    """
    yield from (
        token.lower()
        for token in pl196x_words()
        if re_word.match(token) and not re_digit.search(token)
    )

In [41]:
word_counter = Counter()  # reset counter
word_counter.update(pl196x_words_filtered())
print("Most common words in PL196x corpus:")
print(word_counter.most_common(50))

Most common words in PL196x corpus:
[('się', 10315), ('nie', 9460), ('na', 9285), ('do', 6260), ('to', 6186), ('że', 4704), ('jest', 3541), ('jak', 2882), ('co', 2449), ('ale', 2228), ('tak', 2210), ('po', 1898), ('od', 1883), ('tym', 1851), ('już', 1831), ('za', 1548), ('przez', 1535), ('tylko', 1496), ('dla', 1450), ('tego', 1339), ('ja', 1312), ('czy', 1240), ('pan', 1189), ('mnie', 1155), ('jeszcze', 1140), ('są', 1112), ('może', 1090), ('przy', 1064), ('ze', 999), ('mi', 941), ('ten', 939), ('roku', 928), ('jego', 915), ('było', 893), ('bo', 884), ('będzie', 883), ('tu', 883), ('ich', 860), ('tej', 827), ('sobie', 816), ('ma', 812), ('bardzo', 789), ('go', 779), ('oraz', 778), ('tysiąc', 772), ('więc', 752), ('które', 750), ('no', 745), ('tych', 741), ('był', 736)]


## Count it as grammatical lemmas

In [47]:
import morfeusz2
morf = morfeusz2.Morfeusz()

In [48]:
morf.analyse("miał")

[(0, 1, ('miał', 'miał', 'subst:sg:nom.acc:m3', ['nazwa_pospolita'], [])),
 (0, 1, ('miał', 'mieć', 'praet:sg:m1.m2.m3:imperf', [], []))]

In [59]:
def lemma(word):
    """Return the base form given by the first Morfeusz interpretation of ``word``.

    The form 'miał' is special-cased to 'mieć'.
    """
    if word != 'miał':
        first_interpretation = morf.analyse(word)[0]
        return first_interpretation[2][1]
    return 'mieć'

[lemma('miał'), lemma('chciał'), lemma('roku')]

['mieć', 'chcieć', 'rok:Sm3~lata']

In [60]:
def lemma(word):
    """Return the base form given by the first Morfeusz interpretation of ``word``.

    Anything from the first ':' of the base form onwards is dropped
    (e.g. 'rok:Sm3~lata' becomes 'rok'). The form 'miał' is special-cased
    to 'mieć'.
    """
    if word != 'miał':
        first_interpretation = morf.analyse(word)[0]
        base_form = first_interpretation[2][1]
        return base_form.partition(':')[0]
    return 'mieć'

[lemma('miał'), lemma('chciał'), lemma('roku')]

['mieć', 'chcieć', 'rok']

In [61]:
word_counter = Counter()  # reset counter
word_counter.update(map(lemma, pl196x_words_filtered()))

print("Most common lemmas in PL196x corpus:")
print(word_counter.most_common(50))

Most common lemmas in PL196x corpus:
[('ten', 11460), ('się', 10315), ('nie', 9808), ('na', 9768), ('być', 9079), ('do', 6312), ('on', 5444), ('że', 4752), ('co', 3362), ('który', 3171), ('jak', 3002), ('ja', 2374), ('tak', 2250), ('ale', 2232), ('po', 2156), ('oda', 2034), ('tym', 1852), ('już', 1833), ('pan', 1752), ('za', 1719), ('przez', 1554), ('czy', 1499), ('tylko', 1498), ('taki', 1475), ('dla', 1453), ('tysiąc', 1373), ('mój', 1291), ('siebie', 1269), ('bardzo', 1211), ('mieć', 1188), ('swój', 1175), ('móc', 1175), ('wiedzieć', 1159), ('miąć', 1157), ('jeszcze', 1142), ('rok', 1121), ('ty', 1107), ('może', 1090), ('sam', 1086), ('wszystek', 1084), ('pani', 1084), ('przy', 1064), ('dwa', 1038), ('czas', 1024), ('mówić', 1016), ('z', 1002), ('inny', 995), ('chcieć', 994), ('my', 974), ('człowiek', 960)]


In [62]:
def count_lemmas_in_text(text):
    """Add the lemma of every word of ``text`` to the global ``word_counter``."""
    for word in split_words(text):
        word_counter[lemma(word)] += 1

word_counter = Counter()  # reset counter
parser = MultiDocumentParser(count_lemmas_in_text, WikipediaHandler)
for file_name in FILENAMES:
    read_chunks(file_name, parser.feed, chunk_size, 3)

print("Most common lemmas in Polish Wikipedia:")
print(word_counter.most_common(50))

Chunk 1 of 8388608 bytes <class 'str'>
New XML document - parser reset
Chunk 2 of 8388608 bytes <class 'str'>
Chunk 3 of 8388608 bytes <class 'str'>


Most common lemmas in Polish Wikipedia:
[('na', 26598), ('być', 23535), ('się', 22263), ('do', 19687), ('ten', 15923), ('on', 13830), ('który', 11639), ('nie', 9211), ('przez', 8287), ('rok', 8205), ('oda', 7898), ('oraz', 6285), ('że', 6149), ('kategoria', 5731), ('po', 5724), ('zostać', 4972), ('jako', 4697), ('za', 4572), ('dla', 4358), ('lato', 4223), ('polski', 4138), ('inny', 3937), ('lub', 3804), ('co', 3572), ('jak', 3548), ('miasto', 3484), ('tym', 3425), ('czas', 3208), ('swój', 3101), ('pod', 3073), ('także', 3028), ('pierwszy', 2990), ('z', 2935), ('wiek', 2855), ('też', 2781), ('wielki', 2745), ('thumb', 2696), ('jednak', 2623), ('sam', 2419), ('czy', 2415), ('język', 2302), ('mieć', 2281), ('jej', 2269), ('również', 2268), ('część', 2212), ('jeden', 2204), ('np', 2175), ('mały', 2172), ('bardzo', 2171), ('in', 2151)]
