# Parse Wikipedia dump XML
## Parsing the XML itself

In [26]:
from xml import sax

In [16]:
class WikipediaHandler(sax.handler.ContentHandler):
    """SAX content handler that extracts the ``<text>`` of every ``<page>``.

    Character data found directly inside a ``<text>`` element that is nested
    in a ``<page>`` is accumulated. When the page closes, the accumulated text
    is passed through ``strip_wikicode`` and the result is handed to
    ``handle_text``. Pages whose collected text is empty are skipped.
    """

    # Class-level defaults; instances shadow them as parsing progresses.
    in_page = False  # True while inside a <page> element
    in_text = False  # True while character data of a <text> element is collected
    last_text = ""   # text collected so far for the current page

    def __init__(self, handle_text, strip_wikicode=lambda s: s):
        """Create a handler.

        Args:
            handle_text: callable invoked once per page with the processed
                article text.
            strip_wikicode: callable applied to the raw text before it is
                passed to ``handle_text``; defaults to the identity function.
        """
        # Initialise the base class too, so attributes it defines exist on
        # this instance even before a parser attaches itself.
        super().__init__()
        self.handle_text = handle_text
        self.strip_wikicode = strip_wikicode

    def startDocument(self):
        """Reset the parsing state at the beginning of every document.

        Without this, a handler reused after a parse that was aborted
        mid-page would carry stale flags and text into the next document.
        """
        self.in_page = False
        self.in_text = False
        self.last_text = ""

    def startElement(self, name, attrs):
        """Track entry into ``<page>`` and ``<text>`` elements."""
        if name == 'page':
            self.in_page = True
            self.in_text = False
        elif self.in_page and name == 'text':
            # A new <text> replaces whatever was collected earlier in the page.
            self.in_text = True
            self.last_text = ""
        else:
            # Any other element stops text collection.
            self.in_text = False

    def endElement(self, name):
        """Stop collecting at ``</text>``; emit the article at ``</page>``."""
        if name == 'text':
            self.in_text = False
        elif name == 'page':
            self.in_text = False
            self.in_page = False
            self.parse_text()

    def characters(self, content):
        """Append character data while inside a ``<text>`` element."""
        if self.in_text:
            self.last_text += content

    def parse_text(self):
        """Process the collected text and pass it to ``handle_text``.

        Does nothing when no text was collected. The buffer is cleared before
        the callback runs, so it is already empty if the callback raises.
        """
        if self.last_text:
            text = self.strip_wikicode(self.last_text)
            self.last_text = ""
            self.handle_text(text)

In [23]:
# Print the raw text of every article in the example file, feeding the whole
# file to the incremental parser in one go.
parser = sax.make_parser()
parser.setContentHandler(WikipediaHandler(lambda s: print("Article text:\n\t", s)))
with open('example.xml', 'r', encoding="UTF-8") as fd:
    parser.feed(fd.read())
    # close() tells the incremental parser that the input is complete, so a
    # truncated or malformed document is reported instead of going unnoticed.
    parser.close()

Article text:
	 
          Lorem ipsum dolor sit<ref name="utf8_ascii">{{Cytuj stronę |tytuł=utf-8(7)&nbsp;– Linux manual page |opublikowany=Man7.org |data=2014-02-26 |url=http://man7.org/linux/man-pages/man7/utf-8.7.html |data dostępu=2014-04-21 |język=en}}</ref> amet
      
Article text:
	 Another [[article]] text...


## Stripping MediaWiki markup

In [14]:
import mwparserfromhell as mw
import re

In [15]:
# Sentinel string marking places where the surrounding text has to be split.
SEP_STR = ";;;"
# The same sentinel wrapped in a mwparserfromhell Text node.
SEP = mw.nodes.text.Text(SEP_STR)
# Absolute http/https/ftp URLs. The last character must be a word character or
# "/", so trailing punctuation such as "." or ")" is left out of the match.
re_URL = re.compile(r"(?:http|https|ftp)://[A-Za-z0-9:._\/~%+&#?!=()@-]+[\w/]")
# Three or more consecutive semicolons, i.e. one sentinel or several adjacent ones.
re_SEP = re.compile(r";{3,}")

def strip_wikicode(text):
    """Convert MediaWiki markup to plain text, isolating references and URLs.

    The body of every ``<ref>`` tag is wrapped in ``SEP_STR`` sentinels and
    every URL left after markup stripping is replaced by a sentinel. Each run
    of sentinels is finally turned into a blank line, so reference text and
    URLs do not run into the surrounding sentence.

    Args:
        text: article source in MediaWiki markup.

    Returns:
        The stripped plain text.
    """
    wikicode = mw.parse(text)
    # MediaWiki tag names are case-insensitive, so <REF> must match as well.
    ref_tags = wikicode.filter_tags(
        matches=lambda node: str(node.tag).strip().lower() == 'ref')
    for tag in ref_tags:
        body = tag.contents
        if body is None:
            # NOTE(review): the previous fallback read `tag.value`, which Tag
            # nodes do not appear to define (only Text nodes do) - confirm
            # against the installed mwparserfromhell version. Assigning
            # through the `contents` setter covers a tag without a body.
            tag.contents = SEP_STR + SEP_STR
        else:
            # insert()/append() parse the string into fresh nodes, so no
            # single node instance ends up shared between several tags.
            body.insert(0, SEP_STR)
            body.append(SEP_STR)
    text = re_URL.sub(SEP_STR, wikicode.strip_code())
    return re_SEP.sub("\n\n", text)

In [22]:
# Same run as before, but every article is passed through strip_wikicode
# before it is printed.
parser = sax.make_parser()
parser.setContentHandler(WikipediaHandler(
    lambda s: print("Article text:\n\t", s),
    strip_wikicode))
with open('example.xml', 'r', encoding="UTF-8") as fd:
    parser.feed(fd.read())
    # close() tells the incremental parser that the input is complete, so a
    # truncated or malformed document is reported instead of going unnoticed.
    parser.close()

Article text:
	           Lorem ipsum dolor sit

 amet
      
Article text:
	 Another article text...


## Reading XML from BZ2 archives in chunks

In [50]:
#!wget https://dumps.wikimedia.org/plwiki/20240801/plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2

In [37]:
import bz2, sys

def read_chunks(file_name, callback, chunk_size, max_chunks=None):
    """Read a UTF-8 text file chunk by chunk and pass each chunk to ``callback``.

    Files whose name ends with ``.bz2`` are decompressed on the fly. The file
    is opened in text mode, so ``chunk_size`` is a number of characters, not
    bytes. One progress line per delivered chunk is written to ``sys.stderr``.

    Args:
        file_name: path of the file to read (string or path-like object).
        callback: callable invoked with every non-empty chunk (a ``str``).
            It may raise ``StopIteration`` to end reading early.
        chunk_size: maximum number of characters per chunk.
        max_chunks: stop after this many chunks; ``None`` reads to the end.
    """
    # str() lets path-like objects through the suffix check as well.
    opener = bz2.open if str(file_name).endswith(".bz2") else open

    chunk_n = 0
    with opener(file_name, "rt", encoding="utf-8") as fd:
        while max_chunks is None or chunk_n < max_chunks:
            buffer = fd.read(chunk_size)
            if not buffer:
                # End of file: nothing to count, log or deliver.
                break
            chunk_n += 1
            print("Chunk", chunk_n, "of", len(buffer), "characters", file=sys.stderr)
            try:
                callback(buffer)
            except StopIteration:
                # The consumer asked to stop early.
                break

In [46]:
def handle_article_text(text):
    """Print a one-line digest of an article: its length and both of its ends.

    The digest consists of the number of characters followed by the repr of a
    two-element list holding the first ten and the last ten characters.
    """
    head, tail = text[:10], text[-10:]
    edges = [head, tail]
    print(len(text), repr(edges))

In [47]:
# Compressed Wikipedia dump to parse.
FILE_NAME = 'plwiki-20240801-pages-meta-current1.xml-p1p187037.bz2'
# Chunk size handed to read_chunks, which reads from a text-mode stream, so
# this is a number of characters rather than bytes.
CHUNK_SIZE = 1024 * 1024 * 8  # 8 Mi characters per chunk

In [49]:
# Smoke test on the real dump: stream only the first three chunks through the
# parser. parser.close() is not called, since after three chunks the input is
# most likely cut off mid-document.
article_handler = WikipediaHandler(handle_article_text, strip_wikicode)
parser = sax.make_parser()
parser.setContentHandler(article_handler)
read_chunks(FILE_NAME, parser.feed, CHUNK_SIZE, max_chunks=3)