## Get dumps

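Go to the [English Wikipedia dumps](https://dumps.wikimedia.org/enwiki) page and download the latest dump. The code below expects the `pages-articles-multistream` XML archive and its matching `pages-articles-multistream-index` file.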

## Parse dumps

In [None]:
# Directory holding the downloaded dump, plus the file names of the
# decompressed XML dump and its multistream index.
base_path = '/Users/harangju/Developer/data/wiki/'
xml_name = 'enwiki-20190801-pages-articles-multistream.xml'
index_name = 'enwiki-20190801-pages-articles-multistream-index.txt'
# Alternative configuration: uncomment to work on a single partition of the
# dump instead of the full archive.
# base_path = '/Users/harangju/Developer/data/wiki/partition/'
# xml_name = 'enwiki-20190801-pages-articles-multistream1.xml-p10p30302'
# index_name = 'enwiki-20190801-pages-articles-multistream-index1.txt-p10p30302'

# Full paths to the decompressed files ...
xml_path = f'{base_path}{xml_name}'
index_path = f'{base_path}{index_name}'
# ... and to their bzip2-compressed counterparts.
xml_bz2_path = f'{xml_path}.bz2'
index_bz2_path = f'{index_path}.bz2'

### Get indices

How to use multistream?
For multistream, you can get an index file, pages-articles-multistream-index.txt.bz2. Each line of this index has three colon-separated fields: the first is the number of bytes to seek into the compressed archive pages-articles-multistream.xml.bz2, the second is the article ID, and the third is the article title.

Cut a small part out of the archive with dd using the byte offset as found in the index. You could then either bzip2 decompress it or use bzip2recover, and search the first file for the article ID.

In [None]:
# Load the decompressed multistream index into memory as a list of
# (offset, article id, title) string triples, one per index line.
indices = []
with open(index_path, 'r', encoding='utf-8') as index_file:
    for record in index_file:
        # Split on the first two colons only, so that colons inside the
        # title (the last field) are preserved.
        offset, page_id, title = record.strip().split(':', 2)
        indices.append((offset, page_id, title))

In [None]:
# Report how many index records were loaded, then display a small sample of
# them (entries 90 through 100) as the cell's output.
print(len(indices))
indices[90:101]

### Unzip with bzcat

In [None]:
import subprocess

# Decompress the dump by piping the .bz2 archive through the external
# `bzcat` tool and collecting its output as a list of byte lines.
# NOTE: every decompressed line is held in memory at once.
with open(xml_bz2_path, 'rb') as archive:
    # Using Popen as a context manager closes the pipe and waits for the
    # child process on exit, so neither the file handle nor the process
    # is left dangling.
    with subprocess.Popen(['bzcat'],
                          stdin=archive,
                          stdout=subprocess.PIPE) as proc:
        lines = list(proc.stdout)
# Surface a bzcat failure instead of silently continuing with partial output.
if proc.returncode != 0:
    raise subprocess.CalledProcessError(proc.returncode, proc.args)
lines[:3]

### Unzip with bz2

In [None]:
import bz2

In [None]:
# Scratch check: convert the integer 10 to its decimal string and encode it
# as UTF-8 bytes; the notebook displays the resulting bytes object.
str(10).encode('utf-8')

In [None]:
# bz2 incremental
# Read only the first 1000 lines of the compressed dump, decompressing on the
# fly, and keep them as one bytes object in `data`.
with bz2.BZ2File(xml_bz2_path, 'rb') as f:
    # Build `data` from scratch with a single join: the previous version
    # appended to `data` without ever initialising it. readline() returns
    # b'' at end of file, so a shorter archive simply yields fewer lines.
    data = b''.join(f.readline() for _ in range(1000))
data[:1000]

In [None]:
# Break the sample into lines while keeping each line terminator, so that
# concatenating (or feeding) the pieces reproduces `data` exactly. Dropping
# the terminators would remove every line break from the XML text content.
lines = data.splitlines(keepends=True)
lines[:5]

#### Parse XML

In [None]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Captures the character data of every ``<title>`` and ``<text>`` element
    and, each time a ``<page>`` element closes, appends a ``(title, text)``
    tuple to ``self._pages``.

    The captured chunks are concatenated exactly as the parser delivers
    them, so the input must be fed to the parser unmodified (including its
    line breaks) for the collected text to match the document.
    """

    def __init__(self):
        super().__init__()
        # Chunks of character data for the element currently being captured.
        self._buffer = None
        # Most recently completed value for each captured tag name.
        self._values = {}
        # Name of the element being captured, or None when not capturing.
        self._current_tag = None
        # Collected (title, text) tuples, one per closed <page>.
        self._pages = []

    def characters(self, content):
        """Buffer character data while inside a captured element.

        The parser may deliver the text of one element in several calls
        (for example around entity references), so chunks are accumulated
        here and joined when the element closes.
        """
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when a ``<title>`` or ``<text>`` element opens."""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store a finished capture and emit a page when ``<page>`` closes.

        Raises:
            KeyError: if a ``<page>`` closes before both a title and a text
                value have been captured.
        """
        if name == self._current_tag:
            # Chunks are contiguous pieces of one string: join them with no
            # separator so no spurious spaces are inserted at chunk borders.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text of unrelated elements is not buffered.
            self._current_tag = None

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

In [None]:
# Create the page-collecting handler and attach it to a new SAX parser.
handler = WikiXmlHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Feed the sample to the parser piece by piece; pages are collected in
# handler._pages as their closing tags are seen. parser.close() is not
# called here.
for chunk in lines:
    parser.feed(chunk)

In [None]:
# List the titles of all pages collected so far.
print([title for title, _ in handler._pages])

#### Parse wiki

In [None]:
import mwparserfromhell as mph

In [None]:
# Take the second collected page: show its title and parse its text with
# mwparserfromhell.
page_title, page_text = handler._pages[1]
print(page_title)
wiki = mph.parse(page_text)

In [None]:
# Collect the title of every wikilink found in the parsed page, report how
# many there are, and display the first five.
found_links = wiki.filter_wikilinks()
wikilinks = [link.title for link in found_links]
print(f'There are {len(wikilinks)} wikilinks.')
wikilinks[:5]

In [None]:
# Gather the templates used in the parsed page, report how many there are,
# and print the names of the first five.
templates = wiki.filter_templates()
print(f'There are {len(templates)} templates.')
first_templates = templates[:5]
for template in first_templates:
    print(template.name)