### Dependencies

In [None]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that captures a wiki page's title, id and text.

    After a ``<page>`` element has been parsed, ``_values`` maps the tag
    names ``'title'``, ``'id'`` and ``'text'`` to their character content.
    Only the first ``<id>`` inside a page is recorded; later ``<id>``
    elements in the same page are ignored.
    """

    # Tags whose character content is recorded.
    _CAPTURED_TAGS = ('title', 'text', 'id')

    def __init__(self):
        super().__init__()
        self._buffer = None        # chunks of the element being captured
        self._values = {}          # tag name -> captured content
        self._current_tag = None   # tag being captured, or None

    def characters(self, content):
        """Collect character data while inside a captured element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name == 'page':
            # A new page starts: drop values left over from the previous
            # page so that a reused handler records this page's own id.
            self._values = {}
            self._current_tag = None
            self._buffer = None
            return
        if name in self._CAPTURED_TAGS:
            if name == 'id' and name in self._values:
                # Keep the first id of the page; skip later ones.
                return
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            # The parser may deliver one text node in several chunks, so
            # the chunks are joined without a separator to reproduce the
            # original text exactly.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so content outside the element is ignored.
            self._current_tag = None
            self._buffer = None

### Engine

In [None]:
import bz2
import mwparserfromhell as mph
import re

class WikiEngine():
    """Load and parse single pages from a bz2-compressed wiki XML dump.

    The index file is read lazily on first access of ``idx``. ``load_page``
    looks a page up in that index, extracts its XML from the dump, and
    parses the wikitext with mwparserfromhell.
    """

    def __init__(self, path_xml, path_idx):
        # path_xml is the path to the XML bz2 file
        # path_idx is the path to the index bz2 file
        self._idx = None      # cached index: page name -> (offset, page id)
        self.page = None      # parsed wikitext of the last loaded page
        self._links = None    # cached wikilink titles of ``page``
        self.path_xml = path_xml
        self.path_idx = path_idx
        self.handler = WikiXmlHandler()
        self.parser = xml.sax.make_parser()
        self.parser.setContentHandler(self.handler)

    def get_idx(self):
        """Return the index, loading it from ``path_idx`` on first use.

        Each index line has the form ``offset:page_id:name``; the name may
        itself contain colons. Returns a dict mapping page name to
        ``(offset, page_id)``, or None when no index path is set.
        """
        if self._idx is None and self.path_idx:
            print('Loading index...')
            index = {}
            with bz2.BZ2File(self.path_idx, 'rb') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue  # tolerate blank lines
                    offset, page_id, name = line.split(b':', 2)
                    index[name.decode('utf-8')] = (int(offset), int(page_id))
            # Assign only after a complete read so that a failed load is
            # not cached as a partial index.
            self._idx = index
        return self._idx
    idx = property(get_idx)

    def load_page(self, page_name):
        """Load and parse the page called ``page_name``.

        Returns the parsed page (also stored in ``self.page``), or None
        when the name is not in the index; in that case ``self.page`` is
        left unchanged.

        Raises NameError (from ``search_dump``) if the page is listed in
        the index but cannot be found in the dump.
        """
        index = self.idx
        if not index or page_name not in index:
            return None
        page_offset, page_id = index[page_name]
        print('Searching for page "{}" with id {}...'.format(page_name, page_id))
        raw = WikiEngine.search_dump(self.path_xml, page_offset, page_id)
        page_xml = WikiEngine.strip_manual_ref(raw.decode('utf-8'))
        print('Loaded.')
        print('Parsing XML...')
        # Use a fresh handler per page so values from a previously loaded
        # page cannot leak into this one.
        self.handler = WikiXmlHandler()
        self.parser.setContentHandler(self.handler)
        try:
            self.parser.feed(page_xml)
            text = self.handler._values['text']
        finally:
            # Always leave the parser ready for the next page, even if
            # this one failed to parse.
            self.parser.reset()
        print('Parsing wiki text...')
        self.page = mph.parse(text)
        print('Parsed.')
        self._links = None  # invalidate links cached for the previous page
        return self.page

    def get_links(self):
        """Return the wikilink titles of the loaded page, cached.

        Returns None when no page has been loaded.
        """
        if self._links is None and self.page is not None:
            self._links = [x.title for x in self.page.filter_wikilinks()]
        return self._links
    links = property(get_links)

    @staticmethod
    def search_dump(path, offset, page_id, max_search=200e6):
        """Return the raw XML bytes of the page with id ``page_id``.

        Reading starts at ``offset`` and scans line by line for at most
        ``max_search`` bytes. The result spans from the line containing
        ``<page>`` through the line containing ``</page>``.

        Only the first ``<id>`` line after ``<page>`` is compared, so a
        revision or contributor id that happens to equal ``page_id`` does
        not select the wrong page.

        Raises NameError if the page is not found before the end of the
        file or the search limit is reached.
        """
        # NOTE(review): BZ2File.seek appears to position within the
        # decompressed data; if the index stores compressed-file offsets
        # (as multistream indexes presumably do), the scan starts before
        # the page and relies on max_search to reach it -- confirm.
        target = b'<id>' + str(page_id).encode('utf-8') + b'</id>'
        lines = []
        in_page = False
        id_checked = False
        page_found = False
        with bz2.BZ2File(path, 'rb') as file:
            file.seek(offset)
            while (file.tell() - offset) < max_search:
                line = file.readline()
                if not line:
                    break  # end of file; without this the loop never ends
                if b'<page>' in line:
                    lines = []
                    in_page = True
                    id_checked = False
                lines.append(line)
                if in_page and not id_checked and b'<id>' in line:
                    id_checked = True
                    if target in line:
                        print('Found at byte offset ' + str(file.tell()) + '.')
                        page_found = True
                if b'</page>' in line:
                    if page_found:
                        return b''.join(lines)
                    in_page = False
        raise NameError('No page found with id {} in "{}".'.format(page_id, path))

    @staticmethod
    def strip_manual_ref(text):
        """Remove XML-escaped ``<ref ...>``, ``</ref>`` and ``<ref ... />`` tags.

        Only the tags are removed; text between an opening and a closing
        tag is kept.
        """
        return re.sub(r'&lt;/*ref.*?(/&gt;|&gt;)', '', text)

    @staticmethod
    def filter_top_section(text):
        """Split ``text`` at the first ``==...==`` heading.

        Returns ``(top, rest)``; ``rest`` is empty when there is no heading.
        """
        head = re.search(r'==.*?==', text)
        idx = head.span(0)[0] if head else len(text)
        return (text[:idx], text[idx:])

### Test

In [None]:
# Locations of one partition of the dump and its matching index file.
base_path = '/Users/harangju/Developer/data/wiki/partition/'
xml_name = 'enwiki-20190720-pages-articles-multistream1.xml-p10p30302.bz2'
index_name = 'enwiki-20190720-pages-articles-multistream-index1.txt-p10p30302.bz2'
# Full paths are the base directory followed by each file name.
xml_path, index_path = [base_path + file_name
                        for file_name in (xml_name, index_name)]
wiki = WikiEngine(path_xml=xml_path, path_idx=index_path)

In [None]:
# Load one page and report how many wikilinks it contains.
wiki.load_page('AccessibleComputing')
link_count = len(wiki.links)
print('Number of links: ' + str(link_count))