### Dependencies

In [3]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            if self._current_tag == 'id' and self._buffer:
                return
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'id'):
            if name == 'id' and name in self._values.keys():
                return
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)
        if name == 'page':
            self._pages.append((self._values['title'], self._values['text'], self._values['id']))

### Engine

In [132]:
import bz2
import mwparserfromhell as mph

class WikiEngine():
    def __init__(self, path_xml, path_idx):
        # path_xml is path to XML bz2 file
        # path_idx is path to the index txt file
        self._indices = None
        self._handler = WikiXmlHandler()
        self._parser_xml = xml.sax.make_parser()
        self._parser_xml.setContentHandler(self._handler)
        self._path_xml = path_xml
        self.init_indices(path_idx)
        
    def init_indices(self, path_idx=None):
        print('Initializing indices...')
        if path_idx:
            self._path_idx = path_idx
        self._idx = {}
        with open(self._path_idx, 'r', encoding = 'utf-8') as file:
            for line in file:
                [offset, page_id, name] = line.strip().split(':', 2)
                self._idx[name] = (int(offset), int(page_id))
    
    def get_links(self, page_name):
        if page_name not in self._idx.keys():
            raise NameError('No page with name "' + page_name + '"')
        else:
            page_offset, page_id = self._idx[page_name]
        print('Searching for links with name "' + page_name + '"')
        xml = WikiEngine.search_dump(self._path_xml, page_offset, page_id)
        if xml == None:
            raise NameError('No page found with name "' + page_name +
                            '" in "' + self._path_xml + '".')
        self._parser_xml.feed(xml)
        page = mph.parse(self._handler._pages[0][1])
        links = [x.title for x in wiki.filter_wikilinks()]
        self._parser_xml.reset()
        return links
    
    @staticmethod
    def search_dump(path, offset, page_id):
        page_found = False
        xml = b''
        with bz2.BZ2File(path, 'rb') as file:
            file.seek(offset)
            for _ in range(10000):
                line = file.readline()
                if b'<page>' in line:
                    xml = b''
                xml = xml + line
                if b'<id>' + str(page_id).encode('utf-8') + b'</id>' in line:
                    page_found = True
                if b'</page>' in line and page_found:
                    return xml

### Test

In [128]:
# base_path = '/Users/harangju/Developer/data/wiki/'
# xml_name = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
# index_name = 'enwiki-20190801-pages-articles-multistream-index.txt'
base_path = '/Users/harangju/Developer/data/wiki/partition/'
xml_name = 'enwiki-20190801-pages-articles-multistream1.xml-p10p30302.bz2'
index_name = 'enwiki-20190801-pages-articles-multistream-index1.txt-p10p30302'
xml_path = base_path + xml_name
index_path = base_path + index_name

In [129]:
wiki = WikiEngine(xml_path, index_path)

Initializing indices...


In [130]:
wiki.get_links('AccessibleComputing')

Searching for links with name "AccessibleComputing"


'#REDIRECT [[Computer accessibility]] \n \n {{R from move}} \n {{R from CamelCase}} \n {{R unprintworthy}}'