### Dependencies

In [106]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            if self._current_tag == 'id' and self._buffer:
                return
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'id'):
            if name == 'id' and name in self._values.keys():
                return
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

### Engine

In [228]:
import bz2
import mwparserfromhell as mph

class WikiEngine():
    def __init__(self, path_xml, path_idx):
        # path_xml is the path to the XML bz2 file
        # path_idx is the path to the index bz2 file
        self._indices = None
        self._handler = WikiXmlHandler()
        self._parser_xml = xml.sax.make_parser()
        self._parser_xml.setContentHandler(self._handler)
        self._path_xml = path_xml
        self.init_indices(path_idx)
        self._page = None
        
    def init_indices(self, path_idx=None):
        print('Initializing indices...')
        if path_idx:
            self._path_idx = path_idx
        self._idx = {}
        with bz2.BZ2File(self._path_idx, 'rb') as file:
            for line in file:
                [offset, page_id, name] = line.strip().split(b':', 2)
                self._idx[name.decode('utf-8')] = (int(offset), int(page_id))
    
    def get_links(self, page_name):
        if page_name not in self._idx.keys():
            raise NameError('No page with name "' + page_name + '"')
        else:
            page_offset, page_id = self._idx[page_name]
        print('Searching for pages with name "' + page_name + 
              '" with id ' + str(page_id) + '...')
        xml = WikiEngine.search_dump(self._path_xml, page_offset, page_id)
        if xml == None:
            raise NameError('No page found with name "' + page_name +
                            '" in "' + self._path_xml + '".')
        print('Parsing XML & wikicode...')
        self._parser_xml.feed(xml)
        self._page = mph.parse(self._handler._values['text'], skip_style_tags = True)
        links = [x.title for x in self._page.filter_wikilinks()]
        self._parser_xml.reset()
        return links
    
    @staticmethod
    def search_dump(path, offset, page_id):
        page_found = False
        xml = b''
        max_search = 200e6
        with bz2.BZ2File(path, 'rb') as file:
            file.seek(offset)
            print('Byte offset (start):\t' + str(file.tell()))
            while (file.tell() - offset) < max_search:
                line = file.readline()
                if b'<page>' in line:
                    xml = b''
                xml = xml + line
                if b'<id>' + str(page_id).encode('utf-8') + b'</id>' in line:
                    print('Byte offset (end):\t' + str(file.tell()))
                    page_found = True
                if b'</page>' in line and page_found:
                    return xml
            print('Byte offset (end):\t' + str(file.tell()))

### Test

In [229]:
# base_path = '/Users/harangju/Developer/data/wiki/'
# xml_name = 'enwiki-20190801-pages-articles-multistream.xml.bz2'
# index_name = 'enwiki-20190801-pages-articles-multistream-index.txt'
base_path = '/Users/harangju/Developer/data/wiki/partition/'
xml_name = 'enwiki-20190720-pages-articles-multistream1.xml-p10p30302.bz2'
index_name = 'enwiki-20190720-pages-articles-multistream-index1.txt-p10p30302.bz2'
xml_path = base_path + xml_name
index_path = base_path + index_name

In [230]:
wiki = WikiEngine(xml_path, index_path)

Initializing indices...


In [231]:
links = wiki.get_links('Autism')

Searching for pages with name "Autism" with id 25...
Byte offset (start):	617
Byte offset (end):	113638
Parsing XML & wikicode...


In [234]:
headings = wiki._page.filter_headings()
headings

[]

In [220]:
links

['Psychiatry',
 'Interpersonal relationship',
 'communication',
 'bullying',
 'Heritability of autism',
 'Reactive attachment disorder',
 'intellectual disability',
 'schizophrenia',
 'Behavioral therapy',
 'speech therapy',
 'psychotropic medication',
 'Atypical antipsychotics',
 'antidepressants',
 'stimulants',
 'developmental disorder',
 'developmental milestones',
 'Regressive autism',
 'Heritability of autism',
 'environmental factors',
 'rubella',
 'valproic acid',
 'cocaine',
 'pesticides',
 'air pollution',
 'fetal growth restriction',
 'autoimmune disease',
 'Controversies in autism',
 'Causes of autism',
 'MMR vaccine controversy',
 'nerve cell',
 'synapse',
 'DSM-5',
 'Asperger syndrome',
 'pervasive developmental disorder not otherwise specified',
 'autism spectrum disorder',
 'speech therapy',
 'Applied behavior analysis',
 'self-care',
 'Societal and cultural aspects of autism',
 'Autism rights movement',
 'The Psychologist (magazine)',
 'British Psychological Society',
