### Dependencies

In [146]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            if self._current_tag == 'id' and self._buffer:
                return
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'id'):
            if name == 'id' and name in self._values.keys():
                return
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

### Engine

In [515]:
import bz2
import mwparserfromhell as mph
import re

class WikiEngine():
    def __init__(self, path_xml, path_idx):
        # path_xml is the path to the XML bz2 file
        # path_idx is the path to the index bz2 file
        self._indices = None
        self._handler = WikiXmlHandler()
        self._parser_xml = xml.sax.make_parser()
        self._parser_xml.setContentHandler(self._handler)
        self._path_xml = path_xml
        self.init_indices(path_idx)
        self._page = None
        print('Initialized.')
        
    def init_indices(self, path_idx=None):
        print('Initializing indices...')
        if path_idx:
            self._path_idx = path_idx
        self._idx = {}
        with bz2.BZ2File(self._path_idx, 'rb') as file:
            for line in file:
                [offset, page_id, name] = line.strip().split(b':', 2)
                self._idx[name.decode('utf-8')] = (int(offset), int(page_id))
    
    def get_page(self, page_name):
        if page_name not in self._idx.keys():
            raise NameError('No page with name "' + page_name + '"')
        else:
            page_offset, page_id = self._idx[page_name]
        print('Searching for page "' + page_name + '"'
              ' with id ' + str(page_id) + '...')
        xml = WikiEngine.search_dump(self._path_xml, page_offset, page_id).decode('utf-8')
        xml = WikiEngine.strip_manual_ref(xml)
        print('Parsing XML...')
        self._parser_xml.feed(xml)
        print('Parsing wiki...')
        self._page = mph.parse(self._handler._values['text'], skip_style_tags = True)
        print('Parsed.')
        self._parser_xml.reset()
        return self._page
    
    def get_links(self):
        return [x.title for x in self._page.filter_wikilinks()]
    
    @staticmethod
    def search_dump(path, offset, page_id):
        page_found = False
        xml = b''
        max_search = 200e6
        with bz2.BZ2File(path, 'rb') as file:
            file.seek(offset)
            while (file.tell() - offset) < max_search:
                line = file.readline()
                if b'<page>' in line:
                    xml = b''
                xml = xml + line
                if b'<id>' + str(page_id).encode('utf-8') + b'</id>' in line:
                    print('Found id ' + str(page_id) + 
                          ' at byte offset ' + str(file.tell()) + '.')
                    page_found = True
                if b'</page>' in line and page_found:
                    return xml
        raise NameError('No page found with name "' + page_name +
                        '" in "' + self._path_xml + '".')
    
    @staticmethod
    def strip_manual_ref(text):
        return re.sub(r'(&lt;ref).*?(/&gt;|&gt;)', '', text)

### Test

In [516]:
base_path = '/Users/harangju/Developer/data/wiki/partition/'
xml_name = 'enwiki-20190720-pages-articles-multistream1.xml-p10p30302.bz2'
index_name = 'enwiki-20190720-pages-articles-multistream-index1.txt-p10p30302.bz2'
xml_path = base_path + xml_name
index_path = base_path + index_name

In [517]:
wiki = WikiEngine(xml_path, index_path)

Initializing indices...
Initialized.


In [518]:
wiki.get_page('Autism')
links = wiki.get_links()
print('Number of links: ' + str(len(links)))

Searching for page "Autism" with id 25...
Found id 25 at byte offset 113638.
&lt;ref name=Land2008/&gt;

&lt;ref&gt;

&lt;ref name=NIH2016&gt;

&lt;ref name=DSM5/&gt;

&lt;ref name=NIH2016/&gt;

&lt;ref name=Ch2012/&gt;

&lt;ref name=NIH2016/&gt;

&lt;ref&gt;

&lt;ref name=CCD2007/&gt;

&lt;ref name=San2016/&gt;

&lt;ref&gt;

&lt;ref name=Ji2015/&gt;

&lt;ref name=&quot;Oswald DP 2006&quot;/&gt;

&lt;ref name=&quot;ReferenceA&quot;/&gt;

&lt;ref name=Ste106/&gt;

&lt;ref name=GBD2015Pre/&gt;

&lt;ref name=DSM5 /&gt;

&lt;ref name=&quot;Land2008&quot;&gt;

&lt;ref name=DSM5&gt;

&lt;ref name=Stef2008&gt;

&lt;ref name=Ch2012&gt;

&lt;ref name=&quot;VohrPoggiDavis2017&quot; /&gt;

&lt;ref&gt;

&lt;ref name=&quot;VohrPoggiDavis2017&quot; /&gt;

&lt;ref name=SamsamAhangari2014 /&gt;

&lt;ref name=Rut2005&gt;

&lt;ref name=&quot;Lev2009&quot;/&gt;

&lt;ref name=&quot;Lev2009&quot;&gt;

&lt;ref name=DSM5/&gt;

&lt;ref name=&quot;John2007&quot;&gt;

&lt;ref name=&quot;CCD2007&quot;&gt;

&lt;r

In [424]:
links

['Psychiatry',
 'Interpersonal relationship',
 'communication',
 'bullying',
 'Heritability of autism',
 'Reactive attachment disorder',
 'intellectual disability',
 'schizophrenia',
 'Behavioral therapy',
 'speech therapy',
 'psychotropic medication',
 'Atypical antipsychotics',
 'antidepressants',
 'stimulants',
 'developmental disorder',
 'Heritability of autism',
 'environmental factors',
 'speech therapy',
 'Applied behavior analysis',
 'self-care',
 'neurodevelopmental disorder',
 'autism spectrum disorder',
 '#Classification',
 'toddler',
 'social norms',
 'eye contact',
 'turn-taking',
 'File:Autistic-sweetiepie-boy-with-ducksinarow.jpg',
 'Stereotypy',
 'Compulsive behavior',
 'Ritual#Psychology',
 'Self-injury',
 'Dermatillomania',
 'splinter skill',
 'Savant syndrome',
 'stress (psychological)',
 'Gastrointestinal diseases',
 'comorbidity',
 'File:Single Chromosome Mutations.svg',
 'chromosome abnormalities',
 'Heritability of autism',
 'mutation',
 'Intrauterine growth rest

In [372]:
templates = wiki._page.filter_templates()
names = [x.name for x in templates]
names[:7]

['about',
 'pp-semi-indef',
 'pp-move-indef',
 'short description',
 'bots',
 'Use dmy dates',
 'Use American English']

In [383]:
external_links = wiki._page.filter_external_links()
external_links[:5]

['http://cid.oxfordjournals.org/content/48/4/456.full',
 'https://web.archive.org/web/20131031043545/http://cid.oxfordjournals.org/content/48/4/456.full',
 'http://works.bepress.com/rhea_paul/50',
 'http://www.racgp.org.au/afp/200709/200709angley.pdf',
 'https://web.archive.org/web/20130407205054/http://www.racgp.org.au/afp/200709/200709angley.pdf']

In [425]:
wiki._page.nodes

['{{about|the classic autistic disorder|other conditions sometimes called  " autism " |Autism spectrum|the journal|Autism (journal)}}',
 ' \n ',
 '{{pp-semi-indef}}',
 ' \n ',
 '{{pp-move-indef}}',
 ' \n ',
 '{{short description|neurodevelopmental disorder involving problems with social interaction, communication, and repetitive behaviors}}',
 ' \n ',
 '{{bots|deny=Monkbot}}',
 '  < !-- keep Monkbot from visiting this page -- > \n ',
 '{{Use dmy dates|date=August 2018}}',
 ' \n ',
 '{{Use American English|date=August 2016}}',
 ' \n ',
 '{{Infobox medical condition (new) \n | name            = Autism \n | image           = Autism-stacking-cans 2nd edit.jpg \n | alt             = Boy stacking cans \n | caption         = Repetitively stacking or lining up objects is associated with autism. \n | field           = [[Psychiatry]] \n | symptoms        = Trouble with [[Interpersonal relationship|social interaction]], impaired [[communication]], restricted interests, repetitive behavior \n | co