### Engine

In [309]:
import bz2
import os
import xml.etree.ElementTree as ET
import mwparserfromhell as mph
import re

class WikiEngine():
    """Random-access reader for pages stored in a bz2-compressed XML dump.

    The dump at ``path_xml`` is read in *blocks*: byte ranges that each start
    at an offset listed in the companion index file ``path_idx``. Every index
    line has the form ``offset:page_id:title``. The index is loaded lazily on
    first access to ``idx``; ``load_page`` then seeks to a page's block,
    decompresses it, finds the page by id and parses its wikitext with
    ``mwparserfromhell``.

    Attributes:
        page: parsed wikitext of the most recently loaded page, or None.
        path_xml: path to the compressed XML dump.
        path_idx: path to the compressed index file.
        idx: dict mapping page title -> (offset, page_id, block_size).
        links: list of link targets found in ``page`` (empty if no page).
    """

    def __init__(self, path_xml, path_idx):
        """Store the file locations; no file is opened until ``idx`` is read."""
        self._idx = {}      # title -> (offset, page_id, block_size), filled lazily
        self._links = []    # cached link targets of the current page
        self.page = None
        self.path_xml = path_xml
        self.path_idx = path_idx

    def get_idx(self):
        """Return the title index, loading it from ``path_idx`` on first use.

        Returns:
            dict mapping page title (str) to ``(offset, page_id, block_size)``.
        """
        if not self._idx:
            print('Loading index...')
            # Build into a local dict so a failure part-way through parsing
            # never leaves a half-populated index behind.
            entries = {}
            with bz2.BZ2File(self.path_idx, 'rb') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue  # tolerate blank lines
                    # maxsplit=2 keeps colons that belong to the title itself
                    offset, page_id, name = line.split(b':', 2)
                    entries[name.decode('utf-8')] = (int(offset), int(page_id))
            self._idx = entries
            print('Loaded.')
            print('Calculating block sizes...')
            try:
                self.append_block_sizes()
            except Exception:
                # Entries without a block size are unusable; force a clean
                # reload on the next access instead of caching them.
                self._idx = {}
                raise
            print('Calculated.')
        return self._idx
    idx = property(get_idx)

    def append_block_sizes(self):
        """Extend every loaded index entry with the size of its block.

        A block runs from its offset to the next larger offset in the index,
        or to the end of the dump file for the last block. Works on the
        already-loaded entries only (it does nothing when the index is empty)
        and may be called repeatedly without growing the tuples.
        """
        index = self._idx
        # Many titles share one offset, so de-duplicate before pairing each
        # block start with the start of the following block.
        starts = sorted({entry[0] for entry in index.values()})
        ends = starts[1:] + [os.path.getsize(self.path_xml)]
        size_of = {start: end - start for start, end in zip(starts, ends)}
        for name, entry in list(index.items()):
            index[name] = (entry[0], entry[1], size_of[entry[0]])

    def get_links(self):
        """Return the link targets of the current page, computing them once.

        Returns:
            list of ``title`` values from ``page.filter_wikilinks()``; an
            empty list when no page has been loaded.
        """
        if not self._links and self.page is not None:
            self._links = [x.title for x in self.page.filter_wikilinks()]
        return self._links
    links = property(get_links)

    def load_page(self, page_name, filter_top=False):
        """Load and parse one page, making it the current ``page``.

        Args:
            page_name: exact page title as it appears in the index.
            filter_top: when True, keep only the text before the first
                section heading.

        Returns:
            The parsed page, or None (leaving ``page`` untouched) when
            ``page_name`` is not in the index.
        """
        entry = self.idx.get(page_name)
        if entry is None:
            return None
        offset, pid, block_size = entry
        print('Fetching block from dump...')
        xml = WikiEngine.fetch_block(self.path_xml, offset, block_size)
        print('Parsing XML...')
        # A block is a run of sibling <page> elements; wrap it in a single
        # root element so it forms a well-formed document.
        root = ET.fromstring(b'<root>' + xml + b'</root>')
        print('Searching for id ' + str(pid) + '...')
        # A missing page or an empty <text> element yields None; treat both
        # as empty wikitext so the section filter below cannot fail.
        text = WikiEngine.search_id(root, pid) or ''
        print('Parsing wiki...')
        if filter_top:
            text = WikiEngine.filter_top_section(text)
        self.page = mph.parse(text)
        print('Parsed.')
        self._links = []  # invalidate links cached for the previous page
        return self.page

    @staticmethod
    def fetch_block(path, offset, block_size):
        """Read ``block_size`` bytes at ``offset`` and decompress one stream.

        Only the first bz2 stream in the bytes read is decompressed. A block
        that extends to the end of the file may be followed by further
        streams, and including their content would corrupt the XML fragment.

        Raises:
            ValueError: if the data ends before the stream is complete.
        """
        with open(path, 'rb') as file:
            file.seek(offset)
            data = file.read(block_size)
        if not data:
            return b''
        decompressor = bz2.BZ2Decompressor()
        xml = decompressor.decompress(data)
        if not decompressor.eof:
            raise ValueError('Compressed data ended before the '
                             'end-of-stream marker was reached')
        return xml

    @staticmethod
    def search_id(root, pid):
        """Return the revision text of the <page> whose <id> equals ``pid``.

        Returns:
            The text of ``revision/text`` for the first matching page, or
            None when no page matches or the page has no such element.
        """
        for page in root.iter('page'):
            # findtext only looks at direct children, so the nested
            # revision <id> is never confused with the page <id>.
            id_text = page.findtext('id')
            if id_text is not None and int(id_text) == pid:
                return page.findtext('revision/text')
        return None

    @staticmethod
    def filter_top_section(text):
        """Return the part of ``text`` that precedes the first section heading.

        A heading is a line that starts with ``==`` and contains a closing
        ``==``. Anchoring at the start of the line stops inline text such as
        ``a == b == c`` from being mistaken for a heading. The whole text is
        returned when it contains no heading.
        """
        head = re.search(r'^==.*?==', text, re.MULTILINE)
        return text[:head.start()] if head else text

### Test

In [311]:
# Directory that holds the dump partition and its index. Set the WIKI_DUMP_DIR
# environment variable to point at your own copy; the fallback is the path the
# notebook was originally run with.
path_base = os.environ.get('WIKI_DUMP_DIR',
                           '/Users/harangju/Developer/data/wiki/partition/')
name_xml = 'enwiki-20190720-pages-articles-multistream1.xml-p10p30302.bz2'
name_index = 'enwiki-20190720-pages-articles-multistream-index1.txt-p10p30302.bz2'
# os.path.join works whether or not the directory ends with a separator
path_xml = os.path.join(path_base, name_xml)
path_index = os.path.join(path_base, name_index)
wiki = WikiEngine(path_xml, path_index)

In [312]:
# test initial idx
# first access: should load the index file & calculate block sizes
# (expect the 'Loading index...' / 'Calculating block sizes...' messages)
# iterating the idx dict yields page titles; show the first three
l = list(wiki.idx)
l[:3]

Loading index...
Loaded.
Calculating block sizes...
Calculated.


['AccessibleComputing', 'Anarchism', 'AfghanistanHistory']

In [313]:
# test subsequent idx
# this time, should just return the cached indices without printing
# any loading messages
l = list(wiki.idx)
l[:3]

['AccessibleComputing', 'Anarchism', 'AfghanistanHistory']

In [314]:
# test fetch_block()
# should return the decompressed XML of the whole block that contains
# 'Alchemy'; the block begins with the first page stored at that offset
# (not necessarily the Alchemy page itself)
# idx values are (offset, page_id, block_size)
offset, i, block_size = wiki.idx['Alchemy']
xml = WikiEngine.fetch_block(path_xml, offset, block_size)
xml[:100]

b'  <page>\n    <title>AccessibleComputing</title>\n    <ns>0</ns>\n    <id>10</id>\n    <redirect title="'

In [315]:
# test search_id()
# reuses `xml` from the fetch_block() cell; the block is wrapped in a single
# <root> element so the sibling <page> elements form well-formed XML
# 12 is the page id of 'Anarchism' (see the load_page() test output)
root = ET.fromstring(b'<root>' + xml + b'</root>')
page_text = WikiEngine.search_id(root, 12)
page_text[:200]

'{{redirect2|Anarchist|Anarchists|other uses|Anarchists (disambiguation)}}\n{{pp-move-indef}}{{short description|Political philosophy that advocates self-governed societies}}\n{{use dmy dates|date=July 2'

In [316]:
# test filter_top_section
# should return only the text that precedes the first '==...==' heading
# of `page_text` (from the search_id() cell)
WikiEngine.filter_top_section(page_text)

"{{redirect2|Anarchist|Anarchists|other uses|Anarchists (disambiguation)}}\n{{pp-move-indef}}{{short description|Political philosophy that advocates self-governed societies}}\n{{use dmy dates|date=July 2018}}\n{{use British English|date=January 2014}}\n{{anarchism sidebar}}\n{{libertarianism sidebar}}\n{{revolution sidebar}}\n{{basic forms of government}}\n'''Anarchism''' is an [[Anti-authoritarianism|anti-authoritarian]] [[political philosophy]]{{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} that rejects [[Hierarchy|hierarchies]] deemed unjust and advocates their replacement with [[Workers' self-management|self-managed]], [[Self-governance|self-governed]] societies based on voluntary, [[cooperative]] institutions. These institutions are often described as [[Stateless society|stateless societies]],{{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} although several authors have defined them more specifically as distinct institutions based on non-hierarchical or [[F

In [318]:
# test load_page()
# should print each stage (fetch, XML parse, id search, wiki parse) and, with
# filter_top=True, return only the parsed text before the first heading
# alternative titles to try are left commented out; enable one at a time
# page_name = 'AccessibleComputing'
page_name = 'Anarchism'
# page_name = 'Artificial languages'
# page_name = 'Abstract (law)'
# page_name = 'Anxiety'
# page_name = 'Foreign relations of Azerbaijan'
# page_name = 'Alfonso Cuarón'
# page_name = 'ADHD'
page = wiki.load_page(page_name, filter_top=True)
page

Fetching block from dump...
Parsing XML...
Searching for id 12...
Parsing wiki...
Parsed.


"{{redirect2|Anarchist|Anarchists|other uses|Anarchists (disambiguation)}}\n{{pp-move-indef}}{{short description|Political philosophy that advocates self-governed societies}}\n{{use dmy dates|date=July 2018}}\n{{use British English|date=January 2014}}\n{{anarchism sidebar}}\n{{libertarianism sidebar}}\n{{revolution sidebar}}\n{{basic forms of government}}\n'''Anarchism''' is an [[Anti-authoritarianism|anti-authoritarian]] [[political philosophy]]{{sfnm|1a1=McLaughlin|1y=2007|1p=59|2a1=Flint|2y=2009|2p=27}} that rejects [[Hierarchy|hierarchies]] deemed unjust and advocates their replacement with [[Workers' self-management|self-managed]], [[Self-governance|self-governed]] societies based on voluntary, [[cooperative]] institutions. These institutions are often described as [[Stateless society|stateless societies]],{{sfnm|1a1=Craig|1y=2005|1p=14|2a1=Sheehan|2y=2003|2p=85}} although several authors have defined them more specifically as distinct institutions based on non-hierarchical or [[F

In [319]:
# test links
# wiki.links is built from the page loaded by the last load_page() call,
# so this counts the wikilinks in that page's top section only
print('Number of links: ' + str(len(wiki.links)))

Number of links: 25
