In [4]:
import mwparserfromhell as mwparser
import gzip
import json

In [5]:
import re

def clean_template_name(x):
    """Return a normalised version of the name of template node ``x``.

    ``x.name`` is converted to a string, lower-cased and stripped of
    surrounding whitespace; spaces and hyphens are then replaced by
    underscores.
    """
    normalised = str(x.name).lower().strip()
    for separator in (" ", "-"):
        normalised = normalised.replace(separator, "_")
    return normalised

class WikicodeConverter:
    """Convert MediaWiki markup into plain text.

    Parameters
    ----------
    parser:
        Parser object stored on the instance as ``_parser``. When omitted, a
        new ``mwparser.parser.Parser()`` is created for each converter.
    tags_keep:
        Tag names (lower case) that are emitted as a bare ``<tag>`` marker.
    tags_remove:
        Tag names (lower case) that are dropped together with their content.
    templates_keep:
        Regular-expression patterns, tried with ``re.match`` against the
        normalised template name; matching templates are emitted as
        ``{{name}}``.
    templates_remove:
        Regular-expression patterns, tried with ``re.match`` against the
        normalised template name; matching templates are dropped.
    """

    _RE_IMAGE = re.compile('^(?:File|Image|Media):', flags=re.I)
    _RE_CATEGORY = re.compile('^Category:', flags=re.I)

    def __init__(self, parser=None, tags_keep=None, tags_remove=None,
                 templates_keep=None, templates_remove=None):
        # Defaults are created per instance: default argument values are
        # evaluated once, so list/Parser defaults would be shared by every
        # converter.
        self._parser = parser if parser is not None else mwparser.parser.Parser()
        self.tags_keep = tags_keep if tags_keep is not None else []
        self.tags_remove = tags_remove if tags_remove is not None else []
        self.templates_keep = templates_keep if templates_keep is not None else []
        self.templates_remove = templates_remove if templates_remove is not None else []

    def _strip_tag(self, x):
        """Return the text replacement for tag node ``x``.

        Returns ``"<tag>"`` for tags in ``tags_keep``, ``None`` for tags in
        ``tags_remove``, and ``x.__strip__()`` otherwise.
        """
        tag = str(x.tag).lower().strip()
        if tag in self.tags_keep:
            return f"<{tag}>"
        if tag in self.tags_remove:
            return None
        return x.__strip__()

    def _strip_template(self, x):
        """Return the text replacement for template node ``x``.

        The template name is lower-cased, stripped, and has spaces replaced
        by underscores. Returns ``"{{name}}"`` when a ``templates_keep``
        pattern matches, ``None`` when a ``templates_remove`` pattern
        matches, and ``x.__strip__()`` otherwise.
        """
        name = str(x.name).lower().strip().replace(" ", "_")
        # An empty pattern list joins to "", which matches every name, so
        # both lists are only consulted when they are non-empty.
        if self.templates_keep and re.match('|'.join(self.templates_keep), name):
            return "{{" + name + "}}"
        if self.templates_remove and re.match('|'.join(self.templates_remove), name):
            return None
        return x.__strip__()

    def _is_image(self, x):
        """Truthy when ``x`` is a wikilink whose title starts with File:/Image:/Media:."""
        return self._is_wikilink(x) and self._RE_IMAGE.match(str(x.title))

    def _is_category(self, x):
        """Truthy when ``x`` is a wikilink whose title starts with Category:."""
        return self._is_wikilink(x) and self._RE_CATEGORY.match(str(x.title))

    def _is_ref(self, x):
        """Return True when ``x`` is a ``ref`` tag or a ``ref``/``note`` template."""
        if self._is_tag(x, "ref$"):
            return True
        if self._is_template(x, pattern="(ref|note)$"):
            return True
        return False

    def _is_tag(self, x, pattern=None):
        """Truthy when ``x`` is a Tag node, optionally one whose tag matches ``pattern``.

        ``pattern`` is applied case-insensitively with ``re.match``.
        """
        out = isinstance(x, mwparser.nodes.Tag)
        if out and pattern is not None:
            out = re.match(pattern, str(x.tag), re.I)
        return out

    def _is_heading(self, x):
        """Return True when ``x`` is a Heading node."""
        return isinstance(x, mwparser.nodes.Heading)

    def _is_wikilink(self, x):
        """Return True when ``x`` is a Wikilink node."""
        return isinstance(x, mwparser.nodes.Wikilink)

    def _is_template(self, x, pattern=None):
        """Truthy when ``x`` is a Template node, optionally one whose name matches ``pattern``.

        ``pattern`` is applied case-insensitively with ``re.match`` to the
        name produced by ``clean_template_name``.
        """
        out = isinstance(x, mwparser.nodes.Template)
        if out and pattern is not None:
            out = re.match(pattern, clean_template_name(x), re.I)
        return out

    def convert(self, content):
        """Convert Wiki markup to plain text.

        ``content`` is parsed with ``mwparser.parse``. Each section is
        rendered node by node: references become ``<ref>``, tables,
        categories and images are dropped, headings are rewritten as
        ``== title ==`` lines, and tags/templates follow the keep/remove
        settings of this converter. Returns the concatenated section texts.
        """
        wikicode = mwparser.parse(content)
        texts = []
        sections = wikicode.get_sections(flat=True, include_lead=True,
                                         include_headings=True)
        # concatenate the stripped text of each section
        for section in sections:
            section_text = []
            for node in section.nodes:
                # references need to precede tags and templates since they
                # have both forms
                if self._is_ref(node):
                    nodestr = "<ref>"
                elif self._is_tag(node):
                    if str(node.tag).lower() == "table":
                        nodestr = None
                    else:
                        nodestr = self._strip_tag(node)
                elif self._is_template(node):
                    nodestr = self._strip_template(node)
                elif self._is_category(node) or self._is_image(node):
                    nodestr = None
                elif self._is_heading(node):
                    nodestr = "{level} {title} {level}\n".format(level='=' * node.level, title=str(node.title))
                else:
                    # Keep a None result as None so the node is skipped
                    # instead of contributing the literal text "None".
                    nodestr = node.__strip__(normalize=True)
                if nodestr is not None:
                    section_text.append(str(nodestr))
            section_text = ''.join(section_text)
            # collapse runs of consecutive references into a single marker
            section_text = re.sub('<ref>(\\s*<ref>)*', ' <ref> ', section_text)
            texts.append(section_text)
        return ''.join(texts)


In [10]:
# Patterns for inline maintenance templates that should survive cleaning as
# "{{name}}" markers; they are matched against the normalised template name.
templates_keep = ["citation_needed$", "peacock_term$", "weasel_inline$", "vague$", "lopsided$", "pov_statement$", "dubious$"]
converter = WikicodeConverter(tags_remove=["img", "table"],
                              templates_keep=templates_keep,
                              templates_remove=["infobox", "reflist", "notelist"])
# NOTE(review): `rev` is not defined in any visible cell; it has to be loaded
# (presumably a revision record with a 'wikitext' key) before this cell runs.
# convert() already returns a single string, so no extra join is needed.
cleaned = converter.convert(rev['wikitext'])

NameError: name 'rev' is not defined

In [34]:
import spacy
# Load spaCy's `en_core_web_lg` pipeline; `nlp` is reused by the cells below.
nlp = spacy.load('en_core_web_lg')


In [35]:
from spacy.symbols import ORTH, LEMMA, POS, TAG

# Register wiki heading markers ('==' up to '=====') and the placeholder
# tags as tokenizer special cases so each is kept as one token tagged SYM.
tags = ['ref']
heading_markers = ['=' * width for width in range(2, 6)]
tag_markers = [f"<{name}>" for name in tags]
for marker in heading_markers + tag_markers:
    nlp.tokenizer.add_special_case(marker, [{ORTH: marker, TAG: 'SYM'}])


In [37]:
# Run the pipeline over the cleaned text and print one sentence per line.
doc = nlp(cleaned)
for sentence in doc.sents:
    print(sentence)


M-116 is a  long state trunkline highway in the US state of Michigan that begins in Ludington at an intersection with US Highway 10 (US 10) at James Street and Ludington Avenue.
The road travels northward, much of it along the shore of Lake Michigan before reaching its terminus at the entrance to Ludington State Park.
The roadway has been in the state highway system since the late 1920s.
It has been realigned a few times, most recently in the late 1990s.


== Route description ==

M-116 begins in Ludington at the intersection of James Street and Ludington Avenue.
It is at this intersection where US 10 turns southward and heads to the Ludington–Manitowoc ferry docks.  
From here, M-116 continues westward along Ludington Avenue through a mixed commercial area for about one-half mile (0.8 km) before turning north on Lakeshore Drive.
The road continues as North Lakeshore Drive  through resident areas and pass the Lakeview Cemetery.
Before passing the Lincoln Hills Golf Club, M-116 crosses

In [114]:
# Call filter_templates() on `parsed`, which is not defined in any visible cell
# (presumably a mwparserfromhell Wikicode object — TODO confirm).
parsed.filter_templates()

0