In [82]:
import mwparserfromhell as mwparser
import gzip
import json

In [83]:
input_file = "../data/labelled.enwiki.w_text.ndjson.gz"
with gzip.open("../data/labelled.enwiki.w_text.ndjson.gz", "rt") as f:
    revisions = [json.loads(line) for line in f]

In [84]:
rev = revisions[0]

In [85]:
parsed = mwparser.parse(rev['wikitext'])

In [86]:
parsed.fil

AttributeError: 'Wikicode' object has no attribute 'fil'

In [87]:
import re

def num_headings(text, level):
    return len([x for x in parsed.filter_headings() if x.level == level])

def clean_template_name(x):
    return str(x.name).strip().lower().replace(" ", "_").replace("-", "_")

def match_template(x, pattern):
    return bool(re.match(pattern, clean_template_name(x)))

def wikilink_title_matches(pattern, link):
    return bool(re.match(pattern, str(link.title)))

def features(revision):
    text = mwparser.parse(revision['wikitext'])

    # number of characters
    revision['chars'] = len(str(text))

    # Content characters are visible characters. Operationalized as characters after
    # mwparserfromhell calls the ``strip_code()`` method.
    plaintext = text.strip_code()
    revision['content_chars'] = len(plaintext)
        
    # Filter external
    revision['external_links'] = len(text.filter_external_links())

    # Total number of headings
    headings = text.filter_headings()
    revision['headings'] = len([x for x in headings if x.level == 2])
    
    # Sub-headings
    revision['sub_headings'] = len([x for x in headings if x.level > 2])

    # Number of wikilinks
    wikilinks = text.filter_wikilinks()
    
    # number of images
    revision['images'] = sum([wikilink_title_matches(r"file|image\:", link) for link in wikilinks])

    # number of categories
    revision['categories'] = sum([wikilink_title_matches("Category\:", link) for link in wikilinks])
    
    # Other wikilinks
    revision['wikilinks'] = len(wikilinks) - sum(revision[k] for k in ("images", "categories"))

    # Templates
    templates = text.filter_templates()
    
    # number of who templates
    revision['who_templates'] = sum([match_template(x, 'who$') for x in templates])

    # main templates
    revision['main_template'] = sum([match_template(x, "main$") for x in templates])
    
    # citation templates
    revision['cite_template'] = sum([match_template(x, "cite") for x in templates])
    
    # has infobox
    revision['infoboxes'] = sum([match_template(x, "infobox") for x in templates])

    # number of citation needed templates
    revision['citation_needed'] = sum(match_template(x, "citation_needed|cn|fact") for x in templates)
    
    # other templates
    revision['other_templates'] = len(templates) - sum(revision[k] for k in ("infoboxes", "cite_templates", "citation_needed",
                                                                             "main_templates", "who_templates"))
    
    # number of ref tags
    revision['ref'] = len([x for x in text.filter_tags() if x.tag == "ref"])

    # number of smartlists (e.g. wikitables)
    revision['smartlists'] = len([x for x in text.nodes if isinstance(x, mwparser.smart_list.SmartList)])
    
    # sections
    sections = parsed.get_sections(flat=True, include_lead=True, include_headings=False)
    non_lead_paras = 0
    non_ref_paras = 0
    non_ref_paras_length = 0
    for i, section in enumerate(sections):
        paras = [mwparser.parse(x) for x in re.split(r'\n{2,}', str(section))]
        if (i == 0):
            revision['lead_paras'] = len(paras)
            revision['lead_char'] = len(str(section))
        else:
            non_lead_paras += 1
            for para in paras:
                if not len([x for x in text.filter_tags() if x.tag == "ref"]):
                    non_ref_paras += 1
                    non_ref_paras_length += len(str(para))
    revision['non_lead_paras'] = non_lead_paras
    revision['non_ref_paras'] = non_ref_paras
    
    # geotagged
    revision['coordinates'] = '#coordinates' in str(text).lower()
    
    # Add plaintext for more features
    revision['text'] = plaintext
    del revision['wikitext'] 


features(rev)


NameError: name 'r' is not defined

In [79]:
"ac" in "abc"

True

False

In [11]:
type(mwparser.parse("{|\n | A || b \n |- \n | CD || EF | }").nodes)

mwparserfromhell.smart_list.SmartList

1. Recursively 

There are three ways to cite in Wikipedia, though only one may be used per article.

1.  Parenthetical reference in plain text
2.  Reference tags `<ref>...</ref>`
3.  Reference and note templates. `{{ref|...}` and `{{note|...}`
4.  Inline hyperlinks (deprecated)
5.  Manual citations
6.  Legal citations
7.  In text attribution