Parse the wikicode in the WP10 subset of data and extract features from the wikitext.

In [18]:
import mwparserfromhell as mwparser
import gzip
import json
import pandas as pd
from joblib import Parallel, delayed

In [15]:
import re

def num_headings(text, level):
    """Return how many headings of depth ``level`` the parsed document has.

    ``text`` must provide ``filter_headings()``; every heading it yields is
    expected to carry a ``level`` attribute.
    """
    return sum(1 for heading in text.filter_headings() if heading.level == level)

def clean_template_name(x):
    """Normalize a template's name for matching.

    The name is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and every space or hyphen is replaced by an underscore.
    """
    normalized = str(x.name).strip().lower()
    # Map both separators to "_" in a single pass.
    return normalized.translate(str.maketrans(" -", "__"))

def match_template(x, pattern):
    """Tell whether the cleaned template name of ``x`` matches ``pattern``.

    Matching uses ``re.match``, so the pattern is anchored at the start of
    the name only, and it is case-insensitive.
    """
    name = clean_template_name(x)
    return re.match(pattern, name, flags=re.IGNORECASE) is not None

def wikilink_title_matches(pattern, link):
    """Tell whether the title of ``link`` matches ``pattern``.

    Matching uses ``re.match`` on ``str(link.title)``, so the pattern is
    anchored at the start of the title only, and it is case-insensitive.
    """
    title = str(link.title)
    found = re.match(pattern, title, flags=re.IGNORECASE)
    return found is not None

def featurize(revision):
    """Create features for each revision

    Parameters
    -----------
    revision: `dict`
        A dict with revision information. It must contain a
        "wikitext" key. The computed features are added as new keys,
        the stripped plain text is stored under "text", and the
        "wikitext" key is removed.

    Returns
    --------
    None
        The dictionary ``revision`` is altered in place.

    """
    text = mwparser.parse(revision['wikitext'])

    # number of characters
    revision['chars'] = len(str(text))

    # Content characters are visible characters. Operationalized as characters after
    # mwparserfromhell calls the ``strip_code()`` method.
    plaintext = text.strip_code()
    revision['content_chars'] = len(plaintext)

    # Number of external links
    revision['external_links'] = len(text.filter_external_links())

    # Top-level (level 2) headings
    headings = text.filter_headings()
    revision['headings'] = len([x for x in headings if x.level == 2])

    # Sub-headings (anything deeper than level 2)
    revision['sub_headings'] = len([x for x in headings if x.level > 2])

    # All wikilinks; images and categories are split out below
    wikilinks = text.filter_wikilinks()

    # number of images. The alternation is grouped so that the colon is required
    # for both prefixes; an ungrouped ``file|image:`` would match any title that
    # merely starts with "file".
    revision['images'] = sum(wikilink_title_matches(r"(?:file|image):", link) for link in wikilinks)

    # number of categories
    revision['categories'] = sum(wikilink_title_matches(r"category:", link) for link in wikilinks)

    # Other wikilinks
    revision['wikilinks'] = len(wikilinks) - sum(revision[k] for k in ("images", "categories"))

    # Templates
    templates = text.filter_templates()

    # number of who templates
    revision['who_templates'] = sum(match_template(x, r"who$") for x in templates)

    # main templates
    revision['main_templates'] = sum(match_template(x, r"main$") for x in templates)

    # citation templates (any template whose name starts with "cite")
    revision['cite_templates'] = sum(match_template(x, r"cite") for x in templates)

    # infoboxes (any template whose name starts with "infobox")
    revision['infoboxes'] = sum(match_template(x, r"infobox") for x in templates)

    # number of citation needed templates. Anchored at the end like the "who" and
    # "main" patterns so unrelated names that merely start with "cn" or "fact"
    # are not counted.
    revision['citation_needed'] = sum(match_template(x, r"(?:citation_needed|cn|fact)$") for x in templates)

    # other templates
    revision['other_templates'] = len(templates) - sum(
        revision[k] for k in ("infoboxes", "cite_templates", "citation_needed",
                              "main_templates", "who_templates"))

    # number of ref tags
    revision['ref'] = len([x for x in text.filter_tags() if x.tag == "ref"])

    # number of smartlists (e.g. wikitables)
    # NOTE(review): this counts top-level nodes that are SmartList instances; it
    # is unclear that any node ever is one, so this may always be 0. If tables
    # were the intent, confirm and count table tags instead.
    revision['smartlists'] = len([x for x in text.nodes if isinstance(x, mwparser.smart_list.SmartList)])

    # Sections: the first one returned is treated as the lead.
    sections = text.get_sections(flat=True, include_lead=True, include_headings=False)
    # Defaults so the keys exist even when no section is returned.
    revision['lead_paras'] = 0
    revision['lead_char'] = 0
    non_lead_paras = 0
    non_ref_paras = 0
    non_ref_paras_length = 0
    for i, section in enumerate(sections):
        section_text = str(section)
        # Paragraphs are separated by blank lines; whitespace-only fragments
        # (e.g. the trailing one before the next heading) are not paragraphs.
        paras = [p for p in re.split(r'\n{2,}', section_text) if p.strip()]
        if i == 0:
            revision['lead_paras'] = len(paras)
            revision['lead_char'] = len(section_text)
            continue
        # Count paragraphs, not sections.
        non_lead_paras += len(paras)
        for para in paras:
            # Look for <ref> tags inside this paragraph only; scanning the whole
            # document here would give the same answer for every paragraph.
            para_tags = mwparser.parse(para).filter_tags()
            if not any(tag.tag == "ref" for tag in para_tags):
                non_ref_paras += 1
                non_ref_paras_length += len(para)
    revision['non_lead_paras'] = non_lead_paras
    revision['non_ref_paras'] = non_ref_paras
    # Total characters in non-lead paragraphs that carry no <ref> tag.
    revision['non_ref_paras_length'] = non_ref_paras_length

    # geotagged
    revision['coordinates'] = '#coordinates' in str(text).lower()

    # Add plaintext for more features
    revision['text'] = plaintext
    del revision['wikitext']


In [16]:
from multiprocessing import cpu_count
from joblib import Parallel, delayed
import itertools

# Gzipped file read line by line; each line is parsed as one JSON revision.
input_file = "../data/enwiki.labeling_revisions.w_text.nettrom_30k.ndjson.gz"
# Leave one core free, but never drop to 0 workers on a single-core host:
# joblib rejects n_jobs=0.
workers = max(1, cpu_count() - 1)
# Maximum number of lines to process; None means no limit.
n = None

def load_and_featurize(line):
    """Decode one JSON line into a revision dict, featurize it, and return it.

    ``featurize`` mutates the dict in place, so the decoded object itself is
    what gets returned.
    """
    record = json.loads(line)
    featurize(record)
    return record

# Read the gzipped input line by line and featurize the lines in parallel.
# The encoding is explicit because JSON text is UTF-8 while the default text
# encoding depends on the platform locale.
with gzip.open(input_file, "rt", encoding="utf-8") as f:
    pool = Parallel(n_jobs=workers, verbose=True)
    # The file object is already a line iterator; a stop of None means no limit.
    lines = itertools.islice(f, n)
    revisions = pool(delayed(load_and_featurize)(line) for line in lines)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    5.7s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   27.4s
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed:  1.1min
[Parallel(n_jobs=7)]: Done 786 tasks      | elapsed:  1.7min
[Parallel(n_jobs=7)]: Done 1236 tasks      | elapsed:  2.2min
[Parallel(n_jobs=7)]: Done 1858 tasks      | elapsed:  2.8min
[Parallel(n_jobs=7)]: Done 3729 tasks      | elapsed:  4.1min
[Parallel(n_jobs=7)]: Done 4479 tasks      | elapsed:  5.7min
[Parallel(n_jobs=7)]: Done 5329 tasks      | elapsed:  7.5min
[Parallel(n_jobs=7)]: Done 6279 tasks      | elapsed:  9.7min
[Parallel(n_jobs=7)]: Done 7329 tasks      | elapsed: 12.0min
[Parallel(n_jobs=7)]: Done 8479 tasks      | elapsed: 13.8min
[Parallel(n_jobs=7)]: Done 9729 tasks      | elapsed: 15.4min
[Parallel(n_jobs=7)]: Done 11079 tasks      | elapsed: 17.3min
[Parallel(n_jobs=7)]: Done 12529 tasks      | elapsed: 19.2m

In [19]:
# Build a table from the featurized revision dicts, indexed by revision id.
revisions = pd.DataFrame.from_records(revisions).set_index('revid')

In [20]:
# Write the feature table, including its index, as a gzip-compressed CSV.
output_file = "../data/enwiki.labeling_revisions.w_features.nettrom_30k.csv.gz"
revisions.to_csv(
    output_file,
    compression="gzip",
    index=True,
)