## Backlog

Code to explore Wikipedia backlog issues.

In [5]:
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2
import wikidit

In [14]:
from wikidit.mw import Session
import mwparserfromhell as mwparser


def normalize_title(title, session=None):
    """Return the title the MediaWiki API reports for ``title``.

    The query is issued with ``redirects=True``, so the returned title is
    whatever the API gives back after resolving redirects.

    Args:
        title: Page title to look up.
        session: Object whose ``get(**params)`` returns the decoded API
            response. A new ``Session`` is created when omitted.

    Returns:
        The ``'title'`` field of the first page entry in the response.

    Raises:
        ValueError: If the API marks the page as missing or invalid.
    """
    if session is None:
        session = Session()
    result = session.get(action="query", titles=title, redirects=True)
    # A single title was requested, so only the first page entry matters.
    page = list(result['query']['pages'].values())[0]
    if 'missing' in page:
        raise ValueError(f"Title {title} does not exist")
    # Entries flagged invalid still carry a 'title' field; without this
    # check the malformed title would be returned as if it were normalized.
    if 'invalid' in page:
        raise ValueError(f"Title {title} is invalid")
    return page['title']

def get_talk_page(title, session=None):
    """Fetch the API page entry for the talk page of ``title``.

    The title is first run through ``normalize_title``; the query then asks
    for the revision content of the main slot of ``Talk:<normalized title>``.

    Args:
        title: Title of the page whose talk page is wanted.
        session: Object providing ``get(**params)``; a new ``Session`` is
            created when omitted.

    Returns:
        The first entry of ``query.pages`` in the response.
    """
    session = Session() if session is None else session
    talk_title = f"Talk:{normalize_title(title, session=session)}"
    response = session.get(action='query', titles=talk_title,
                           prop='revisions', rvprop='content',
                           rvslots='main')
    pages = response['query']['pages']
    return list(pages.values())[0]

def get_content(page):
    """Return the ``'*'`` value of the main slot of the page's first revision.

    Raises ``KeyError`` / ``IndexError`` if the expected structure is absent.
    """
    first_revision = page['revisions'][0]
    main_slot = first_revision['slots']['main']
    return main_slot['*']

def clean_wp_class(x):
    """Normalize a WikiProject ``class`` value.

    The value is converted with ``str``, stripped and lower-cased. A few
    aliases are then collapsed (``disambig`` -> ``dab``, ``current`` ->
    ``cur``, ``a`` -> ``ga``, ``bplus`` -> ``b``) and ``"none"`` -- which is
    also what ``None`` stringifies to -- becomes ``None``. Any other value
    is returned in its normalized form.
    """
    # See https://en.wikipedia.org/wiki/MediaWiki:Gadget-metadata.js
    aliases = {
        "disambig": "dab",
        "current": "cur",
        "a": "ga",
        "bplus": "b",
        "none": None,
    }
    key = str(x).strip().lower()
    return aliases.get(key, key)

def clean_wp_importance(x):
    """Normalize a WikiProject ``importance`` value.

    The value is converted with ``str``, stripped and lower-cased;
    ``"none"`` (including a stringified ``None``) is returned as ``None``.
    """
    cleaned = str(x).strip().lower()
    return None if cleaned == "none" else cleaned

def parse_project(tmpl):
    """Extract the assessment recorded in one WikiProject template.

    Args:
        tmpl: Template object exposing ``name`` and ``params``, where each
            parameter has ``name`` and ``value`` attributes.

    Returns:
        A ``(name, info)`` tuple. ``name`` is the template name with
        surrounding whitespace removed; ``info`` is a dict with the cleaned
        ``'class'`` and ``'importance'`` values (``None`` when the parameter
        is absent).
    """
    def first_value(key):
        # Parameter names may keep the whitespace written around them in the
        # wikitext ("| class = B"), so compare on the stripped name.
        for param in tmpl.params:
            if str(param.name).strip() == key:
                return param.value
        return None

    # Importance goes through its own cleaner: the class cleaner would
    # rewrite values such as "a" or "current" using class-only aliases.
    return (str(tmpl.name).strip(),
            {'class': clean_wp_class(first_value("class")),
             'importance': clean_wp_importance(first_value("importance"))})
              
def get_projects(page):
    """Build a ``{template name: assessment}`` dict for ``page``.

    Every template returned by ``page.filter_templates(matches="WikiProject")``
    is passed to ``parse_project``; when two templates yield the same name,
    the later one wins.
    """
    projects = {}
    for banner in page.filter_templates(matches="WikiProject"):
        name, assessment = parse_project(banner)
        projects[name] = assessment
    return projects

def get_wikiprojects(title, session=None):
    """Return the WikiProject assessments found on the talk page of ``title``.

    The talk page is fetched with ``get_talk_page``, which normalizes the
    title first: a title the API reports as missing raises ``ValueError``
    there, and the lookup is made with ``redirects=True``, so presumably a
    redirect is followed to its target before the talk page is read.

    Args:
        title: Title of the article whose talk page should be inspected.
        session: Optional API session forwarded to ``get_talk_page``.

    Returns:
        The dict produced by ``get_projects``; an empty dict when the talk
        page entry carries no revisions (e.g. the talk page does not exist).
    """
    page = get_talk_page(title, session=session)
    # An article can exist without a talk page; the entry then has no
    # 'revisions', which would otherwise surface as a KeyError in get_content.
    if not page.get('revisions'):
        return {}
    parsed = mwparser.parse(get_content(page))
    return get_projects(parsed)


def get_quality(title, session=None):
    """Return one quality class for ``title`` from its WikiProject templates.

    The ``'class'`` values of all projects returned by ``get_wikiprojects``
    are collected (ignoring ``None``); the result is the first label of the
    precedence tuple below that appears among them, or ``None`` when no
    listed label is present.
    """
    # Precedence order: the first label present wins.
    # current and future can be ignored
    ranking = ('fa', 'ga', 'b', 'c', 'start', 'stub',
               'fl', 'list', 'dab', 'book', 'template',
               'category', 'draft', 'redirect')
    projects = get_wikiprojects(title, session=session)
    found = {info['class'] for info in projects.values()
             if info['class'] is not None}
    return next((label for label in ranking if label in found), None)

        

In [16]:
# Example: look up the quality class for the "Data science" article.
get_quality("Data science")

'start'