#! /usr/bin/env python3
# FIXME: space-initialized code blocks should be skipped, but mwparserfromhell does not support that
# TODO:
# detect self-redirects (definitely interactive only)
# changes rejected interactively should be logged
# warn if the link leads to an archived page
import difflib
import re
import logging
import contextlib
import datetime
import requests
import mwparserfromhell
from ws.client import API, APIError
from ws.db.database import Database
from ws.utils import LazyProperty
from ws.interactive import edit_interactive, require_login, InteractiveQuit
from ws.diff import diff_highlighted
import ws.ArchWiki.lang as lang
from ws.parser_helpers.encodings import dotencode, queryencode
from ws.parser_helpers.title import canonicalize, TitleError
from ws.parser_helpers.wikicode import get_anchors, ensure_flagged_by_template, ensure_unflagged_by_template
logger = logging.getLogger(__name__)
def get_ranks(key, iterable):
"""
Get a list of similarity ratios for a key in iterable.
:param str key: the main key to compare
:param iterable:
an iterable containing secondary keys to compare against the main key
:returns:
a list of ``(item, ratio)`` tuples sorted by ``ratio`` in descending
order, where ``item`` is an item from ``iterable`` and ``ratio`` its
similarity ratio
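Example (illustrative; the ratios are computed by difflib.SequenceMatcher):
>>> get_ranks("Installation", ["Installation", "Notes"])[0]
('Installation', 1.0)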
"""
sm = difflib.SequenceMatcher(a=key)
ranks = []
for item in iterable:
sm.set_seq2(item)
ratio = sm.ratio()
ranks.append( (item, ratio) )
ranks.sort(key=lambda match: match[1], reverse=True)
return ranks
def get_edit_checker(wikicode, summary_parts):
@contextlib.contextmanager
def checker(summary):
text = str(wikicode)
try:
yield
finally:
if text != str(wikicode):
summary_parts.append(summary)
return checker
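# Usage sketch for get_edit_checker (illustrative): the returned context
# manager appends `summary` to `summary_parts` only if the wikicode was
# modified inside the block:
#
# summary_parts = []
# summary = get_edit_checker(wikicode, summary_parts)
# with summary("replaced external links"):
#     ...  # mutate wikicode here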
class ExtlinkRules:
retype = type(re.compile(""))
# list of (url_regex, text_cond, text_cond_flags, replacement) tuples, where:
# - url_regex: a regular expression matching the URL (using re.fullmatch)
# - text_cond:
# - as str: a format string used to create the regular expression described above
# (it is formatted using the groups matched by url_regex)
# - as None: the extlink must not have any alternative text
# - text_cond_flags: flags for the text_cond regex
# - replacement: a format string used as a replacement (it is formatted
# using the groups matched by url_regex and the alternative
# text (if present))
replacements = [
# Arch bug tracker
(re.escape("https://bugs.archlinux.org/task/") + "(\d+)",
"(FS|flyspray) *#?{0}", 0, "{{{{Bug|{0}}}}}"),
# official packages, with and without alternative text
(r"https?\:\/\/(?:www\.)?archlinux\.org\/packages\/[\w-]+\/(?:any|i686|x86_64)\/([a-zA-Z0-9@._+-]+)\/?",
"{0}", re.IGNORECASE, "{{{{Pkg|{0}}}}}"),
(r"https?\:\/\/(?:www\.)?archlinux\.org\/packages\/[\w-]+\/(?:any|i686|x86_64)\/([a-zA-Z0-9@._+-]+)\/?",
None, 0, "{{{{Pkg|{0}}}}}"),
# AUR packages, with and without alternative text
(r"https?\:\/\/aur\.archlinux\.org\/packages\/([a-zA-Z0-9@._+-]+)\/?",
"{0}", re.IGNORECASE, "{{{{AUR|{0}}}}}"),
(r"https?\:\/\/aur\.archlinux\.org\/packages\/([a-zA-Z0-9@._+-]+)\/?",
None, 0, "{{{{AUR|{0}}}}}"),
# Wikipedia interwiki
(r"https?\:\/\/en\.wikipedia\.org\/wiki\/([^\]\?]+)",
".*", 0, "[[wikipedia:{0}|{1}]]"),
(r"https?\:\/\/en\.wikipedia\.org\/wiki\/([^\]\?]+)",
None, 0, "[[wikipedia:{0}]]"),
# change http:// to https:// for archlinux.org and wikipedia.org (do it at the bottom, i.e. with least priority)
(r"http:\/\/((?:[a-z]+\.)?(?:archlinux|wikipedia)\.org(?:\/\S+)?\/?)",
".*", 0, "[https://{0} {1}]"),
(r"http:\/\/((?:[a-z]+\.)?(?:archlinux|wikipedia)\.org(?:\/\S+)?\/?)",
None, 0, "https://{0}"),
]
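# For example (illustrative), the second rule above turns the extlink
# [https://www.archlinux.org/packages/extra/x86_64/vim/ vim] into {{Pkg|vim}},
# because the alternative text "vim" matches the text_cond "{0}".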
def __init__(self):
_replacements = []
for url_regex, text_cond, text_cond_flags, replacement in self.replacements:
compiled = re.compile(url_regex)
_replacements.append( (compiled, text_cond, text_cond_flags, replacement) )
self.replacements = _replacements
@LazyProperty
def extlink_regex(self):
general = self.api.site.general
regex = re.escape(general["server"] + general["articlepath"].split("$1")[0])
regex += "(?P<pagename>[^\s\?]+)"
return re.compile(regex)
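# For example, with server "https://wiki.archlinux.org" and articlepath
# "/index.php/$1" (illustrative values), extlink_regex matches
# "https://wiki.archlinux.org/index.php/Installation_guide" and captures
# "Installation_guide" as the "pagename" group.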
@staticmethod
def strip_extra_brackets(wikicode, extlink):
"""
Strip extra brackets around an external link, for example:
[[http://example.com/ foo]] -> [http://example.com/ foo]
"""
parent, _ = wikicode._do_strong_search(extlink, True)
index = parent.index(extlink)
def _get_text(index):
try:
node = parent.get(index)
if not isinstance(node, mwparserfromhell.nodes.text.Text):
return None
return node
except IndexError:
return None
prev = _get_text(index - 1)
next_ = _get_text(index + 1)
if prev is not None and next_ is not None and prev.endswith("[") and next_.startswith("]"):
prev.value = prev.value[:-1]
next_.value = next_.value[1:]
def extlink_to_wikilink(self, wikicode, extlink):
match = self.extlink_regex.fullmatch(str(extlink.url))
if match:
pagename = match.group("pagename")
title = self.api.Title(pagename)
# handle links to special namespaces correctly
if title.namespacenumber in {-2, 6, 14}:
target = ":" + title.fullpagename
else:
target = title.fullpagename
if extlink.title:
wikilink = "[[{}|{}]]".format(target, extlink.title)
else:
wikilink = "[[{}]]".format(target)
wikicode.replace(extlink, wikilink)
return True
return False
def extlink_replacements(self, wikicode, extlink):
for url_regex, text_cond, text_cond_flags, replacement in self.replacements:
if (text_cond is None and extlink.title is not None) or (text_cond is not None and extlink.title is None):
continue
match = url_regex.fullmatch(str(extlink.url))
if match:
if extlink.title is None:
repl = replacement.format(*match.groups())
# FIXME: hack to preserve brackets (e.g. [http://example.com/] )
if extlink.brackets and not repl.startswith("[") and not repl.endswith("]"):
repl = "[{}]".format(repl)
wikicode.replace(extlink, repl)
return True
else:
groups = [re.escape(g) for g in match.groups()]
alt_text = str(extlink.title).strip()
if re.fullmatch(text_cond.format(*groups), alt_text, text_cond_flags):
wikicode.replace(extlink, replacement.format(*match.groups(), extlink.title))
return True
else:
logger.warning("external link that should be replaced, but has custom alternative text: {}".format(extlink))
return False
def update_extlink(self, wikicode, extlink):
# always make sure to return as soon as the extlink is invalidated
self.strip_extra_brackets(wikicode, extlink)
if self.extlink_to_wikilink(wikicode, extlink):
return
if self.extlink_replacements(wikicode, extlink):
return
class WikilinkRules:
"""
Assumptions:
- all titles are case-insensitive on the first letter (true on ArchWiki)
- alternative text is intentional, no replacements there
"""
def __init__(self, api, db, *, interactive=False):
self.api = api
self.db = db
self.interactive = interactive
# mapping of canonical titles to displaytitles
self.displaytitles = {}
for ns in self.api.site.namespaces.keys():
if ns < 0:
continue
for page in self.api.generator(generator="allpages", gaplimit="max", gapnamespace=ns, prop="info", inprop="displaytitle"):
self.displaytitles[page["title"]] = page["displaytitle"]
self.void_update_cache = set()
def check_trivial(self, wikilink, title):
"""
Perform trivial simplification, replace `[[Foo|foo]]` with `[[foo]]`.
:param wikilink: instance of `mwparserfromhell.nodes.wikilink.Wikilink`
representing the link to be checked
:param title: the parsed :py:attr:`wikilink.title`
:type title: :py:class:`ws.parser_helpers.title.Title`
"""
if wikilink.text is None:
return
try:
text = self.api.Title(wikilink.text)
except TitleError:
return
if title == text:
# title is mandatory, so the text becomes the title
wikilink.title = title.leading_colon + str(wikilink.text)
wikilink.text = None
def check_relative(self, src_title, wikilink, title):
"""
Use relative links whenever possible. For example, links to sections such as
`[[Foo#Bar]]` on a page `title` are replaced with `[[#Bar]]` whenever `Foo`
redirects to or is equivalent to `title`.
:param str src_title: the title of the page being checked
:param wikilink: the link to be checked
:type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
:param title: the parsed :py:attr:`wikilink.title`
:type title: :py:class:`ws.parser_helpers.title.Title`
"""
if title.iwprefix or not title.sectionname:
return
# check if title is a redirect
target = self.api.redirects.map.get(title.fullpagename)
if target:
_title = self.api.Title(target)
_title.sectionname = title.sectionname
else:
_title = title
if canonicalize(src_title) == _title.fullpagename:
wikilink.title = "#" + _title.sectionname
title.parse(wikilink.title)
def check_redirect_exact(self, src_title, wikilink, title):
"""
Replace `[[foo|bar]]` with `[[bar]]` if `foo` and `bar` point to the
same page after resolving redirects.
:param str src_title: the title of the page being checked
:param wikilink: the link to be checked
:type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
:param title: the parsed :py:attr:`wikilink.title`
:type title: :py:class:`ws.parser_helpers.title.Title`
"""
if wikilink.text is None:
return
try:
text = self.api.Title(wikilink.text)
except TitleError:
return
# skip links to sections ([[#Foo|Foo]] should remain even if `Foo` redirects to `This page#Foo`)
if not title.pagename:
return
# handle relative links properly
# (we assume that subpages are enabled for all namespaces)
title = title.make_absolute(src_title)
target1 = self.api.redirects.map.get(title.fullpagename)
target2 = self.api.redirects.map.get(text.fullpagename)
if target1 is not None:
target1 = self.api.Title(target1)
# bail out if we lost the fragment
if target1.sectionname != title.sectionname:
return
if target2 is not None:
target2 = self.api.Title(target2)
if target1 is not None and target2 is not None:
if target1 == target2:
wikilink.title = wikilink.text
wikilink.text = None
title.parse(wikilink.title)
elif target1 is not None:
if target1 == text:
wikilink.title = wikilink.text
wikilink.text = None
title.parse(wikilink.title)
elif target2 is not None:
if target2 == title:
wikilink.title = wikilink.text
wikilink.text = None
title.parse(wikilink.title)
def check_redirect_capitalization(self, wikilink, title):
"""
Avoid a redirect if and only if the target differs from the link title only in capitalization.
:param wikilink: the link to be checked
:type wikilink: :py:class:`mwparserfromhell.nodes.wikilink.Wikilink`
:param title: the parsed :py:attr:`wikilink.title`
:type title: :py:class:`ws.parser_helpers.title.Title`
"""
# run only in interactive mode
if self.interactive is False:
return
# FIXME: very common false positive
if title.pagename.lower().startswith("wpa supplicant"):
return
# might be only a section, e.g. [[#foo]]
if title.fullpagename:
target = self.api.redirects.map.get(title.fullpagename)
if target is not None and target.lower() == title.fullpagename.lower():
if title.sectionname:
target += "#" + title.sectionname
wikilink.title = target
title.parse(wikilink.title)
def check_displaytitle(self, wikilink, title):
# Replacing underscores and capitalization as per the DISPLAYTITLE attribute
# is not safe (e.g. 'wpa_supplicant' and 'WPA supplicant' are equivalent
# without deeper context), so do it only in interactive mode.
if self.interactive is False:
return
# Avoid large-scale edits if there is an alternative text.
if wikilink.text is not None:
return
# we can't check interwiki links
if title.iwprefix:
return
# skip relative links
if not title.fullpagename:
return
# skip links to special namespaces
if title.namespacenumber < 0:
return
# report pages without DISPLAYTITLE (red links)
if title.fullpagename not in self.displaytitles:
logger.warning("wikilink to non-existing page: {}".format(wikilink))
return
# FIXME: very common false positive
if title.pagename == "Wpa supplicant":
return
# assemble new title
# TODO: simplify (see #25)
new = self.displaytitles[title.fullpagename]
if title.sectionname:
# preserve original section anchor, it will be checked in self.check_anchor()
_, anchor = wikilink.title.split("#", maxsplit=1)
new += "#" + anchor
# TODO: the following code block would strip the leading colon
if title.leading_colon:
return
# skip if only the case of the first letter is different
if wikilink.title[1:] != new[1:]:
first_letter = wikilink.title[0]
old_rest = str(wikilink.title)[1:]
wikilink.title = new
# preserve the case of the first letter if the rest differs only in spaces/underscores
# (e.g. don't replace [[environment_variables]] with [[Environment variables]])
if old_rest.replace(" ", "_") == new[1:].replace(" ", "_"):
wikilink.title = first_letter + wikilink.title[1:]
title.parse(wikilink.title)
def check_anchor(self, src_title, wikilink, title):
"""
:returns:
``True`` if the anchor is correct or has been corrected, ``False``
if it is definitely broken, ``None`` if it can't be checked at all
or the check was indecisive and a warning/error has been printed to
the log.
"""
# TODO: beware of https://phabricator.wikimedia.org/T20431
# we can't check interwiki links
if title.iwprefix:
return None
# empty sectionname is always valid
if title.sectionname == "":
return None
# determine target page
if title.fullpagename:
_target_title = title
else:
_target_title = self.api.Title(src_title)
# skip links to special pages (e.g. [[Special:Preferences#mw-prefsection-rc]])
if _target_title.namespacenumber < 0:
return None
# resolve redirects
anchor_on_redirect_to_section = False
if _target_title.fullpagename in self.api.redirects.map:
_target_title = self.api.Title(self.api.redirects.resolve(_target_title.fullpagename))
if _target_title.sectionname:
logger.warning("warning: section fragment placed on a redirect to possibly different section: {}".format(wikilink))
anchor_on_redirect_to_section = True
# get lists of section headings and anchors
_result = self.db.query(titles=_target_title.fullpagename, prop="sections", secprop={"title", "anchor"})
_result = list(_result)
assert len(_result) == 1
if "missing" in _result[0]:
logger.error("could not find content of page: '{}' (wikilink {})".format(_target_title.fullpagename, wikilink))
return None
headings = [section["title"] for section in _result[0].get("sections", [])]
anchors = [section["anchor"] for section in _result[0].get("sections", [])]
if len(headings) == 0:
logger.warning("wikilink with broken section fragment: {}".format(wikilink))
return False
anchor = dotencode(title.sectionname)
needs_fix = True
# handle double-anchor redirects first
if anchor_on_redirect_to_section is True:
if anchor in anchors:
return True
else:
return False
# try exact match first
if anchor in anchors:
needs_fix = False
# otherwise try fuzzy matching to detect differences in capitalization or typos (interactive only)
elif self.interactive is True:
# FIXME: first detect section renaming properly, fuzzy search should be only the last resort to deal with typos and such
ranks = get_ranks(anchor, anchors)
ranks = list(filter(lambda rank: rank[1] >= 0.8, ranks))
if len(ranks) == 1 or ( len(ranks) >= 2 and ranks[0][1] - ranks[1][1] > 0.2 ):
logger.debug("wikilink {}: replacing anchor '{}' with '{}' on similarity level {}".format(wikilink, anchor, ranks[0][0], ranks[0][1]))
anchor = ranks[0][0]
elif len(ranks) > 1:
logger.debug("skipping {}: multiple feasible anchors per similarity ratio: {}".format(wikilink, ranks))
return False
else:
logger.warning("wikilink with broken section fragment: {}".format(wikilink))
return False
else:
logger.warning("wikilink with broken section fragment: {}".format(wikilink))
return False
# assemble new section fragment
# try to preserve the character separating base anchor and numeric suffix
dupl_match = re.match(r"(.+)([_ ])(\d+)$", str(wikilink.title))
if dupl_match:
suffix_sep = dupl_match.group(2)
else:
suffix_sep = " "
# get_anchors makes sure to strip markup and handle duplicate section names
new_fragment = get_anchors(headings, pretty=True, suffix_sep=suffix_sep)[anchors.index(anchor)]
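# Illustration (assumed get_anchors behavior): for headings ["Tips", "Tips"]
# and suffix_sep=" ", the pretty anchors would be ["Tips", "Tips 2"], so
# duplicate section names map to distinct fragments.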
# Avoid beautification if there is alternative text and the link
# actually works.
if wikilink.text is None or needs_fix is True:
# preserve title set in check_displaytitle()
# TODO: simplify (see #25)
t, _ = wikilink.title.split("#", maxsplit=1)
wikilink.title = t + "#" + new_fragment
title.parse(wikilink.title)
return True
def collapse_whitespace_pipe(self, wikilink):
"""
Strip whitespace around the pipe in wikilinks.
:param wikilink: instance of `mwparserfromhell.nodes.wikilink.Wikilink`
representing the link to be checked
"""
if wikilink.text is not None:
wikilink.title = wikilink.title.rstrip()
wikilink.text = wikilink.text.lstrip()
def collapse_whitespace(self, wikicode, wikilink):
"""
Attempt to fix spacing around wiki links after the substitutions.
:param wikicode: instance of `mwparserfromhell.wikicode.Wikicode`
containing the wikilink
:param wikilink: instance of `mwparserfromhell.nodes.wikilink.Wikilink`
representing the link to be checked
"""
parent, _ = wikicode._do_strong_search(wikilink, True)
index = parent.index(wikilink)
def _get_text(index):
try:
node = parent.get(index)
if not isinstance(node, mwparserfromhell.nodes.text.Text):
return None
return node
except IndexError:
return None
prev = _get_text(index - 1)
next_ = _get_text(index + 1)
if prev is not None and (prev.endswith(" ") or prev.endswith("\n")):
wikilink.title = wikilink.title.lstrip()
if next_ is not None and (next_.startswith(" ") or next_.startswith("\n")):
if wikilink.text is not None:
wikilink.text = wikilink.text.rstrip()
else:
wikilink.title = wikilink.title.rstrip()
def update_wikilink(self, wikicode, wikilink, src_title, summary_parts):
if str(wikilink) in self.void_update_cache:
logger.debug("Skipping wikilink {} due to void-update cache.".format(wikilink))
return
title = self.api.Title(wikilink.title)
# skip interlanguage links (handled by interlanguage.py)
if title.iwprefix in self.api.site.interlanguagemap.keys():
return
summary = get_edit_checker(wikicode, summary_parts)
with summary("simplification and beautification of wikilinks"):
# beautify if urldecoded
# FIXME: make it implicit - it does not always propagate from the Title class
if not title.iwprefix and re.search("%[0-9a-f]{2}", str(wikilink.title), re.IGNORECASE):
# handle links with leading colon properly
wikilink.title = title.leading_colon + str(title)
# FIXME: should be done in the Title class
# the anchor is dot-encoded, but percent-encoding works for links too
# and is even rendered nicely
wikilink.title = str(wikilink.title).replace("[", "%5B").replace("|", "%7C").replace("]", "%5D")
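# (illustration: a percent-encoded link such as [[Foo%20bar]] should be
# beautified to [[Foo bar]] by the assignment above)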
self.collapse_whitespace_pipe(wikilink)
self.check_trivial(wikilink, title)
self.check_relative(src_title, wikilink, title)
if lang.detect_language(src_title)[1] == "English":
self.check_redirect_exact(src_title, wikilink, title)
self.check_redirect_capitalization(wikilink, title)
# reparse the title, the redirect checks might change it non-equivalently
title = self.api.Title(wikilink.title)
self.check_displaytitle(wikilink, title)
with summary("fixed section fragments"):
anchor_result = self.check_anchor(src_title, wikilink, title)
if anchor_result is False:
with summary("flagged broken section links"):
ensure_flagged_by_template(wikicode, wikilink, "Broken section link")
else:
with summary("unflagged working section links"):
ensure_unflagged_by_template(wikicode, wikilink, "Broken section link")
with summary("simplification and beautification of wikilinks"):
# partial second pass
self.check_trivial(wikilink, title)
if lang.detect_language(src_title)[1] == "English":
self.check_redirect_exact(src_title, wikilink, title)
# collapse whitespace around the link, e.g. 'foo [[ bar]]' -> 'foo [[bar]]'
self.collapse_whitespace(wikicode, wikilink)
# cache context-less, correct wikilinks that don't need any update
if title.pagename and len(summary_parts) == 0 and anchor_result is True:
self.void_update_cache.add(str(wikilink))
class ManTemplateRules:
url_template = "http://jlk.fjfi.cvut.cz/arch/manpages/man/{pagename}.{section}"
url_template_nosection = "http://jlk.fjfi.cvut.cz/arch/manpages/man/{pagename}"
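# For example, {{man|1|intro}} expands to
# http://jlk.fjfi.cvut.cz/arch/manpages/man/intro.1 via url_template, and an
# empty section parameter falls back to url_template_nosection.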
def __init__(self, timeout, max_retries):
self.timeout = timeout
self.session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
self.session.mount("https://", adapter)
self.session.mount("http://", adapter)
self.cache_valid_urls = set()
self.cache_invalid_urls = set()
def update_man_template(self, wikicode, template):
if template.name.lower() != "man":
return
now = datetime.datetime.utcnow()
deadlink_params = [now.year, now.month, now.day]
deadlink_params = ["{:02d}".format(i) for i in deadlink_params]
if not template.has(1) or not template.has(2, ignore_empty=True):
ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
return
if template.get(1).value.strip():
url = self.url_template.format(section=template.get(1).value.strip(), pagename=queryencode(template.get(2).value.strip()))
else:
url = self.url_template_nosection.format(pagename=queryencode(template.get(2).value.strip()))
if template.has(3):
url += "#{}".format(queryencode(template.get(3).value.strip()))
if template.has("url"):
explicit_url = template.get("url").value.strip()
else:
explicit_url = None
def check_url(url):
if url.startswith("ftp://"):
logger.error("The FTP protocol is not supported by the requests module. URL: {}".format(url))
return True
if url in self.cache_valid_urls:
return True
elif url in self.cache_invalid_urls:
return False
response = self.session.get(url, timeout=self.timeout)
if response.status_code == 200:
# heuristics to get the missing section (redirect from some_page to some_page.1)
# WARNING: if the manual exists in multiple sections, the first one might not be the best
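# (e.g. a request for .../man/intro may redirect to .../man/intro.1,
# in which case the missing section "1" is filled in below)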
if len(response.history) == 1 and response.url.startswith(url + "."):
# template parameter 1= should be empty
assert not template.has(1, ignore_empty=True)
template.add(1, response.url[len(url) + 1:])
self.cache_valid_urls.add(response.url)
return True
else:
self.cache_valid_urls.add(url)
return True
elif response.status_code >= 400:
self.cache_invalid_urls.add(url)
return False
else:
raise NotImplementedError("Unexpected status code {} for man page URL: {}".format(response.status_code, url))
# check if the template parameters form a valid URL
if check_url(url):
ensure_unflagged_by_template(wikicode, template, "Dead link")
# remove explicit url= parameter - not necessary
if explicit_url is not None:
template.remove("url")
elif explicit_url is None:
ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
elif explicit_url != "":
if check_url(explicit_url):
ensure_unflagged_by_template(wikicode, template, "Dead link")
else:
ensure_flagged_by_template(wikicode, template, "Dead link", *deadlink_params, overwrite_parameters=False)
class LinkChecker(ExtlinkRules, WikilinkRules, ManTemplateRules):
skip_pages = ["Table of contents", "Help:Editing", "ArchWiki:Reports", "ArchWiki:Requests", "ArchWiki:Statistics"]
# article status templates, lowercase
skip_templates = ["accuracy", "archive", "bad translation", "expansion", "laptop style", "merge", "move", "out of date", "remove", "stub", "style", "translateme"]
def __init__(self, api, db, interactive=False, dry_run=False, first=None, title=None, langnames=None, connection_timeout=30, max_retries=3):
if not dry_run:
# ensure that we are authenticated
require_login(api)
# init inherited
ExtlinkRules.__init__(self)
WikilinkRules.__init__(self, api, db, interactive=interactive)
ManTemplateRules.__init__(self, connection_timeout, max_retries)
self.api = api
self.db = db
self.interactive = interactive
self.dry_run = dry_run
# parameters for self.run()
self.first = first
self.title = title
self.langnames = langnames
self.db.sync_with_api(api)
self.db.sync_latest_revisions_content(api)
self.db.update_parser_cache()
@staticmethod
def set_argparser(argparser):
# first try to set options for objects we depend on
present_groups = [group.title for group in argparser._action_groups]
if "Connection parameters" not in present_groups:
API.set_argparser(argparser)
if "Database parameters" not in present_groups:
Database.set_argparser(argparser)
group = argparser.add_argument_group(title="script parameters")
group.add_argument("-i", "--interactive", action="store_true",
help="enables interactive mode")
group.add_argument("--dry-run", action="store_true",
help="enables dry-run mode (changes are only shown and discarded)")
mode = group.add_mutually_exclusive_group()
mode.add_argument("--first", default=None, metavar="TITLE",
help="the title of the first page to be processed")
mode.add_argument("--title",
help="the title of the only page to be processed")
group.add_argument("--lang", default=None,
help="comma-separated list of language tags to process (default: all, choices: {})".format(lang.get_internal_tags()))
@classmethod
def from_argparser(klass, args, api=None, db=None):
if api is None:
api = API.from_argparser(args)
if db is None:
db = Database.from_argparser(args)
if args.lang:
tags = args.lang.split(",")
for tag in tags:
if tag not in lang.get_internal_tags():
# FIXME: more elegant solution
raise Exception("{} is not a valid language tag".format(tag))
langnames = {lang.langname_for_tag(tag) for tag in tags}
else:
langnames = set()
return klass(api, db, interactive=args.interactive, dry_run=args.dry_run, first=args.first, title=args.title, langnames=langnames, connection_timeout=args.connection_timeout, max_retries=args.connection_max_retries)
def update_page(self, src_title, text):
"""
Parse the content of the page and call various methods to update the links.
:param str src_title: title of the page
:param str text: content of the page
:returns: a (text, edit_summary) tuple, where text is the updated content
and edit_summary is the description of performed changes
"""
# FIXME: ideally "DeveloperWiki:" would be a proper namespace
if lang.detect_language(src_title)[0] in self.skip_pages or src_title.startswith("DeveloperWiki:"):
logger.info("Skipping blacklisted page [[{}]]".format(src_title))
return text, ""
logger.info("Parsing page [[{}]] ...".format(src_title))
# FIXME: skip_style_tags=True is a partial workaround for https://github.com/earwig/mwparserfromhell/issues/40
wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
summary_parts = []
summary = get_edit_checker(wikicode, summary_parts)
for extlink in wikicode.ifilter_external_links(recursive=True):
# skip links inside article status templates
parent = wikicode.get(wikicode.index(extlink, recursive=True))
if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
continue
with summary("replaced external links"):
self.update_extlink(wikicode, extlink)
for wikilink in wikicode.ifilter_wikilinks(recursive=True):
# skip links inside article status templates
parent = wikicode.get(wikicode.index(wikilink, recursive=True))
if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
continue
self.update_wikilink(wikicode, wikilink, src_title, summary_parts)
for template in wikicode.ifilter_templates(recursive=True):
# skip templates that may be added or removed
if str(template.name) in {"Broken section link", "Dead link"}:
continue
# skip links inside article status templates
parent = wikicode.get(wikicode.index(template, recursive=True))
if isinstance(parent, mwparserfromhell.nodes.template.Template) and parent.name.lower() in self.skip_templates:
continue
_pure_template = lang.detect_language(str(template.name))[0]
if _pure_template.lower() in {"related", "related2"}:
target = template.get(1).value
# temporarily convert the {{Related}} to wikilink to reuse the update code
wl = mwparserfromhell.nodes.wikilink.Wikilink(target)
wikicode.replace(template, wl)
# update
self.update_wikilink(wikicode, wl, src_title, summary_parts)
# replace back
target.value = str(wl.title)
wikicode.replace(wl, template)
elif template.name.lower() == "man":
with summary("updated man page links"):
self.update_man_template(wikicode, template)
# deduplicate and keep order
parts = set()
parts_add = parts.add
summary_parts = [part for part in summary_parts if not (part in parts or parts_add(part))]
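# (e.g. ["a", "b", "a"] deduplicates to ["a", "b"], preserving order)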
edit_summary = ", ".join(summary_parts)
if self.interactive is True:
edit_summary += " (interactive)"
return str(wikicode), edit_summary
def _edit(self, title, pageid, text_new, text_old, timestamp, edit_summary):
if text_old != text_new:
if self.dry_run:
diff = diff_highlighted(text_old, text_new, title + ".old", title + ".new", timestamp, "<utcnow>")
print(diff)
print("Edit summary: " + edit_summary)
print("(edit discarded due to --dry-run)")
else:
try:
if self.interactive is False:
self.api.edit(title, pageid, text_new, timestamp, edit_summary, bot="")
else:
edit_interactive(self.api, title, pageid, text_old, text_new, timestamp, edit_summary, bot="")
except APIError:
# failed edits are not fatal here; continue with the next page
pass
def process_page(self, title):
result = self.api.call_api(action="query", prop="revisions", rvprop="content|timestamp", titles=title)
page = list(result["pages"].values())[0]
timestamp = page["revisions"][0]["timestamp"]
text_old = page["revisions"][0]["*"]
text_new, edit_summary = self.update_page(title, text_old)
self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
def process_allpages(self, apfrom=None, langnames=None):
namespaces = [0, 4, 14]
if self.interactive is True:
namespaces.append(12)
# rewind to the right namespace (the API throws BadTitle error if the
# namespace of apfrom does not match apnamespace)
if apfrom is not None:
_title = self.api.Title(apfrom)
if _title.namespacenumber not in namespaces:
logger.error("Valid namespaces for the --first option are {}.".format([self.api.site.namespaces[ns] for ns in namespaces]))
return
while namespaces[0] != _title.namespacenumber:
del namespaces[0]
# apfrom must be without namespace prefix
apfrom = _title.pagename
for ns in namespaces:
for page in self.db.query(generator="allpages", gaplimit="max", gapfilterredir="nonredirects", gapnamespace=ns, gapfrom=apfrom,
prop="latestrevisions", rvprop={"timestamp", "content"}):
title = page["title"]
if langnames and lang.detect_language(title)[1] not in langnames:
continue
_title = self.api.Title(title)
timestamp = page["revisions"][0]["timestamp"]
text_old = page["revisions"][0]["*"]
text_new, edit_summary = self.update_page(title, text_old)
self._edit(title, page["pageid"], text_new, text_old, timestamp, edit_summary)
# the apfrom parameter is valid only for the first namespace
apfrom = ""
def run(self):
if self.title is not None:
self.process_page(self.title)
else:
self.process_allpages(apfrom=self.first, langnames=self.langnames)
if __name__ == "__main__":
import ws.config
checker = ws.config.object_from_argparser(LinkChecker, description="Parse all pages on the wiki and try to fix/simplify/beautify links")
try:
checker.run()
except (InteractiveQuit, KeyboardInterrupt):
pass