Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
364 lines (299 sloc) 14.5 KB
#! /usr/bin/env python3
from collections.abc import Iterable
import datetime
import logging
from ws.client import API, APIError
from ws.interactive import require_login
from ws.autopage import AutoPage
from ws.parser_helpers.title import canonicalize
import ws.ArchWiki.lang as lang
from ws.interlanguage.CategoryGraph import CategoryGraph
from ws.interlanguage.Categorization import Categorization
from ws.interlanguage.Decategorization import Decategorization
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class LowercaseDict(dict):
    """
    A dictionary with case-insensitive string keys.

    Every key is normalised with ``str.lower()`` before it is stored or
    looked up, so ``d["Foo"]``, ``d["FOO"]`` and ``d["foo"]`` all refer to
    the same entry. Keys must therefore provide a ``lower()`` method.
    """

    def __init__(self, *args, **kwargs):
        # Route the initial data through update() so that the keys passed to
        # the constructor are normalised too; handing them straight to
        # dict.__init__ would store them verbatim and make them unreachable
        # through the lowercasing accessors below.
        super().__init__()
        self.update(*args, **kwargs)

    def __getitem__(self, key):
        return super().__getitem__(key.lower())

    def __setitem__(self, key, value):
        super().__setitem__(key.lower(), value)

    def __delitem__(self, key):
        super().__delitem__(key.lower())

    def __contains__(self, key):
        return super().__contains__(key.lower())

    def get(self, key, default=None):
        """Return the value stored under *key* (case-insensitive) or *default*."""
        return super().get(key.lower(), default)

    def setdefault(self, key, default=None):
        """Insert *default* under the lowercased *key* if missing; return the stored value."""
        return super().setdefault(key.lower(), default)

    def update(self, other=(), **kwargs):
        """
        Merge *other* (a mapping or an iterable of key-value pairs) and any
        keyword arguments into the dictionary, lowercasing all keys.
        """
        # dict() accepts the same inputs as dict.update(), which lets it do
        # the mapping-vs-pairs dispatch for us
        for key, value in dict(other, **kwargs).items():
            super().__setitem__(key.lower(), value)

    def pop(self, key, default=None):
        """
        Remove the lowercased *key* and return its value.

        Unlike ``dict.pop``, a missing key never raises ``KeyError``:
        *default* (``None`` if not given) is returned instead.
        """
        return super().pop(key.lower(), default)
class BaseFormatter:
    """
    Common state and helpers shared by the table of contents formatters.

    :param parents: mapping of a category title to its parent categories
    :param info: mapping of a category title to a dict which is read for
                 its ``"pages"`` entry by the subclasses
    :param category_names: mapping of a category title to its localized name
    :param alsoin: optional mapping of a language tag to the translation of
                   the "also in" phrase; the ``"en"`` entry defaults to
                   ``"also in"`` and serves as the fallback
    """

    def __init__(self, parents, info, category_names, alsoin=None):
        self.parents = parents
        self.info = info
        self.category_names = category_names
        # Work on a copy: adding the English fallback must not leak into the
        # dictionary owned by the caller.
        self.alsoin = dict(alsoin) if alsoin is not None else {}
        self.alsoin.setdefault("en", "also in")

    def format_also_in(self, parents, lang_tag):
        """
        Return the " (also in A, B)" suffix listing the sorted *parents*,
        using the phrase for *lang_tag* or the English one as a fallback.
        """
        alsoin = self.alsoin.get(lang_tag, self.alsoin["en"])
        return " ({alsoin} {categories})".format(alsoin=alsoin, categories=", ".join(sorted(parents)))

    def localize(self, category):
        """
        Return the localized name of *category*.

        When there is no entry in ``category_names``, the name is derived
        from the title itself: the part after the first colon, passed
        through ``lang.detect_language`` (first element of its result).
        """
        default = lang.detect_language(category.split(":", 1)[1])[0]
        return self.category_names.get(category, default)
class PlainFormatter(BaseFormatter):
    """
    Formatter producing a plain-text rendering of the table of contents.

    The output is accumulated in ``self.text`` and returned by ``str()``.
    """

    def __init__(self, parents, info, category_names, alsoin=None):
        super().__init__(parents, info, category_names, alsoin)
        self.text = ""

    def format_root(self, title):
        """
        Append the heading line(s) for the root category.

        *title* is either a single title or an iterable of titles; multiple
        titles are followed by a ``----`` separator line.
        """
        if isinstance(title, str):
            self.text += "{} ({})\n".format(title, self.info[title]["pages"])
            return
        if not isinstance(title, Iterable):
            return
        # title is a collection of titles: one line per title
        for single in title:
            self.format_root(single)
        if len(title) > 1:
            self.text += "----\n"

    def format_cell(self, title, parent, levels):
        """
        Return one line describing the category *title* listed under *parent*.

        *levels* is the sequence of zero-based positions along the path to
        the category; it determines both the indentation and the dotted
        numbering.
        """
        _, langname = lang.detect_language(title)
        lang_tag = lang.tag_for_langname(langname)

        # four spaces of indentation per level, followed by the 1-based numbering
        indent = " " * (4 * len(levels))
        numbering = ".".join(str(position + 1) for position in levels)
        # title and the number of pages
        line = "{}{} {} ({})".format(indent, numbering, self.localize(title), self.info[title]["pages"])

        # "also in" suffix listing the remaining parents
        other_parents = set(self.parents[title]) - {parent}
        if other_parents:
            localized = [self.localize(cat) for cat in other_parents]
            line += self.format_also_in(localized, lang_tag)
        return line

    def format_row(self, *columns):
        """
        Append one line per column; non-string iterables are unpacked into
        :py:meth:`format_cell`, anything else is converted with ``str()``.
        Rows with multiple columns are followed by a ``----`` separator.
        """
        for col in columns:
            if isinstance(col, str) or not isinstance(col, Iterable):
                self.text += str(col) + "\n"
            else:
                self.text += self.format_cell(*col) + "\n"
        if len(columns) > 1:
            self.text += "----\n"

    def __str__(self):
        return self.text
class MediaWikiFormatter(BaseFormatter):
    """
    Formatter producing the rows of a MediaWiki table.

    The rows are accumulated in ``self.text``; ``str()`` optionally wraps
    them in the ``{|`` and ``|}`` table tokens.
    """

    # template of a single table cell
    cell_format = (
        "<span style=\"margin-left:{margin:.3}em;\">"
        "<small>{levels}</small> {catlink} <small>{info}</small></span>"
    )

    def __init__(self, parents, info, category_names, alsoin=None, include_opening_closing_tokens=True):
        super().__init__(parents, info, category_names, alsoin)
        self.include_opening_closing_tokens = include_opening_closing_tokens
        self.text = ""

    def catlink(self, title):
        """
        Return a wikilink to the category *title* labelled with its
        localized name; ``&lrm;`` is appended for right-to-left languages.
        """
        link = "[[:{}|{}]]".format(title, self.localize(title))
        _, langname = lang.detect_language(title)
        if lang.is_rtl_language(langname):
            link += "&lrm;"
        return link

    def format_root(self, title):
        """
        Append the heading row: one cell per root category, each with its
        number of pages. *title* is a single title or an iterable of titles.
        """
        if isinstance(title, str):
            titles = (title,)
        elif isinstance(title, Iterable):
            # title is a collection of titles
            titles = title
        else:
            return
        for single in titles:
            self.text += "| {} <small>({})</small>\n".format(self.catlink(single), self.info[single]["pages"])
        self.text += "|-\n"

    def format_cell(self, title, parent, levels):
        """
        Return the markup of the cell for the category *title* listed under
        *parent*; *levels* is the sequence of zero-based positions along the
        path to the category.
        """
        _, langname = lang.detect_language(title)
        lang_tag = lang.tag_for_langname(langname)
        # the left margin grows by 1.6em with each level
        margin = 1.6 * len(levels)
        numbering = ".".join(str(position + 1) for position in levels) + "."
        info = "({})".format(self.info[title]["pages"])

        # "also in" suffix listing the remaining parents
        other_parents = set(self.parents[title]) - {parent}
        if other_parents:
            links = [self.catlink(cat) for cat in other_parents]
            info += self.format_also_in(links, lang_tag)
        return self.cell_format.format(margin=margin, levels=numbering, catlink=self.catlink(title), info=info)

    def format_row(self, *columns):
        """
        Append one table row with a cell per column. Non-string iterables
        are unpacked into :py:meth:`format_cell`, other truthy values are
        converted with ``str()`` and falsy values produce an empty cell.
        """
        for col in columns:
            if isinstance(col, Iterable) and not isinstance(col, str):
                cell = "| " + self.format_cell(*col) + "\n"
            elif col:
                cell = "| " + str(col) + "\n"
            else:
                cell = "|\n"
            self.text += cell
        self.text += "|-\n"

    def __str__(self):
        if self.include_opening_closing_tokens is not True:
            return self.text
        table = "{|\n" + self.text
        # make sure that the closing token starts on its own line
        closing = "|}" if table.endswith("\n") else "\n|}"
        return table + closing
class TableOfContents:
    """
    Build a presentation of the wiki's table of contents from the category
    graph and print it or save it to the ToC pages.

    For every selected language the corresponding ToC page is processed.
    The generated content is either printed as plain text, or written into
    the table with ``id="wiki-scripts-toc-table"`` on the page, which is
    then saved.

    :param api: the API object handed to the helper objects used in :py:meth:`run`
    :param cliargs: the namespace of parsed arguments (see :py:meth:`set_argparser`);
                    note that some of its attributes are normalised in place
    """

    def __init__(self, api, cliargs):
        self.api = api
        self.cliargs = cliargs

        # printing is the default output method
        if self.cliargs.save is False and self.cliargs.print is False:
            self.cliargs.print = True

        if len(self.cliargs.toc_languages) == 1 and self.cliargs.toc_languages[0] == "all":
            self.cliargs.toc_languages = lang.get_internal_tags()
        # strip "(Language)" suffix
        self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]

        # detect page titles: pages in the local language have no suffix
        local_tag = lang.tag_for_langname(lang.get_local_language())
        self.titles = []
        for ln in sorted(self.cliargs.toc_languages):
            if ln == local_tag:
                self.titles.append(self.cliargs.toc_page)
            else:
                self.titles.append("{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(ln)))

    @staticmethod
    def set_argparser(argparser):
        """Register the command line options of this script on *argparser*."""
        import ws.config

        # first try to set options for objects we depend on
        present_groups = [group.title for group in argparser._action_groups]
        if "Connection parameters" not in present_groups:
            API.set_argparser(argparser)

        output = argparser.add_argument_group(title="output mode")
        _g = output.add_mutually_exclusive_group()
        # TODO: maybe leave only the short option to forbid configurability in config file
        _g.add_argument("-s", "--save", action="store_true",
                help="try to save the page (requires being logged in)")
        _g.add_argument("-p", "--print", action="store_true",
                help="print the updated text in the standard output (this is the default output method)")

        group = argparser.add_argument_group(title="script parameters")
        group.add_argument("-a", "--anonymous", action="store_true",
                help="do not require logging in: queries may be limited to a lower rate")
        # TODO: maybe leave only the short option to forbid configurability in config file
        group.add_argument("-f", "--force", action="store_true",
                help="try to update the page even if it was last saved in the same UTC day")
        group.add_argument("--toc-languages", default="all", type=ws.config.argtype_comma_list_choices(["all"] + lang.get_internal_tags()),
                help="a comma-separated list of language tags whose ToC pages should be updated (default: %(default)s)")
        group.add_argument("--toc-page", default="Table of contents",
                help="the page name on the wiki to fetch and update (the language suffix "
                     "is added automatically as necessary) (default: %(default)s)")
        # TODO: no idea how to forbid setting this globally in the config...
        group.add_argument("--summary", default="automatic update",
                help="the edit summary to use when saving the page (default: %(default)s)")

    @classmethod
    def from_argparser(klass, args, api=None):
        """
        Alternate constructor: create the object from the parsed arguments
        *args*, building the API object from them unless *api* is given.
        """
        if api is None:
            api = API.from_argparser(args)
        return klass(api, args)

    def parse_toc_table(self, title, toc_table):
        """
        Read the configuration stored in the ToC table of the page *title*.

        :param title: title of the page containing the table
        :param toc_table: the table tag, or ``None`` if the page has none
        :returns: a tuple ``(columns, category_names, alsoin)``: the list of
                  language tags for the columns, the localized category
                  names extracted from the table and the translations of
                  the "also in" phrase
        """
        # default format is one column in the title's language
        columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
        category_names = LowercaseDict()
        alsoin = {}

        if toc_table is not None:
            # parse data-toc-languages attribute
            try:
                _languages = str(toc_table.get("data-toc-languages").value)
                # tolerate whitespace around the tags, e.g. "en, cs"
                columns = [tag.strip() for tag in _languages.split(",")]
            except ValueError:
                # the attribute is missing: store the default on the tag
                toc_table.add("data-toc-languages", value=",".join(columns))

            # parse data-toc-alsoin attribute
            if toc_table.has("data-toc-alsoin"):
                alsoin = self.parse_alsoin(title, str(toc_table.get("data-toc-alsoin").value))
            elif columns != ["en"]:
                logger.warning("Page [[{}]]: missing 'also in' translations".format(title))

            # extract localized category names (useful even for PlainFormatter)
            category_names = self.extract_translations(toc_table.contents)

        return columns, category_names, alsoin

    def parse_alsoin(self, title, value):
        """
        Parse the value of the ``data-toc-alsoin`` attribute.

        *value* is a comma-separated list of ``tag:translation`` items. An
        item without a valid language tag is taken as the translation for
        the language of the page *title*. Empty items are ignored.

        :returns: a dict mapping language tags to translations
        """
        alsoin = {}
        for item in value.split(","):
            item = item.strip()
            if not item:
                # An empty item (e.g. from a trailing comma) carries no
                # translation; without this check it would overwrite the
                # entry for the page's language with an empty string.
                continue
            try:
                tag, translation = item.split(":", maxsplit=1)
                tag = tag.strip()
                translation = translation.strip()
                if not lang.is_language_tag(tag):
                    raise ValueError
            except ValueError:
                # no (valid) tag given: the whole item is the translation
                tag = lang.tag_for_langname(lang.detect_language(title)[1])
                translation = item
            alsoin[tag] = translation
        logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
        return alsoin

    def extract_translations(self, wikicode):
        """
        Collect the localized category names from the links in *wikicode*.

        Only links with a leading colon, pointing to the Category namespace
        and having a non-trivial link text are considered.

        :returns: a :py:class:`LowercaseDict` mapping category titles to
                  the link texts
        """
        dictionary = LowercaseDict()
        for wikilink in wikicode.ifilter_wikilinks(recursive=True):
            # skip catlinks without leading colon
            if not wikilink.title.startswith(":"):
                continue
            title = self.api.Title(wikilink.title)
            if title.namespace == "Category" and wikilink.text:
                # skip trivial cases to apply our defaults
                pure, _ = lang.detect_language(title.pagename)
                if wikilink.text.lower() != title.pagename.lower() and wikilink.text.lower() != pure.lower():
                    dictionary[str(title)] = str(wikilink.text).strip()
        return dictionary

    def run(self):
        """
        Generate the table of contents for all pages in ``self.titles`` and
        print or save it according to the command line arguments.
        """
        if not self.cliargs.anonymous:
            require_login(self.api)

        # if we are going to save, make sure that the categories are correct first
        if self.cliargs.save is True:
            cat = Categorization(self.api)
            cat.fix_allpages()
            decat = Decategorization(self.api)
            decat.fix_allpages()

        # build category graph
        graph = CategoryGraph(self.api)

        # if we are going to save, init wanted categories
        if self.cliargs.save is True:
            graph.init_wanted_categories()

        # detect target pages, fetch content at once
        page = AutoPage(self.api, fetch_titles=self.titles)

        for title in self.titles:
            try:
                page.set_title(title)
            except ValueError:
                # page not fetched
                continue

            toc_table = page.get_tag_by_id(tag="table", id="wiki-scripts-toc-table")
            columns, category_names, alsoin = self.parse_toc_table(title, toc_table)

            if toc_table is None:
                if self.cliargs.save is True:
                    # there is nowhere to write the result
                    logger.error(
                            "The wiki page [[{}]] does not contain the ToC table. "
                            "Create the following entry point manually:\n"
                            "{{| id=\"wiki-scripts-toc-table\"\n...\n|}}".format(title))
                    continue
                else:
                    logger.warning(
                            "The wiki page [[{}]] does not contain the ToC table, "
                            "so there will be no translations.".format(title))

            if self.cliargs.print:
                ff = PlainFormatter(graph.parents, graph.info, category_names, alsoin)
            elif self.cliargs.save:
                # the opening and closing tokens are already on the page
                ff = MediaWikiFormatter(graph.parents, graph.info, category_names, alsoin, include_opening_closing_tokens=False)
            else:
                raise NotImplementedError("unknown output action: {}".format(self.cliargs.save))

            roots = ["Category:{}".format(lang.langname_for_tag(c)) for c in columns]
            ff.format_root(roots)
            if len(roots) == 1:
                for item in graph.walk(graph.subcats, roots[0]):
                    ff.format_row(item)
            elif len(roots) == 2:
                for result in graph.compare_components(graph.subcats, *roots):
                    ff.format_row(*result)
            else:
                logger.error("Cannot compare more than 2 languages at once. Requested: {}".format(columns))
                continue

            if self.cliargs.print:
                print("== {} ==\n".format(title))
                print(ff)
            elif self.cliargs.save:
                toc_table.contents = str(ff)
                if self.cliargs.force or page.is_old_enough(min_interval=datetime.timedelta(days=1), strip_time=True):
                    try:
                        page.save(self.cliargs.summary)
                    except APIError as e:
                        # A failed save must not abort the remaining pages,
                        # but it should not go unnoticed either.
                        logger.error("Failed to save the page [[{}]]: {}".format(title, e))
                else:
                    logger.info("The page [[{}]] has already been updated this UTC day.".format(title))
if __name__ == "__main__":
    import ws.config

    # Script entry point: the object is obtained from ws.config's
    # object_from_argparser helper and then run.
    script = ws.config.object_from_argparser(
        TableOfContents,
        description="Build a presentation of the wiki's table of contents",
    )
    script.run()