# PREAMBLE

In [None]:
import cfg

from collections import OrderedDict
import gzip
import re

import progress_bar as pb
import parallel_stream as ps
from normalize_text import normalize_text, normalize_hyphens, normalize_multiword

In [None]:
_src_tags_order = ("name", "alias", "redir")
_and_replacements = (" ", "", " and ", " n ", "n")
_parenthesis_re = re.compile("\s*\(.*?\)\s*$")


def normalize_aliases_raw(src_tag_to_aliases_raw, ampersand=True, hyphens=True, multiword=True, acronyms=True):
    """Build the normalized aliases of one entity, each with its provenance tags.

    The sources are processed in the fixed order given by ``_src_tags_order``.
    A normalized alias is stored the first time it is produced and never
    overwritten afterwards, so its tags describe the earliest way it was obtained.

    Parameters:
        src_tag_to_aliases_raw: dict mapping a source tag (only the tags listed
            in ``_src_tags_order`` are accepted) to an iterable of raw alias
            strings, or to None when that source is missing. Raw aliases
            containing ":" are ignored.
        ampersand: when true, also generate variants in which "&" is replaced
            by each entry of ``_and_replacements`` (tag ``norm&<index>``).
        hyphens: when true, also add the ``normalize_hyphens`` form of every
            raw variant (tag ``norm-``).
        multiword: when true, also add the ``normalize_multiword`` form of
            every raw variant (tag ``norm|``).
        acronyms: when true, append the tag ``acronym`` to aliases made of the
            initials of another multi-word alias and, when only one of the
            compact ("abc") and spaced ("a b c") forms exists, add the other
            one (tag ``norm.``).

    Returns:
        OrderedDict mapping each normalized alias to a tuple of tags. The
        first tag is the source tag, followed by the tags of the
        normalization steps that produced the alias.

    Raises:
        AssertionError: if the argument is not a dict, if it contains an
            unknown source tag, or if a resulting alias contains two
            consecutive spaces.
    """
    assert isinstance(src_tag_to_aliases_raw, dict)
    # every key must be one of the known source tags
    assert all(src_tag in _src_tags_order for src_tag in src_tag_to_aliases_raw)

    # ordered (src_tag, aliases_raw) pairs, skipping missing sources and aliases containing ":"
    src_tag__aliases_raw__pairs = tuple(
        (src_tag, tuple(alias_raw for alias_raw in src_tag_to_aliases_raw[src_tag] if ":" not in alias_raw))
        for src_tag in _src_tags_order
        if src_tag_to_aliases_raw.get(src_tag) is not None
    )

    # normalization steps applied, in this order, to every raw variant,
    # together with the extra tag each step contributes
    normalizers = [(normalize_text, ())]
    if hyphens:
        normalizers.append((normalize_hyphens, ("norm-",)))
    if multiword:
        normalizers.append((normalize_multiword, ("norm|",)))

    # result
    aliases = OrderedDict()

    for src_tag, aliases_raw in src_tag__aliases_raw__pairs:
        # raw variant -> tags of the raw-level transformations that produced it
        raw_variants = OrderedDict()

        # aliases that lose a trailing parenthesis are ambiguous: they are
        # inserted after the untouched ones so that the latter take precedence
        postponed_insertions = []
        for alias_raw in aliases_raw:
            new_alias_raw = _parenthesis_re.sub("", alias_raw)
            if len(new_alias_raw) != len(alias_raw):
                postponed_insertions.append(new_alias_raw)
            elif alias_raw not in raw_variants:
                raw_variants[alias_raw] = ()
        for alias_raw in postponed_insertions:
            if alias_raw not in raw_variants:
                raw_variants[alias_raw] = ("norm()",)

        # "&" variants; iterate over a snapshot because the loop inserts new
        # entries (which never contain "&", so they need no further expansion)
        if ampersand:
            for alias_raw, tags in list(raw_variants.items()):
                if "&" not in alias_raw:
                    continue
                alias_raw_splitted = alias_raw.strip().split("&")
                for i, and_replacement in enumerate(_and_replacements):
                    new_alias_raw = and_replacement.join(alias_raw_splitted)
                    if new_alias_raw not in raw_variants:
                        raw_variants[new_alias_raw] = ("norm&" + str(i),) + tags

        # default normalization first, then the optional ones
        for normalize, extra_tags in normalizers:
            for alias_raw, tags in raw_variants.items():
                alias = normalize(alias_raw)
                if alias not in aliases:
                    aliases[alias] = (src_tag,) + extra_tags + tags

    # acronyms normalization
    if acronyms:
        # new acronym forms are collected here and merged after the loop,
        # so that `aliases` does not grow while it is being iterated
        acronyms_support = OrderedDict()
        for alias in aliases:
            if " " not in alias:
                continue
            # get the initial letters
            initials = [
                letter
                for i, letter in enumerate(alias)
                if i == 0 or (alias[i - 1] == ' ' and letter != ' ')
            ]
            if len(initials) <= 1:
                continue

            # the compact and the spaced acronym built from the initials
            compact_acronym = ''.join(initials)
            spaced_acronym = ' '.join(initials)

            # tag the forms that are already among the aliases; inside this
            # loop "acronym" can only be the last tag, so checking it is
            # enough to avoid adding it twice
            for acronym in (compact_acronym, spaced_acronym):
                if acronym not in aliases:
                    continue
                if len(aliases[acronym]) == 0 or aliases[acronym][-1] != "acronym":
                    aliases[acronym] += ("acronym", )

            # if only one of the two forms appears among the aliases, add the other one
            if compact_acronym in aliases:
                if spaced_acronym not in aliases:
                    acronyms_support[spaced_acronym] = aliases[compact_acronym] + ("norm.", )
            elif spaced_acronym in aliases:
                acronyms_support[compact_acronym] = aliases[spaced_acronym] + ("norm.", )

        aliases.update(acronyms_support)

    # discard empty aliases if any
    aliases.pop("", None)

    # assertions
    assert all(("  " not in alias) for alias in aliases)

    # return the aliases
    return aliases

# PROCESS WIKIPEDIA RAW INTERMEDIATE FILES

In [None]:
# Templates of the per-part input files. The doubled braces survive this first
# format() call as a literal "{}" that is later filled with the part number;
# the other field receives cfg.wiki_preprocessing_split_into.
path_template_aliases = cfg.processed_dir + (
    "wikipedia_raw/aliases.part_{{}}_{}.tsv.gz"
    .format(cfg.wiki_preprocessing_split_into)
)
path_template_redirects = cfg.processed_dir + (
    "wikipedia_raw/redirects.part_{{}}_{}.tsv.gz"
    .format(cfg.wiki_preprocessing_split_into)
)

In [None]:
# Title prefixes (the text before the first ":") of the pages to discard.
# get from https://en.wikipedia.org/wiki/Help:Category
exclude_category_list = [
    "User", "Wikipedia", "File", "MediaWiki", "Template", "Help", "Category", "Portal", "Book", "Draft", "Education Program", "TimedText", "Module", "Gadget", "Gadget definition"
]

# Lower-cased copy used for the lookup: the prefix of a title is lower-cased
# before being compared, so the names above must be lower-cased as well.
_exclude_category_set = frozenset(category.lower() for category in exclude_category_list)


def filter_page_title_raw(page_title_raw):
    """Tell whether a page must be discarded because of its title.

    Parameters:
        page_title_raw: the raw page title.

    Returns:
        True when the text before the first ":" is, ignoring case, one of the
        names in ``exclude_category_list``, or when the title starts with
        "List of "; False otherwise.
    """
    p = page_title_raw.find(":")
    if p > 0 and page_title_raw[:p].lower() in _exclude_category_set:
        return True
    # also reached by titles whose prefix is not excluded, so that list pages
    # with a colon in their title are still discarded
    return page_title_raw.startswith("List of ")

## READ REDIRECTS

In [None]:
%%time
# redirect target title -> titles of the pages that redirect to it
page_title_to_in_link_page_title_list = dict()

pbar = pb.ProgressBar()
for part in xrange(1, cfg.wiki_preprocessing_split_into+1):
    with gzip.open(path_template_redirects.format(part)) as redirects_file:
        for raw_line in redirects_file:
            pbar.increase()

            # drop the last character (the line terminator) and split the three columns
            columns = raw_line[:-1].decode('utf-8').split("\t")
            page_id, page_title_src, page_title_dest = columns

            # keep the redirect only when neither end is a filtered page
            if not (filter_page_title_raw(page_title_src) or filter_page_title_raw(page_title_dest)):
                page_title_to_in_link_page_title_list.setdefault(page_title_dest, []).append(page_title_src)
pbar.stop(True)
# it lasts 1min 40s

## READ ALIASES

In [None]:
%%time
# page title -> (page id, tuple of raw aliases without the title itself)
page_title_to_page_id_aliases_raw = dict()

pbar = pb.ProgressBar()
for part in xrange(1, cfg.wiki_preprocessing_split_into+1):
    with gzip.open(path_template_aliases.format(part)) as aliases_file:
        for raw_line in aliases_file:
            pbar.increase()

            # page id, page title and the remaining tab-separated aliases
            page_id, page_title, aliases_field = raw_line[:-1].decode('utf-8').split("\t", 2)

            if filter_page_title_raw(page_title):
                continue

            # an empty last column means that the page has no aliases
            aliases_raw = set(aliases_field.split("\t")) if aliases_field else set()

            # a title seen before: warn and merge with the aliases already collected
            previous_entry = page_title_to_page_id_aliases_raw.get(page_title)
            if previous_entry is not None:
                print(u"'{}' is already in".format(page_title))
                aliases_raw.update(previous_entry[1])

            # the title itself is not an alias
            aliases_raw.discard(page_title)

            # the page id of the latest occurrence wins
            page_title_to_page_id_aliases_raw[page_title] = (page_id, tuple(aliases_raw))
pbar.stop(True)
# it lasts 2min

## WRITE ON DISK

### WRITE THE ALIASES

In [None]:
# Every page title that is the target of at least one redirect or has an entry read from the aliases files.
page_title_set = set(page_title_to_in_link_page_title_list) | set(page_title_to_page_id_aliases_raw)

In [None]:
def _emitter(outqueue):
    """Put every title of ``page_title_set`` on the output queue."""
    enqueue = outqueue.put
    for title in page_title_set:
        enqueue(title)

def _worker(worker_id, inqueue, outqueue):
    """Turn each page title read from ``inqueue`` into one output line.

    The line has the form ``<page id>\\t<alias>:<tag>,<tag>...\\t...\\n``.
    Titles without an aliases entry get "-" as page id. Nothing is emitted
    when the normalization yields no alias.
    """
    for page_title in inqueue:
        # page id and raw aliases, when the page has an aliases entry
        page_id, aliases_raw = page_title_to_page_id_aliases_raw.get(page_title, ("-", None))

        # the three sources: the title itself, its aliases and its in-redirects
        src_tag_to_aliases_raw = {
            "name": (page_title,),
            "alias": aliases_raw,
            "redir": page_title_to_in_link_page_title_list.get(page_title, None),
        }

        # normalize all the information
        aliases_to_tags = normalize_aliases_raw(src_tag_to_aliases_raw)

        # format the output line
        if aliases_to_tags:
            fields = "\t".join(
                "{}:{}".format(alias, ",".join(tags))
                for alias, tags in aliases_to_tags.iteritems()
            )
            outqueue.put("{}\t{}\n".format(page_id, fields))

def _collector(inqueue):
    """Write every line received from ``inqueue`` to the gzipped Wikipedia aliases file."""
    out_path = cfg.processed_dir + "wikipedia.aliases.tsv.gz"
    with gzip.open(out_path, "w") as sink:
        for formatted_line in pb.iter_progress(inqueue):
            sink.write(formatted_line)

# Run the emitter -> worker -> collector pipeline defined in this cell.
ps.parallel_stream(
    _emitter, _worker, _collector,
    # parallelism
    n_jobs=-1,
    fork_collector=False,
    # queue sizes
    emitter_queue_size=100,
    collector_queue_size=100,
    # chunk sizes
    emitter_output_chunk_size=100,
    worker_output_chunk_size=100,
)
# it lasts 3min 50s

# PROCESS WIKIDATA RAW INTERMEDIATE FILE

In [None]:
def _emitter(outqueue):
    """Put every line of the gzipped Wikidata raw aliases file on the output queue."""
    source_path = cfg.processed_dir + "wikidata_raw/aliases.tsv.gz"
    with gzip.open(source_path, "r") as source:
        for raw_line in source:
            outqueue.put(raw_line)

def _worker(worker_id, inqueue, outqueue):
    """Turn each raw Wikidata line read from ``inqueue`` into one output line.

    An input line is ``<entity id>\\t<name>\\t<alias>\\t...``: the first value
    after the id is used as the name, the remaining ones as aliases. The
    output line is ``<entity id>\\t<alias>:<tag>,<tag>...\\t...\\n``. Nothing
    is emitted when the normalization yields no alias.
    """
    for raw_line in inqueue:
        # drop the last character (the line terminator) and separate the id from the rest
        entity_id, labels_field = raw_line[:-1].decode('utf-8').split("\t", 1)
        labels = labels_field.split("\t")

        # normalization of the name and of the aliases
        aliases_to_tags = normalize_aliases_raw({"name": labels[:1], "alias": labels[1:]})

        # format the output line
        if aliases_to_tags:
            fields = "\t".join(
                "{}:{}".format(alias, ",".join(tags))
                for alias, tags in aliases_to_tags.iteritems()
            )
            outqueue.put("{}\t{}\n".format(entity_id, fields))

def _collector(inqueue):
    """Write every line received from ``inqueue`` to the gzipped Wikidata aliases file."""
    out_path = cfg.processed_dir + "wikidata.aliases.tsv.gz"
    with gzip.open(out_path, "w") as sink:
        for formatted_line in pb.iter_progress(inqueue):
            sink.write(formatted_line)

# Run the emitter -> worker -> collector pipeline defined in this cell.
ps.parallel_stream(
    _emitter, _worker, _collector,
    # parallelism
    n_jobs=-1,
    fork_collector=False,
    # queue sizes
    emitter_queue_size=100,
    collector_queue_size=100,
    # chunk sizes
    emitter_output_chunk_size=100,
    worker_output_chunk_size=100,
)
# it lasts 6min 40s