# PREAMBLE

In [None]:
%load_ext cython

In [None]:
import cfg  # to include python_libs

import codecs
import cPickle as cPickle
import gzip
import os
import pattern.en

import progress_bar as pb
from efficient_query_expansion.normalize_text import normalize_text, normalize_hyphens, get_stopword_set

## STOPWORDS AND GOOD TERMS

In [None]:
# Fail fast if the frequent-terms dump is missing from cfg.raw_dir.
# NOTE(review): the original "available at" note never recorded where the file can be obtained - TODO add it.
assert os.path.isfile(cfg.raw_dir + "frequent_terms.txt.gz")

In [None]:
# Build the stopword set once (timed with %time); it is read as a global by
# add_entity_aliases and passed explicitly to _filter_support by the thesaurus loader.
%time stopwords = get_stopword_set()

In [None]:
# this set of terms represents an overestimation of the good terms.
# We filled it with all terms having document frequency greater than 20
%time good_unary_terms = set(line.strip() for line in gzip.open(cfg.raw_dir + "frequent_terms.txt.gz"))
print len(good_unary_terms)

# SUPPORT FOR THE EXPORT

In [None]:
# segment text <-> segment id; a segment id is the position of the segment
# inside segment_id_to_segment
segment_id_to_segment = list()
segment_to_segment_id = {}

# spot-related segments (not populated by the visible cells)
segment_id_to_segment_id_segment_sim_list = {}

# entity-related segments:
#  - position = segment id, value = tuple of (entity id, alias tags) entries
#  - position = entity id, value = (entity tags, tuple of segment ids)
segment_id_to_entity_id_tags_list = list()
entity_id_to_tags_segment_id_list = list()

In [None]:
%%cython

def _filter_segment_support(segment, stopwords, good_unary_terms):
    if not segment:
        return False

    # split the segment into words
    segment_split = segment.split()
    # consider only aliases with at most 6 words (i.e. 5 spaces) and for which there is at least one word that is not a stopword.
    return 0 < len(segment_split) <= 6 and all(w in good_unary_terms for w in segment_split) and not all(w in stopwords for w in segment_split)

def _filter_support(segment_iterator, stopwords, good_unary_terms):
    """Return the distinct segments that pass _filter_segment_support.

    The input is de-duplicated through a set (unless it already is one), so
    the order of the returned list follows the set iteration order.
    """
    if isinstance(segment_iterator, set):
        unique_segments = segment_iterator
    else:
        unique_segments = set(segment_iterator)

    accepted = []
    for candidate in unique_segments:
        if _filter_segment_support(candidate, stopwords, good_unary_terms):
            accepted.append(candidate)
    return accepted

def _add_segment_support(segment, segment_to_segment_id, segment_id_to_segment):
    segment_id = segment_to_segment_id.get(segment, None)
    if segment_id is None:
        # new segment
        segment_id = segment_to_segment_id[segment] = len(segment_id_to_segment)
        segment_id_to_segment.append(segment)
        return (segment_id, True)
    else:
        # segment already in
        return (segment_id, False)

In [None]:
def add_entity_aliases(alias_to_tags, entity_tags):
    """Register one entity and its usable aliases in the global support structures.

    Parameters:
        alias_to_tags: dict mapping each alias (str) to a tuple of str tags.
        entity_tags: tuple of str tags describing the entity itself.

    Aliases rejected by _filter_segment_support (using the global stopwords
    and good_unary_terms) are dropped. The entity is skipped entirely when no
    alias survives, or when the only surviving alias is a single word.

    Side effects (all on module-level structures):
        - new aliases are registered in segment_to_segment_id / segment_id_to_segment;
        - an (entity_id, alias tags) entry is added to
          segment_id_to_entity_id_tags_list at the position of each alias;
        - (entity_tags, segment ids) is appended to
          entity_id_to_tags_segment_id_list; the entity id is its position there.

    Raises AssertionError when the parameters do not have the expected types.
    """
    # parameters check
    assert isinstance(alias_to_tags, dict) and all(
        isinstance(alias, str) and isinstance(tags, tuple) and all(isinstance(tag, str) for tag in tags)
        for alias, tags in alias_to_tags.iteritems()
    )
    assert isinstance(entity_tags, tuple) and all(isinstance(tag, str) for tag in entity_tags)

    # keep only the aliases accepted by _filter_segment_support
    alias_to_tags = dict(
        (alias, tags)
        for alias, tags in alias_to_tags.iteritems()
        if _filter_segment_support(alias, stopwords, good_unary_terms)
    )

    if not alias_to_tags:
        return
    # An entity with a single alias has no synonyms, so it is kept only when
    # that alias is a multi-term one: the segmentation will put together terms
    # that, if expanded alone, would have a different meaning.
    if len(alias_to_tags) == 1 and " " not in next(iter(alias_to_tags)):
        return

    # fill the support structures
    entity_id = len(entity_id_to_tags_segment_id_list)
    segment_id_list = []
    for segment, tags in alias_to_tags.iteritems():
        segment_id = _add_segment_support(segment, segment_to_segment_id, segment_id_to_segment)[0]

        # segment_id_to_entity_id_tags_list is indexed by segment id. Segments
        # may also be registered by code that does not touch this list (e.g.
        # the thesaurus loader), so pad it with empty tuples up to this id
        # instead of assuming the two lists always grow together.
        while len(segment_id_to_entity_id_tags_list) <= segment_id:
            segment_id_to_entity_id_tags_list.append(())
        segment_id_to_entity_id_tags_list[segment_id] += ((entity_id, tags),)

        segment_id_list.append(segment_id)
    entity_id_to_tags_segment_id_list.append((entity_tags, tuple(segment_id_list)))

# FILL THE SUPPORT STRUCTURES WITH THE WIKIPEDIA ENTITIES

In [None]:
def _alias_tags_str_to_dict_entry(alias_tags_str):
    p = alias_tags_str.find(":")
    return (alias_tags_str[:p], tuple(alias_tags_str[p+1:].split(",")))

## ALIASES

In [None]:
%%time
wikipedia_entity_tags = ("WPEnt", )

with gzip.open(cfg.processed_dir + "wikipedia.aliases.tsv.gz", "r") as infile:
    for line in pb.iter_progress(infile):
        p = line.find("\t")
        entity_id = line[:p]
        alias_to_tags = dict(
            _alias_tags_str_to_dict_entry(alias_tags)
            for alias_tags in line[p+1:-1].split("\t")  # -1 because the last character is always the \n
        )

        # add this entity
        add_entity_aliases(alias_to_tags, wikipedia_entity_tags)
# it should last 17min

# FILL THE SUPPORT STRUCTURES WITH THE WIKIDATA ENTITIES

In [None]:
%%time
wikidata_entity_tags = ("WDEnt", )

with gzip.open(cfg.processed_dir + "wikidata.aliases.tsv.gz", "r") as infile:
    for line in pb.iter_progress(infile):
        p = line.find("\t")
        entity_id = line[:p]

        # exclude wiki properties from this export
        if entity_id[0] == 'P':
            continue

        alias_to_tags = dict(
            _alias_tags_str_to_dict_entry(alias_tags)
            for alias_tags in line[p+1:-1].split("\t")  # -1 because the last character is always the \n
        )

        add_entity_aliases(alias_to_tags, wikidata_entity_tags)
# it should last 48min

# FILL THE SUPPORT STRUCTURES WITH THE OPENOFFICE ENTRIES

In [None]:
# Fail fast if the OpenOffice thesaurus dump is missing from cfg.raw_dir.
assert os.path.isfile(cfg.raw_dir + "thesaurus_en_openoffice_v1.txt.gz")

In [None]:
%%time
# thesaurus-related segments
segment_id_to_meaning_id_list = dict()
meaning_id_to_pos_segment_id_list = []

reader = gzip.open(cfg.raw_dir + "thesaurus_en_openoffice_v1.txt.gz", "rb")
try:
    # ignore the header line which contains the encoding
    encoding = reader.readline().strip()
    # adjust the encoding reader
    if encoding != "ASCII":
        reader = codecs.getreader(encoding)(reader)

    for row in pb.iter_progress(reader):
        row = row.strip()
        if row == "":
            continue

        # implicit check that the row contains only two values
        word_raw, num_meanings = row.split("|")
        # check if the row is not a meaning of another word
        if word_raw.startswith("("):
            raise Exception("Bad original word format")

        key_list = _filter_support(
            [normalize_text(word_raw), normalize_hyphens(word_raw)],
            stopwords,
            good_unary_terms
        )
        meanings = []
        # add the hyphen normalization in the synset, in such a way to expand one form in the other
        if len(key_list) > 1:
            meanings.append(
                ("Hyph", key_list)
            )

        # implicit check if num_meanings is an integer
        for i in xrange(int(num_meanings)):
            synonyms_raw = reader.readline().strip().split("|")

            if synonyms_raw[0][0] != '(' or synonyms_raw[0][-1] != ')':
                raise Exception("POS not recognized on a meaning line")
            pos = str(synonyms_raw[0][1:-1])
            synonyms_raw = synonyms_raw[1:]
            synonyms = set(map(normalize_text, synonyms_raw) + map(normalize_hyphens, synonyms_raw))
            #if pos == 'noun':
            #    synonyms |= set(normalize_text(pattern.en.pluralize(synonym, pos)) for synonym in synonyms)

            synset = _filter_support(
                synonyms,
                stopwords,
                good_unary_terms
            )
            if len(synset) > 0:
                meanings.append(
                    (pos, synset)
                )

        # export if there are meanings to expand and if the key_list is not empty
        if len(meanings) == 0 or len(key_list) == 0:
            continue

        key_id_list = [
            _add_segment_support(key, segment_to_segment_id, segment_id_to_segment)[0]
            for key in key_list
        ]
        start_meaning_len = len(meaning_id_to_pos_segment_id_list)
        for pos, synset in meanings:
            meaning_id_to_pos_segment_id_list.append(
                (pos, tuple(
                    _add_segment_support(term, segment_to_segment_id, segment_id_to_segment)[0]
                    for term in synset
                ))
            )
        meaning_id_list = tuple(xrange(start_meaning_len, len(meaning_id_to_pos_segment_id_list)))
        for key_id in key_id_list:
            if key_id in segment_id_to_meaning_id_list:
                segment_id_to_meaning_id_list[key_id] += meaning_id_list
            else:
                segment_id_to_meaning_id_list[key_id] = meaning_id_list
finally:
    reader.close()
# it should last 45s

# EXPORT DICT AND ENTITY SUPPORT

In [None]:
print "Segments recognized:     {: >8}".format(len(segment_id_to_segment))
print "Entity wikipedia entries:{: >8}".format(len(set(segment_id for tags, segment_id_list in entity_id_to_tags_segment_id_list if tags[0] == "WPEnt" for segment_id in segment_id_list)))
print "Entity wikidata entries: {: >8}".format(len(set(segment_id for tags, segment_id_list in entity_id_to_tags_segment_id_list if tags[0] == "WDEnt" for segment_id in segment_id_list)))
print "Entity toal entries:     {: >8}".format(len(segment_id_to_entity_id_tags_list))
print "Thesaurus entries:       {: >8}".format(len(segment_id_to_meaning_id_list))

In [None]:
%%time
# compact the lists
segment_id_to_segment = tuple(segment_id_to_segment)

entity_id_to_tags_segment_id_list = tuple(entity_id_to_tags_segment_id_list)
segment_id_to_entity_id_tags_list = tuple(segment_id_to_entity_id_tags_list)

meaning_id_to_pos_segment_id_list = tuple(meaning_id_to_pos_segment_id_list)

In [None]:
%%time
with open(cfg.thesaurus_dir + "expansion_support.pickle", "wb") as outfile:
    cPickle.dump({
            'segment_id_to_segment': segment_id_to_segment,

            'entity_id_to_tags_segment_id_list': entity_id_to_tags_segment_id_list,
            'segment_id_to_entity_id_tags_list': segment_id_to_entity_id_tags_list,

            'segment_id_to_meaning_id_list': segment_id_to_meaning_id_list,
            'meaning_id_to_pos_segment_id_list': meaning_id_to_pos_segment_id_list,
        }, outfile, protocol=cPickle.HIGHEST_PROTOCOL)