# IMPORTS

In [None]:
%load_ext cython

In [None]:
import cfg

import cPickle
import gzip
import json
import os
import pandas as pd
import time
from collections import OrderedDict

import progress_bar as pb
from efficient_query_expansion.normalize_text import normalize_text, get_stopword_set
from efficient_query_expansion.query_expansion_support import QueryExpansionSupport

# LOAD ALL THE DOCUMENTS, QUERIES AND ASSOCIATIONS IN MEMORY

In [None]:
%%time
def jsonConvertKeys(constructor):
    """Build a ``json.load`` object hook that converts the keys of each decoded object.

    The returned callable receives a decoded JSON object and returns a new
    dict whose keys are ``constructor(key)``; the values are left untouched.
    """
    def convert_keys(decoded):
        return {constructor(key): value for key, value in decoded.iteritems()}

    return convert_keys

# Load the training data. JSON object keys are always strings, so the object
# hook converts them back to integers (int for query ids, long for document ids).
# Each file is opened in a `with` block so its handle is closed right after parsing.
with open(cfg.training_dir + "qid_to_query.json", "r") as infile:
    qid_to_query = json.load(infile, object_hook=jsonConvertKeys(int))
with open(cfg.training_dir + "qid_to_docid_list.json", "r") as infile:
    qid_to_docid_list = json.load(infile, object_hook=jsonConvertKeys(int))
with open(cfg.training_dir + "docid_to_rawtext.json", "r") as infile:
    docid_to_rawtext = json.load(infile, object_hook=jsonConvertKeys(long))

# sanity checks: there are as many queries as document lists,
# and the raw text of every referenced document is available
assert len(qid_to_query) == len(qid_to_docid_list)
assert all(docid in docid_to_rawtext for docid_list in qid_to_docid_list.itervalues() for docid in docid_list)

In [None]:
%%time
# normalized text of every document, keyed by document id
docid_to_text = {
    docid: normalize_text(raw_text)
    for docid, raw_text in pb.iteritems_progress(docid_to_rawtext)
}

In [None]:
# invert the qid_to_docid_list associations: map every document id
# to the list of the query ids it is associated with
docid_to_qid_list = dict()
for qid, docid_list in qid_to_docid_list.iteritems():
    for docid in docid_list:
        # start a new list the first time a document is seen, then append the query id
        docid_to_qid_list.setdefault(docid, []).append(qid)

# LOAD REWRITING STRATEGIES SUPPORT

In [None]:
# Load the pickled expansion support data from the thesaurus directory (%time reports how long the load takes).
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
%time expansion_support = cPickle.load(open(cfg.thesaurus_dir + "expansion_support.pickle", "rb"))

In [None]:
# good_unary_terms is an over-approximation of the set of good terms:
# it was filled with all the terms having document frequency greater than 20.
# Each line of the gzip file, stripped of surrounding whitespace, becomes one term of the set.
%time good_unary_terms = set(line.strip() for line in gzip.open(cfg.raw_dir + "frequent_terms.txt.gz"))
# report how many distinct terms were loaded
print len(good_unary_terms)

In [None]:
# collection-dependent term statistics: these dictionaries depend on the dataset.
# The pickle files are opened in binary mode ("rb"), like expansion_support.pickle,
# because pickled data is not text; the `with` blocks close the handles after loading.
# NOTE(review): unpickling can execute arbitrary code, so these files must come from a trusted source.

# segment_to_phrase_freq contains the document frequency of each thesaurus' n-gram
with open(cfg.thesaurus_dir + "segment_to_phrase_freq.pickle", "rb") as infile:
    segment_to_phrase_freq = cPickle.load(infile)

# segment_to_and_freq contains the number of documents containing all terms of each thesaurus' n-gram.
# to avoid duplicates the keys are collapsed according to: " ".join(sorted(ngram.split()))
with open(cfg.thesaurus_dir + "segment_to_and_freq.pickle", "rb") as infile:
    segment_to_and_freq = cPickle.load(infile)

In [None]:
# every multi-word thesaurus segment must have both its phrase frequency and
# its AND frequency (the latter is keyed by the sorted terms of the segment)
assert not any(
    segment not in segment_to_phrase_freq or " ".join(sorted(segment.split())) not in segment_to_and_freq
    for segment in expansion_support["segment_id_to_segment"]
    if " " in segment
)

In [None]:
# stopword set, used by query_to_base and passed to QueryExpansionSupport
stopwords = get_stopword_set()

In [None]:
# bundle the thesaurus support data, the frequent terms, the stopwords and the two
# frequency tables into the object that provides the thesaurus-based rewriting strategy
qes = QueryExpansionSupport(expansion_support, good_unary_terms, stopwords, segment_to_phrase_freq, segment_to_and_freq)

# REWRITING STRATEGIES

In [None]:
# creates the query representation without any expansion
def query_to_base(query):
    """Return the representation of ``query`` without any expansion.

    The query is normalized with ``normalize_text`` and split on whitespace.
    Terms found in the module-level ``stopwords`` set are dropped, unless that
    would leave no term at all, in which case every original term is kept.

    Returns a pair ``([base_query], [candidates])`` where ``base_query`` holds
    one single-word synset ``[(term,)]`` per term and ``candidates`` holds one
    empty candidate list per term.
    """
    # normalize and tokenize the text
    terms = normalize_text(query).split()

    # remove the stop words, but if the query is composed only of stopwords use all original terms.
    # A list comprehension is used instead of filter() so that the "or" fallback
    # does not depend on filter() returning a (possibly empty) list.
    terms = [term for term in terms if term not in stopwords] or terms

    # simulate the "synset" to match the signature. The synset is composed only of the term itself (and its tag)
    base_query = [[(term,)] for term in terms]
    candidates = [[] for _ in terms]

    # the query is composed of only one "segmentation".
    # In case of more segmentations the two arrays contain more CNF queries
    return [base_query], [candidates]

In [None]:
# rewriting strategies by name, kept in the order in which they are run
strategies = OrderedDict()
strategies["Base"] = query_to_base
strategies["SegmentedThesaurusExpansion"] = qes.get_all_theraurus_expansions

In [None]:
%%time
# results of every strategy: strategy name -> (qid -> rewritten query / candidate expansions)
strategy_name_to_qid_to_base_query = OrderedDict()
strategy_name_to_qid_to_candidates = OrderedDict()
# row labels and rows of the timing table
keys, table = [], []
for strategy_name, strategy in strategies.iteritems():
    # register the (still empty) result dictionaries of this strategy, then fill them
    qid_to_base_query = strategy_name_to_qid_to_base_query[strategy_name] = dict()
    qid_to_candidates = strategy_name_to_qid_to_candidates[strategy_name] = dict()

    start_time = time.time()
    for qid, query in pb.iteritems_progress(
        qid_to_query,
        labeling_fun={"prefix":strategy_name},
        hide_bar_on_success=True
    ):
        qid_to_base_query[qid], qid_to_candidates[qid] = strategy(query)
    keys.append(strategy_name)
    # average number of seconds spent to rewrite one query
    table.append([(time.time() - start_time) / len(qid_to_base_query)])

    # drop the loop-local aliases: the dictionaries stay reachable through the two OrderedDicts
    del qid_to_base_query, qid_to_candidates
# this cell takes about 7min 30s to run

In [None]:
# show, for each strategy, the average time (in seconds) spent to rewrite one query
pd.DataFrame(table, index=keys, columns=["Avg. expansions time"])

# COMPUTE THE NUMBER OF MATCHES OF THE EXPANDED QUERIES

In [None]:
%%cython

def query_match(text, base_query, candidates):
    """Tell whether ``text`` satisfies at least one AND-query of a rewritten query.

    ``base_query`` is a sequence of AND-queries; each AND-query is a sequence of
    synsets and each synset is a list of word/tags sequences whose first item is
    the word. ``candidates[i][j]`` lists the extra word/tags sequences accepted
    for synset ``j`` of AND-query ``i``.

    A word matches when it occurs in ``text`` surrounded by single spaces, so a
    word at the very beginning or end of ``text`` matches only if ``text`` is
    padded with spaces. Empty AND-queries are ignored.
    """
    for i, and_query in enumerate(base_query):
        if len(and_query) == 0:
            continue

        # and level: each synset must match the text
        every_synset_matches = True
        for j, synset in enumerate(and_query):
            # or level: at least one word of the synset or of its candidates must be in the text
            alternatives = synset + candidates[i][j]
            if not any((" " + word_and_tags[0] + " ") in text for word_and_tags in alternatives):
                every_synset_matches = False
                break

        # or level: one fully matched AND-query is enough
        if every_synset_matches:
            return True

    return False

## Compute the number of matches of each rewrite

In [None]:
%%time
# for every strategy, the number of associated documents matched by each query
strategy_name_to_qid_to_num_match = OrderedDict()

for strategy_name in strategies:
    strategy_name_to_qid_to_num_match[strategy_name] = dict.fromkeys(qid_to_docid_list, 0)

for docid, doc_text in pb.iteritems_progress(docid_to_text):
    # documents that are not associated with any query contribute nothing
    for qid in docid_to_qid_list.get(docid, ()):
        # for each strategy check if the query matches the document
        for strategy_name, qid_to_base_query in strategy_name_to_qid_to_base_query.iteritems():
            if query_match(doc_text, qid_to_base_query[qid], strategy_name_to_qid_to_candidates[strategy_name][qid]):
                strategy_name_to_qid_to_num_match[strategy_name][qid] += 1

# GROUND TRUTH BUILD (using the same format used previously)

In [None]:
# name of the strategy whose rewrites are used to build the ground truth
strategy_name = "SegmentedThesaurusExpansion"

In [None]:
# fail early if the selected strategy is not one of the registered strategies
assert strategy_name in strategies

In [None]:
# for the training of the models we consider only the queries having at least one candidate expansion that can improve its recall.
queries_with_recall_improvement = [
    qid
    for qid, num_match in strategy_name_to_qid_to_num_match[strategy_name].iteritems()
    if num_match > strategy_name_to_qid_to_num_match["Base"][qid]
]
print len(queries_with_recall_improvement)

## COMPUTE THE WORD OCCURRENCES OF EACH QUERY, NEEDED BY THE TRAINING

In [None]:
%%cython

cdef _get_word_set(query_repr):
    # Collect the distinct words of a query representation: the first item of
    # every word/tags sequence found in any synset of any AND-query.
    words = set()
    for and_query in query_repr:
        for synset in and_query:
            for word_and_tags in synset:
                words.add(word_and_tags[0])
    return words

def compute_word_occurrence_set(base_query, candidates, docid_list, docid_to_text):
    """Map each word of a rewritten query to the set of documents containing it.

    The words are those found in ``base_query`` or in ``candidates``. For each
    word the result holds the ids of ``docid_list`` whose text, looked up in
    ``docid_to_text``, contains the word surrounded by single spaces.
    """
    word_to_docid_set = dict()
    for word in _get_word_set(base_query) | _get_word_set(candidates):
        matching_docids = set()
        for docid in docid_list:
            if (" " + word + " ") in docid_to_text[docid]:
                matching_docids.add(docid)
        word_to_docid_set[word] = matching_docids
    return word_to_docid_set

In [None]:
%%time
# word occurrences of each query: qid -> (word -> set of the query's documents containing the word)
qid_to_word_to_occurrence_set = dict()

for qid in pb.iter_progress(qid_to_query):
    # queries with a negative id are left out
    if qid >= 0:
        qid_to_word_to_occurrence_set[qid] = compute_word_occurrence_set(
            strategy_name_to_qid_to_base_query[strategy_name][qid],
            strategy_name_to_qid_to_candidates[strategy_name][qid],
            qid_to_docid_list[qid],
            docid_to_text
        )

## SAVE THE GROUND TRUTH

In [None]:
# create the output directory if it does not exist yet; os.makedirs also creates
# any missing parent directory, where os.mkdir would raise an OSError
if not os.path.isdir(cfg.processed_dir):
    os.makedirs(cfg.processed_dir)

In [None]:
%%time
# store the ids of the queries with a recall improvement, using the highest pickle protocol available
with open(cfg.processed_dir + "queries_with_recall_improvement.pickle", "wb") as outfile:
    cPickle.dump(queries_with_recall_improvement, outfile, cPickle.HIGHEST_PROTOCOL)

In [None]:
%%time
# store the word occurrences of each query, using the highest pickle protocol available
with open(cfg.processed_dir + "qid_to_word_to_occurrence_set.pickle", "wb") as outfile:
    cPickle.dump(qid_to_word_to_occurrence_set, outfile, cPickle.HIGHEST_PROTOCOL)

In [None]:
%%time
# store the rewritten queries of the selected strategy, restricted to the qids found in qid_to_query
with open(cfg.processed_dir + "qid_to_base_query.pickle", "wb") as outfile:
    cPickle.dump(
        {
            qid: base_query
            for qid, base_query in strategy_name_to_qid_to_base_query[strategy_name].iteritems()
            if qid in qid_to_query
        },
        outfile,
        cPickle.HIGHEST_PROTOCOL
    )

In [None]:
%%time
# store the candidate expansions of the selected strategy, restricted to the qids found in qid_to_query
with open(cfg.processed_dir + "qid_to_candidates.pickle", "wb") as outfile:
    cPickle.dump(
        {
            qid: candidates
            for qid, candidates in strategy_name_to_qid_to_candidates[strategy_name].iteritems()
            if qid in qid_to_query
        },
        outfile,
        cPickle.HIGHEST_PROTOCOL
    )