In [1]:
# The Function: prDualRank()
# It takes the following parameters:
#     (1) seedEidsWithConfidence: list of [seed, confidence] elements (confidence = 0 for all, it's not used)
#     (2) eid2patterns: map of eid t to its associated patterns
#     (3) pattern2eids: map of pattern p to its associated eids    
#     (4) eid2ename: map of eid to ename (simply, an ID to name map for the tuples)
#     (5) eidsFeatureCount: eidsFeatureCount[t][p] stores |I_tp|
#     (6) eids2Sup: map of eid t to |I_t|
#     (7) patterns2Sup: map of pattern p to |I_p|
#     -------- debug parameters -------- 
#     (8) FLAGS_VERBOSE=False
#     (9) FLAGS_DEBUG=False
# 
#     -------- currently unused parameters -------- 
#     ---------------------- negativeSeedEids: not used!
#     ---------------------- eidAndPattern2strength: not used!
#     ---------------------- eid2types: not used!
#     ---------------------- type2eids: not used!
#     ---------------------- eidAndType2strength: not used!
#
# It returns a tuple of the following four items, each sorted in decreasing order of its secondary values:
#     (1) Pattern Precision
#     (2) Pattern Recall
#     (3) Tuples Precision
#     (4) Tuples Recall

# We have:
#     (1) the search engine S (Whoosh)
#     (2) the extractor E (Spacy-based, which takes the name of the file to be read, and patterns used to extract)
#     (3) the prDualRank() function, as described above
#    
# Using these, we code patternSearch() and tupleExtraction()

In [None]:
# S = whoosh
# E = extractor by Shuhan
def tupleExtraction(P_s_recall, P_s_precision, P_e_recall, P_e_precision, k_width, k_depth, k_ext, k_cand):
    """Search with the best search patterns, extract candidates, and rank them.

    Steps:
      1. Keep the ``k_width`` search patterns with the highest F-score and run
         each one against ``index``, collecting the hits.
      2. Keep the ``k_ext`` extraction patterns with the highest F-score and
         apply ``extractor(document, pattern)`` to every collected hit.
      3. Score every distinct extracted phrase with the F-score of the
         ``(precision, recall)`` pair returned by
         ``get_tuple_precision_recall(phrase, hits_for_phrase)``.
      4. Return the ``k_cand`` best-scoring phrases.

    Parameters:
        P_s_recall:    mapping of search pattern -> recall.
        P_s_precision: mapping of search pattern -> precision; must contain
                       every key of ``P_s_recall``.
        P_e_recall:    mapping of extraction pattern -> recall.
        P_e_precision: mapping of extraction pattern -> precision; must
                       contain every key of ``P_e_recall``.
        k_width: number of search patterns to use.
        k_depth: ``limit`` passed to every index search.
        k_ext:   number of extraction patterns to use.
        k_cand:  number of candidate phrases to return.

    Returns:
        A list of at most ``k_cand`` phrases, highest F-score first. Ties keep
        the order in which the phrases were first extracted.

    Raises:
        KeyError: if a precision mapping lacks a pattern present in the
            matching recall mapping.

    Relies on ``index``, ``QueryParser``, ``extractor`` and
    ``get_tuple_precision_recall`` being defined elsewhere in the notebook.
    """
    search_patterns = _top_k_by_fscore(P_s_recall, P_s_precision, k_width)
    extraction_patterns = _top_k_by_fscore(P_e_recall, P_e_precision, k_ext)

    # The parser depends only on the schema, so build it once.
    parser = QueryParser("documents", schema=index.schema)

    # Keep one searcher open for the whole pipeline: search results are
    # consumed by extractor() and get_tuple_precision_recall(), and they must
    # not be touched after their searcher has been closed.
    with index.searcher() as searcher:
        D_search = []
        for pattern in search_patterns:
            D_search.extend(searcher.search(parser.parse(pattern), limit=k_depth))

        candidates = []
        for pattern in extraction_patterns:
            for document in D_search:
                candidates.extend(extractor(document, pattern))

        candidates_with_fscore = {}
        for phrase in candidates:
            # A phrase extracted several times only needs to be scored once.
            if phrase in candidates_with_fscore:
                continue
            D_alt = searcher.search(parser.parse(phrase), limit=k_depth)
            precision, recall = get_tuple_precision_recall(phrase, D_alt)
            candidates_with_fscore[phrase] = _f_score(precision, recall)

    ranked_candidates = sorted(
        candidates_with_fscore, key=candidates_with_fscore.__getitem__, reverse=True
    )
    return ranked_candidates[:k_cand]


def _f_score(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 when both are 0)."""
    total = precision + recall
    if total == 0:
        # 2pr / (p + r) is undefined here; treat it as the worst score.
        return 0.0
    return (2 * precision * recall) / total


def _top_k_by_fscore(recall_by_key, precision_by_key, k):
    """Return the first k keys of recall_by_key, ranked by F-score, best first."""
    scores = {
        key: _f_score(precision_by_key[key], recall)
        for key, recall in recall_by_key.items()
    }
    return sorted(scores, key=scores.__getitem__, reverse=True)[:k]
    