In [98]:
import nltk
from nltk import ne_chunk, pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
stemmer = PorterStemmer()
wn_lem = WordNetLemmatizer() # wn_lem.lemmatize()

"""
    Input: (data, word_range, which_type)
        data - one sentence
        word_range - a nested list of clusters with their ranges (ie. [[[0,0],[0,19]],[[19,19],[59,59]]])
        which_type - one of ("name", "pro", "term", "all"/nothing)

    return case: 
        1. if which_type == "name" and the function returns True >> THERE IS NAME LINK >> REMOVE
        2. if which_type == "pro" and function returns True >> ALL LINKS ARE PRONOUN >> REMOVE
        3. if which_type == "term" and function returns True >> THERE IS GENDER TERM LINK >> REMOVE
        4. if which_type == "all" or not given and function returns True >> one of three remove cases hold >> REMOVE
        5. if the function returns False >> NO NAME LINK and ALL LINKS ARE NOT PRONOUNS >> KEEP

"""


def check_remove(data, tok, word_range, which_type="all"):
    gen_fam_term = ["father", "mother", "son", "daughter", "husband", "wife", "brother", "sister",
                    "grandfather", "grandmother", "grandson", "granddaughter", "uncle", "aunt", "nephew", "niece"]
    gen_term = ["female", "male", "woman", "man", "girl", "boy"]
    pro_lst = ["he", "she", "him", "her", "his", "hers", "himself", "herself"]

    result = []
    #tok = word_tokenize(data)
    for cluster in word_range:
        # print(name_lst)
        #print("#### ",word_range)
        if (which_type == "name"):  # check if the cluster has name link
            #print("&&&&___NAME: ", data)
            #print("&&&&___NAME_RANGE: ", cluster)

            # Check all the instances of human names in the sentence and build "name_lst"
            name_lst = []
            for sent_chunk in ne_chunk(pos_tag(tok)):
                if hasattr(sent_chunk, 'label'):
                    if (sent_chunk.label() == "PERSON"):
                        name_lst.append(' '.join(c[0] for c in sent_chunk))
                        (print("TESTING", c[0]) for c in sent_chunk)
            result.append(any([((' '.join(w for w in tok[c[0]:c[1] + 1])) in name_lst)
                               for c in cluster]))

        elif (which_type == "pro"):  # check if the cluster has only pronoun links
            #print("&&&&___PRO: ", data)
            #print("&&&&___PRO_RANGE: ", cluster)
            #print(tok)
            #[print(c) for c in cluster]
            result.append(all([((c[0] == c[1]) and (tok[c[0]]).lower() in pro_lst) for c in cluster]))


        elif (which_type == "term"):  # check if the cluster has gendered term
            #print("&&&&___TERM: ", data)
            #print("&&&&___TERM_CLUSTER: ", cluster)

            for c in cluster:
                for i in c:
                    word_disam = lesk(tok, tok[i], 'n')  # check definition assigned from word disambiguation
                    # print(word_disam)
                    # if the word is a valid English word check if it's person word and the definition contains gendered meaning
                    if (word_disam is not None) and (word_disam.lexname() == "noun.person"):
                        # now looking at all nouns in the range but after ACL we can use dependency parsing and only look at the head noun
                        # print(word_disam.definition())
                        result.append(any([wn_lem.lemmatize(w) in (gen_fam_term + gen_term + pro_lst)
                                           and (x in (gen_fam_term + gen_term + pro_lst)
                                                for x in wn._morphy(w, wn.NOUN))
                                           # checks all possible morphological functions
                                           for w in word_tokenize(word_disam.definition())]
                                          + [tok[i] in (gen_fam_term + gen_term + pro_lst)]))
                    else:
                        result.append(False)
                else:
                    continue
        else:  # check all conditions at the same time
            result.append(any([check_remove(data, tok, word_range, which_type="name"),
                               check_remove(data, tok, word_range, which_type="pro"),
                               check_remove(data, tok, word_range, which_type="term")]))

    return any(result)


def filter_by_corpus(corpus, tok_corpus, coref_ranges, prev_result, which_type="all"):
    this_result = []
    for i in range(0, len(corpus)):
        if prev_result[i] == 1:
            # print(corpus[i],coref_ranges[i])
            if check_remove(corpus[i],tok_corpus[i], coref_ranges[i], which_type):
                this_result.append(0)
            else:
                this_result.append(1)
        else:
            this_result.append(0)
    return this_result

In [99]:
# use case: "name" filter isn't working correctly (should be True when it returns False)
sent1 = "Paul Lukas plays the very earnest and decent head of his family."
tok1 = ['Paul', 'Lukas', 'plays', 'the', 'very', 'earnest', 'and', 'decent', 'head', 'of', 'his', 'family', '.']
range1 = [[[0, 1], [10, 10]]]
check_remove(sent1, tok1, range1, "name")

False

In [100]:
# same use case as above (another example): "name" filter isn't working correctly (should be True when it returns False)
sent1_1="All fines that have been given to us unjustly and against the law of the land, and all fines that we have exacted unjustly, shall be entirely remitted or the matter decided by a majority judgement of the twenty-five barons referred to below in the clause for securing the peace together with Stephen, archbishop of Canterbury, if he can be present, and such others as he wishes to bring with him."
tok1_1=['All', 'fines', 'that', 'have', 'been', 'given', 'to', 'us', 'unjustly', 'and', 'against', 'the', 'law', 'of', 'the', 'land', ',', 'and', 'all', 'fines', 'that', 'we', 'have', 'exacted', 'unjustly', ',', 'shall', 'be', 'entirely', 'remitted', 'or', 'the', 'matter', 'decided', 'by', 'a', 'majority', 'judgement', 'of', 'the', 'twenty', '-', 'five', 'barons', 'referred', 'to', 'below', 'in', 'the', 'clause', 'for', 'securing', 'the', 'peace', 'together', 'with', 'Stephen', ',', 'archbishop', 'of', 'Canterbury', ',', 'if', 'he', 'can', 'be', 'present', ',', 'and', 'such', 'others', 'as', 'he', 'wishes', 'to', 'bring', 'with', 'him', '.']
range1_1=[[[7, 7], [21, 21]], [[56, 60], [63, 63], [72, 72], [77, 77]]]
check_remove(sent1_1, tok1_1, range1_1, "name")

False

In [101]:
# use case: 2 gender generalization clusters (highlight clusters in two different colors) 
sent2 = "When the maker of a lost note pays the amount to the original owner, he should receive from him what is known as a bond of indemnity."
tok2 = ['When', 'the', 'maker', 'of', 'a', 'lost', 'note', 'pays', 'the', 'amount', 'to', 'the', 'original', 'owner', ',', 'he', 'should', 'receive', 'from', 'him', 'what', 'is', 'known', 'as', 'a', 'bond', 'of', 'indemnity', '.']
range2 = [[[1, 6], [15, 15]], [[11, 13], [19, 19]]]
check_remove(sent2, tok2, range2, "all")

False

In [102]:
# use case: one cluster is gender generalization and the other cluster is not
sent3 = "A nurse must take care of her patients and the boy likes to play with his toys."
tok3 = ['A', 'nurse', 'must', 'take', 'care', 'of', 'her', 'patients', 'and', 'the', 'boy', 'likes', 'to', 'play', 'with', 'his', 'toys', '.']
range3=[[[0, 1], [6, 6]], [[9, 10], [15, 15]]]
check_remove(sent3, tok3, range3, "all")


True

In [103]:
# use case: a cluster contains "the king" and should be removed by "term" (returns False when should be True)
sent4 = "They sought him through all the ranks, and every time they met one of these officers they thought they had come face to face with the king."
tok4 = ['They', 'sought', 'him', 'through', 'all', 'the', 'ranks', ',', 'and', 'every', 'time', 'they', 'met', 'one', 'of', 'these', 'officers', 'they', 'thought', 'they', 'had', 'come', 'face', 'to', 'face', 'with', 'the', 'king', '.']
range4 = [[[0, 0], [11, 11], [17, 17], [19, 19]], [[2, 2], [26, 27]]]
check_remove(sent4, tok4, range4, "term")

False

In [104]:
# same as above use case: a cluster contains "the queen" and should be removed by "term" (returns False when should be True)

sent5 = "This I have repeatedly witnessed in my observing hives, and admired the sagacity of the queen in economizing her necessary work after this fashion, instead of laboriously depositing the eggs in cells where they are not wanted."
tok5 = ['This', 'I', 'have', 'repeatedly', 'witnessed', 'in', 'my', 'observing', 'hives', ',', 'and', 'admired', 'the', 'sagacity', 'of', 'the', 'queen', 'in', 'economizing', 'her', 'necessary', 'work', 'after', 'this', 'fashion', ',', 'instead', 'of', 'laboriously', 'depositing', 'the', 'eggs', 'in', 'cells', 'where', 'they', 'are', 'not', 'wanted', '.']
range5 = [[[1, 1], [6, 6]], [[15, 16], [19, 19]]]
check_remove(sent5, tok5, range5, "term")


False