In [174]:
import nltk
from nltk import ne_chunk, pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.wsd import lesk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
# Porter stemmer instance; not referenced by any of the cells shown in this section.
stemmer = PorterStemmer()
# Shared WordNet lemmatizer; check_remove() calls wn_lem.lemmatize() on candidate words.
wn_lem = WordNetLemmatizer()

"""
    Input: (data, word_range, which_type)
        data - one sentence
        word_range - a nested list of clusters with their ranges (ie. [[[0,0],[0,19]],[[19,19],[59,59]]])
        which_type - one of ("name", "pro", "term", "all"/nothing)
        
    return case: 
        1. if which_type == "name" and the function returns True >> THERE IS NAME LINK >> REMOVE
        2. if which_type == "pro" and function returns True >> ALL LINKS ARE PRONOUN >> REMOVE
        3. if which_type == "term" and function returns True >> THERE IS GENDER TERM LINK >> REMOVE
        4. if which_type == "all" or not given and function returns True >> one of three remove cases hold >> REMOVE
        5. if the function returns False >> NO NAME LINK and ALL LINKS ARE NOT PRONOUNS >> KEEP
                   
"""

# Vocabulary used by check_remove(); defined once instead of on every call.
_PRONOUNS = frozenset(["he", "she", "him", "her", "his", "hers", "himself", "herself"])
_GENDERED_TERMS = _PRONOUNS | frozenset([
    # gendered family terms
    "father", "mother", "son", "daughter", "husband", "wife", "brother", "sister",
    "grandfather", "grandmother", "grandson", "granddaughter", "uncle", "aunt", "nephew", "niece",
    # general gender terms
    "female", "male", "woman", "man", "girl", "boy",
])


def _is_gendered_word(word):
    """Return True if `word`, or any WordNet noun base form of it, is a gendered term."""
    lowered = word.lower()
    # The lemmatizer returns a single form, which can miss the gendered one;
    # _morphy("men") yields both "men" and "man", so every candidate form is checked.
    candidates = {lowered, wn_lem.lemmatize(lowered)}
    candidates.update(wordnet._morphy(lowered, wordnet.NOUN))
    return not candidates.isdisjoint(_GENDERED_TERMS)


def _has_name_link(tokens, clusters):
    """Return True if any mention span exactly matches a PERSON entity of the sentence."""
    person_names = set()
    for chunk in ne_chunk(pos_tag(tokens)):
        # Named-entity subtrees expose label(); plain (word, tag) leaves do not.
        if hasattr(chunk, 'label') and chunk.label() == "PERSON":
            person_names.add(' '.join(leaf[0] for leaf in chunk))
    return any(' '.join(tokens[mention[0]:mention[1] + 1]) in person_names
               for cluster in clusters
               for mention in cluster)


def _has_pronoun_only_cluster(tokens, clusters):
    """Return True if some non-empty cluster consists solely of single-token pronoun mentions."""
    return any(
        # An empty cluster has no links at all, so it must not count as "all pronouns".
        len(cluster) > 0
        and all(mention[0] == mention[1] and tokens[mention[0]].lower() in _PRONOUNS
                for mention in cluster)
        for cluster in clusters
    )


def _has_gendered_term_link(tokens, clusters):
    """Return True if any token inside any mention is a gendered person noun."""
    for cluster in clusters:
        for mention in cluster:
            # Walk every token of the inclusive [start, end] span, not just its endpoints.
            for idx in range(mention[0], mention[1] + 1):
                # Lesk picks the noun sense that best overlaps with the sentence context.
                sense = lesk(tokens, tokens[idx], 'n')
                if sense is None or sense.lexname() != "noun.person":
                    continue
                # Gendered if the token itself is a gendered term, or if the chosen
                # sense's definition mentions one.
                if _is_gendered_word(tokens[idx]) or any(
                        _is_gendered_word(def_word)
                        for def_word in word_tokenize(sense.definition())):
                    return True
    return False


def check_remove(data, word_range, which_type="all"):
    """Decide whether a sentence should be removed based on its coreference clusters.

    Parameters
    ----------
    data : str
        One sentence; it is tokenised with ``word_tokenize``.
    word_range : list
        Nested list of clusters, each a list of inclusive ``[start, end]`` token
        ranges, e.g. ``[[[0, 0], [0, 19]], [[19, 19], [59, 59]]]``.
    which_type : str, optional
        ``"name"`` - True if a mention span equals a PERSON named entity.
        ``"pro"``  - True if some non-empty cluster contains only single-token pronouns.
        ``"term"`` - True if a token inside a mention is a person noun that is,
        or whose WordNet definition mentions, a gendered term.
        Any other value (default ``"all"``) - True if at least one of the three holds.

    Returns
    -------
    bool
        True means "remove the sentence", False means "keep it".
        An empty ``word_range`` always yields False.

    Raises
    ------
    IndexError
        From the "pro" and "term" checks when a mention index lies beyond the
        tokenised sentence.
    """
    # Nothing to inspect: skip tokenisation and every check.
    if not word_range:
        return False

    tokens = word_tokenize(data)

    if which_type == "name":
        return _has_name_link(tokens, word_range)
    if which_type == "pro":
        return _has_pronoun_only_cluster(tokens, word_range)
    if which_type == "term":
        return _has_gendered_term_link(tokens, word_range)

    # "all" (or any unrecognised value): run each check once over the whole
    # sentence, stopping at the first one that asks for removal.
    return (_has_name_link(tokens, word_range)
            or _has_pronoun_only_cluster(tokens, word_range)
            or _has_gendered_term_link(tokens, word_range))

def filter_by_corpus(corpus, coref_ranges, prev_result, which_type="all"):
    """Apply check_remove() to every sentence that survived a previous filter.

    Parameters
    ----------
    corpus : sequence of sentences, indexed by position.
    coref_ranges : sequence of cluster lists, aligned with ``corpus``.
    prev_result : sequence of flags aligned with ``corpus``; only entries equal
        to 1 are re-examined.
    which_type : forwarded unchanged to check_remove().

    Returns
    -------
    list of int
        One flag per sentence: 1 if the sentence was still kept before and
        check_remove() does not ask for removal, otherwise 0.
    """
    keep_flags = []
    for idx in range(len(corpus)):
        # Sentences already dropped by an earlier pass stay dropped.
        if prev_result[idx] != 1:
            keep_flags.append(0)
            continue

        # Echo the sentence and its clusters before checking them.
        print(corpus[idx], coref_ranges[idx])
        should_remove = check_remove(corpus[idx], coref_ranges[idx], which_type)
        keep_flags.append(0 if should_remove else 1)
    return keep_flags

In [175]:
# Test sentence for the gendered-term check; later cells pair it with the cluster [[0, 0], [15, 15]].
sent_A = "Men more often than not connect through indicators of sexual access just as much as his do through sex."

In [176]:
# Fixtures for filter_by_corpus(): four sentences, their coreference clusters, and the
# flags left by a previous filtering pass (all three lists are index-aligned).
sent = "WASHINGTON-- In the wake of a string of abuses by New York police officers in the 1990s, she, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement she."
sent1 = "SOME RANDOM SENT"
sent2 = 'Men went to his bank to deposit money.'
test_sent = [sent,sent1,sent2,sent_A]
# One cluster list per sentence; each cluster is a list of [start, end] token ranges.
# Original note: "We want [f,0,t,t,]" - presumably the desired check_remove() outcome per
# sentence (False, skipped because prev_result is 0, True, True); TODO confirm.
ranges = [[[[0,0],[0,19]],[[19,19],[59,59]]],[],[[[0,0],[5,5]]],[[[0, 0], [15, 15]]]]
# 1 = sentence is still kept and must be re-checked, 0 = already removed.
prev_result = [1,0,1,1]

In [177]:
# Gendered-term check on sent_A: one cluster linking token 0 and token 15.
check_remove(sent_A, [[[0, 0], [15, 15]]], "term")

True

In [178]:
# Not the last expression of the cell, so the value of this call is not displayed.
check_remove(sent, [[[0,0],[0,19]],[[19,19],[59,59]]], "term")

# Disabled experiment. NOTE(review): `s23` looks like a typo for `23`.
#check_remove(sent, [[[21,s23],[19,19]],[[19,19],[59,59]]])
# Last expression of the cell: the displayed output belongs to this call.
check_remove(sent2, [[[0,0],[5,5]]], "term")
#check_remove(sent2, [[[1,1],[4,4]]], "pro")

True

In [183]:
# Run the gendered-term filter over the fixtures; sentences whose prev_result flag is 0 are skipped.
filter_by_corpus(test_sent,ranges,prev_result,"term")

WASHINGTON-- In the wake of a string of abuses by New York police officers in the 1990s, she, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement she. [[[0, 0], [0, 19]], [[19, 19], [59, 59]]]
Men went to his bank to deposit money. [[[0, 0], [5, 5]]]
Men more often than not connect through indicators of sexual access just as much as his do through sex. [[[0, 0], [15, 15]]]


[1, 0, 0, 0]

In [101]:
# List every WordNet noun base form of "men". The alias `wn` is never defined in
# the cells shown here, so the imported `wordnet` reader is used instead.
wordnet._morphy("men", wordnet.NOUN)

['men', 'man']

In [16]:
# Look up the WordNet synsets of "landlord".
# NOTE(review): the recorded output below is a definition string rather than a list of
# synsets, so it appears to come from an earlier version of this cell - re-run to refresh.
wordnet.synsets("landlord")

'a manservant who acts as a personal attendant to his employer'

In [85]:
# Sanity check of the shared lemmatizer on a regular plural noun.
wn_lem.lemmatize("days")

'day'