In [1]:
#Load the XML file holding the sentences for the lexical-substitution task.
import xml.etree.ElementTree as ET
tree = ET.parse("lexsub_trial.xml")
root = tree.getroot()

#Keep only the children of the root whose 'item' attribute marks a noun head (suffix '.n').
noun_stuff = [lexelt for lexelt in root if lexelt.attrib['item'].endswith('.n')]

In [2]:
#NLTK resources used throughout: WordNet (wn), SemCor (sc), a word tokenizer (wt),
#an English stopword collection and a WordNet lemmatizer (wnl).
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import semcor as sc
from nltk.tokenize import word_tokenize as wt
from nltk.corpus import stopwords
#NOTE: this rebinds the name 'stopwords' from the corpus reader to the English stopword
#collection it returns; later cells use it only for membership tests.
stopwords = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [3]:
#Collect the distinct lemmatised, lower-cased head words, keeping first-seen order.
heads_list = []
for lexelt in noun_stuff:
    for instance in lexelt:
        word = wnl.lemmatize(instance[0][0].text.lower())
        if word in heads_list:
            continue
        heads_list.append(word)

#Gather the names of every noun synset WordNet lists for each head word.
synsets = []
for head_word in heads_list:
    synsets.extend(s.name() for s in wn.synsets(head_word, pos=wn.NOUN))

In [4]:
def get_sense_dict(synsets):
    """Map each synset of interest to the SemCor words that co-occur with it.

    Every sentence of every SemCor file is scanned. Whenever a sense-tagged
    word resolves to a synset name contained in ``synsets``, the words of
    that sentence are appended to that synset's entry, leaving out any word
    found in the global ``heads_list``.

    Parameters:
        synsets: iterable of WordNet synset names (strings) to collect context for.

    Returns:
        dict mapping synset name -> list of context words (duplicates are kept).

    Reads the globals ``sc`` (SemCor reader), ``wn`` (WordNet reader) and
    ``heads_list``.
    """
    wanted = set(synsets) #set lookup instead of scanning a list for every tagged word
    lookup = {}

    for file_id in sc.fileids():
        sents = sc.xml(file_id).findall('context/p/s') #sentences of the file, in xml form
        sents_text = sc.sents(file_id) #sentences of the file, as lists of word strings

        for i, sent in enumerate(sents):
            #Iterate the element directly: Element.getchildren() was removed in Python 3.9.
            for wordform in sent:
                lexsn = wordform.get('lexsn')
                lemma = wordform.get('lemma')
                if not lexsn or not lemma: #only words carrying a lemma and a lex_sense can be resolved
                    continue

                try: #resolve the sense key to a synset name
                    s = wn.lemma_from_key(lemma + '%' + lexsn).synset().name()
                except Exception: #key not resolvable: construct a synset name from the other attributes
                    pos = wordform.get('pos')
                    wnsn = wordform.get('wnsn')
                    #NOTE(review): this name can only match if 'pos' holds a WordNet-style tag
                    #such as 'n' - confirm against the SemCor markup.
                    try:
                        s = '%s.%s.%02d' % (lemma, pos, int(wnsn))
                    except (TypeError, ValueError): #wnsn missing or not a plain integer
                        s = '%s.%s.%s' % (lemma, pos, wnsn)

                if s in wanted:
                    #Add the rest of the sentence (minus head words) to this synset's context.
                    lookup.setdefault(s, []).extend(
                        w for w in sents_text[i] if w not in heads_list)
    return lookup

In [5]:
#Generate the dictionary of synsets and contexts from semcor.
#This scans every SemCor file, so it is built once here; get_thesaurus_stuff reads
#sense_dict as a global when depth > 3.
sense_dict = get_sense_dict(synsets)

In [194]:
def _context_words(instance):
    """Return the lower-cased, alphabetic, non-stopword tokens either side of the head word."""
    before = instance[0].text or '' #empty when the head word opens the sentence
    after = instance[0][0].tail or '' #empty when the head word closes the sentence
    tokens = wt(' '.join((before, after)))
    lowered = (t.lower() for t in tokens if t.isalpha())
    #Lower-case before the stopword test so capitalised stopwords ('The') are pruned too.
    return [t for t in lowered if t not in stopwords]


def get_replacement(instance, syn_approach=1, depth=1, lemm_approach=1):
    """Return a replacement word for the head word of a sentence.

    Parameters:
        instance: XML element whose first child is the context element; the
            context element's first child is the head word to be replaced.
        syn_approach: 1 = take the first synset WordNet lists as the sense;
            anything else = use the sentence context to identify the sense
            (via get_synset).
        depth: how much material goes into each synset's gloss; passed to
            get_synset and on to get_thesaurus_stuff.
            1 = description and examples of the synset alone;
            2 = plus hyponym descriptions and examples;
            3 = plus hypernym descriptions and examples;
            4 = plus words from SemCor sentences containing the synset;
            5 = synset description and examples plus SemCor words only.
        lemm_approach: how the replacement lemma is chosen; passed to get_lemma.
            1 = first alternative lemma WordNet gives for the synset;
            2 = most frequently occurring lemma of the alternatives;
            3 = lemma with the highest distributional similarity to the target word.

    Returns:
        The replacement word, or None when the head word has no noun synsets,
        or when neither the chosen synset nor its hypernyms offer an alternative.
    """
    #Treat a lemmatised version of the head as the word to be replaced, and get the word's synsets.
    word = wnl.lemmatize(instance[0][0].text.lower())
    syns = wn.synsets(word, wn.NOUN)
    if not syns: #no noun sense to work with
        return None

    #The context is built up front: the hypernym fallback below needs it whichever syn_approach is chosen.
    context = _context_words(instance)

    if syn_approach == 1:
        synset = syns[0]
    else:
        synset = get_synset(syns, context, depth)

    #Given the chosen synset, find an alternative lemma.
    lemma = get_lemma(word, synset, lemm_approach)

    #If no lemma is returned (the only lemma of the synset was the original word),
    #use the hypernyms of the sense to find a replacement word.
    if not lemma:
        hyper_syns = synset.hypernyms()
        if not hyper_syns: #nothing above this sense to fall back on
            return None

        if len(hyper_syns) > 1: #several hypernyms: try to identify which one is best
            synset = get_synset(hyper_syns, context, depth)
        else: #otherwise just use the single hypernym as the new synset
            synset = hyper_syns[0]

        lemma = get_lemma(word, synset, lemm_approach)

    return lemma

In [195]:
def get_synset(synsets, context, depth):
    """Pick the synset whose gloss shares the most words with the context.

    Follows the simplified Lesk idea: each candidate's gloss (built by
    get_thesaurus_stuff) is compared with the words surrounding the target.

    Parameters:
        synsets: the candidate WordNet synsets for the target word.
        context: the words of the sentence the target word appears in.
        depth: how much material goes into each gloss; passed to get_thesaurus_stuff.

    Returns:
        The first candidate with the strictly highest overlap, or the first
        candidate overall when no candidate overlaps with the context.
    """
    top_score = 0
    winner = None

    for candidate in synsets:
        gloss_words = get_thesaurus_stuff(candidate, depth)
        #Every context token found in the gloss counts once towards the overlap.
        overlap = sum(1 for token in context if token in gloss_words)
        if overlap > top_score:
            top_score, winner = overlap, candidate

    if not winner: #no overlap anywhere: default to the first WordNet synset
        winner = synsets[0]

    return winner

In [216]:
def _gloss_words(synset):
    """Return the lower-cased, alphabetic, non-stopword tokens of a synset's definition and examples."""
    tokens = wt(synset.definition()) + wt(' '.join(synset.examples()))
    lowered = (t.lower() for t in tokens if t.isalpha())
    #Lower-case before the stopword test so capitalised stopwords ('The') are pruned too.
    return [t for t in lowered if t not in stopwords]


def get_thesaurus_stuff(synset, depth):
    """Build the bag of gloss words for a synset.

    Parameters:
        synset: the WordNet synset in question, from get_synset.
        depth: how much material to include.
            1 = definition and examples of the synset alone;
            2 = plus definitions and examples of its hyponyms;
            3 = plus definitions and examples of its hypernyms;
            4 = plus the words stored for the synset in the global sense_dict;
            5 = the synset's own definition and examples plus sense_dict words only.

    Returns:
        A list of lower-cased words with stopwords removed (duplicates are kept).
    """
    gloss = _gloss_words(synset)

    if 1 < depth < 5: #add hyponym material
        for hyponym in synset.hyponyms():
            gloss += _gloss_words(hyponym)

    if 2 < depth < 5: #add hypernym material
        #Was synset.synonyms(), which Synset does not provide; hypernyms() is what this branch is documented to use.
        for hypernym in synset.hypernyms():
            gloss += _gloss_words(hypernym)

    if depth > 3: #add words from SemCor sentences tagged with this synset, if any were collected
        semcor_words = (w.lower() for w in sense_dict.get(synset.name(), []))
        gloss += [w for w in semcor_words if w not in stopwords]

    return gloss

In [82]:
#Import libraries and load pre-trained vectors for the distributional similarity measure in get_lemma.
import gensim
from gensim.models import Word2Vec
#Bare filename: the binary vector file is expected in the working directory.
filename="GoogleNews-vectors-negative300.bin"
#mymodel is read as a global by get_lemma whenever its approach is neither 1 nor 2.
mymodel = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)

In [212]:
def get_lemma(word, synset, approach):
    """Choose a lemma of the synset to stand in for the target word.

    Parameters:
        word: the target word, from get_replacement.
        synset: the synset selected as matching the target word's sense.
        approach: 1 = first alternative lemma of the synset;
            2 = alternative lemma with the highest WordNet count;
            any other value = alternative lemma most similar to the target
            word according to the global word-vector model ``mymodel``.

    Returns:
        The chosen lemma name in lower case, or None when the synset has no
        lemma other than the target word. When approach 2 or the similarity
        approach finds no lemma with a positive score, the first alternative
        is returned.
    """
    #Pair each lemma object with its lower-cased name and drop the target word itself.
    #The comparison is case-insensitive for every approach, so 'Dog' is never offered for 'dog'.
    named = ((lem, str(lem.name()).lower()) for lem in synset.lemmas())
    candidates = [(lem, name) for lem, name in named if name != word]

    if not candidates: #no alternative lemmas
        return None

    first_name = candidates[0][1]
    if approach == 1:
        return first_name

    best = None
    if approach == 2: #highest WordNet count wins
        max_count = 0
        for lem, name in candidates:
            count = lem.count()
            if count > max_count:
                best, max_count = name, count
    else: #highest similarity to the target word wins
        max_sim = 0
        for _, name in candidates:
            try:
                sim = mymodel.similarity(word, name)
            except KeyError: #either word is missing from the model's vocabulary
                sim = 0
            if sim > max_sim:
                best, max_sim = name, sim

    #Nothing scored above zero: fall back to the first alternative.
    return best if best is not None else first_name