In [1]:
import random
import nltk
from nltk.corpus import semcor, wordnet, stopwords
from nltk.corpus.reader.wordnet import Synset
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score

In [None]:
nltk.download('all')

In [3]:
def semcor_extraction():
    sentences = []
    for i in range(0, 50):
        nouns = []
        senpos = semcor.tagged_sents()[i]
        sentag = semcor.tagged_sents(tag='sem')[i]
        for w in range(0, len(senpos)):
            if str(senpos[w].label()) == "NN":
                nouns.append(sentag[w])
        random_word = random.choice(nouns)
        w = random_word.leaves()[0]
        for j in range(1, len(random_word.leaves())):
            w = w + " " + random_word.leaves()[j]
        sentences.append({
            "sentence":  (" ".join(semcor.sents()[i])),
            "word": w,
            "sense": random_word.label().synset()
        })
    return sentences


In [4]:
def simplified_lesk(word: str, sentence: str) -> Synset:
    """
    Computes the max_overlap to understand what is the best sense
    :param word: word to dissmbiguate
    :param sentence: sentence in which word appears
    :return: Synset that maximizes the overlap
    """
    synsets = wordnet.synsets(word)

    try:
        lemmatizer = WordNetLemmatizer()

        best_sense = wordnet.synsets(word)[0]
        max_overlap = 0
        context = set(lemmatizer.lemmatize(word) for word in sentence.split(" "))

        for sense in synsets:
            signature = set(lemmatizer.lemmatize(word)for word in sense.definition().split(" "))
            for example in sense.examples():
                signature.union(set(lemmatizer.lemmatize(word)for word in example.split(" ")))

            overlap = len(signature.intersection(context))
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense

        return best_sense

    except:
        return Synset('None')

In [5]:
def removes_stopwords_lesk(word: str, sentence: str) -> Synset:
    """
    Computes the max_overlap to understand what is the best sense, eliminating the stopwords
    :param word: word to dissmbiguate
    :param sentence: sentence in which word appears
    :return: Synset that maximizes the overlap
    """
    stopwords_set = set(stopwords.words('english'))
    synsets = wordnet.synsets(word)

    try:
        lemmatizer = WordNetLemmatizer()

        best_sense = wordnet.synsets(word)[0]
        max_overlap = 0
        context = set(lemmatizer.lemmatize(word)for word in sentence.split(" "))

        for sense in synsets:
            signature = set(lemmatizer.lemmatize(word)for word in sense.definition().split(" "))
            for example in sense.examples():
                signature.union(set(lemmatizer.lemmatize(word)for word in example.split(" ")))

            signature.difference(stopwords_set)

            overlap = len(signature.intersection(context))
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense

        return best_sense

    except:
        return Synset('None')

In [6]:
def extended_context_lesk(word: str, sentence: str) -> Synset:
    """
    Computes the max_overlap to understand what is the best sense, using hypernyms and hyponyms
    :param word: the word to be disambiguated
    :param sentence: input sentence which contains param 'word'
    :return: best_sense, which is a Wordnet Synset, for param 'word'
    """
    stopwords_set = set(stopwords.words('english'))
    synsets = wordnet.synsets(word)

    try:
        lemmatizer = WordNetLemmatizer()

        best_sense = wordnet.synsets(word)[0]
        max_overlap = 0
        context = set(lemmatizer.lemmatize(word)for word in sentence.split(" "))

        for sense in synsets:
            signature = set(lemmatizer.lemmatize(word)for word in sense.definition().split(" "))

            for example in sense.examples():
                signature.union(set(lemmatizer.lemmatize(word)for word in example.split(" ")))

            for hypernym in sense.hypernyms():
                signature = signature.union(set(lemmatizer.lemmatize(word)for word in hypernym.definition().split(" ")))

            for hyponym in sense.hyponyms():
                signature = signature.union(set(lemmatizer.lemmatize(word)for word in hyponym.definition().split(" ")))

            signature.difference(stopwords_set)

            overlap = len(signature.intersection(context))
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense

        return best_sense

    except:
        return Synset('None')

In [7]:
def find_senses(data):
    data["lesk"] = {}
    data["lesk"]["simple"] = simplified_lesk(data["word"], data["sentence"])
    data["lesk"]["stopwords"] = removes_stopwords_lesk(data["word"], data["sentence"])
    data["lesk"]["extended"] = extended_context_lesk(data["word"], data["sentence"])
    return data

In [8]:
semcore_extracted = semcor_extraction()

In [9]:
final_data = list(map(find_senses, semcore_extracted))

In [10]:
def nop(elem):
    return elem if elem is not None else 'None'

In [11]:
"Simplified Lesk Accuracy: " + str(accuracy_score(
    list(map(lambda data: nop(data["lesk"]["simple"].name()), final_data)),
    list(map(lambda data: nop(data["sense"].name()), final_data))
))

'Simplified Lesk Accuracy: 0.48'

In [13]:
"Stopwords Lesk Accuracy: " + str(accuracy_score(
    list(map(lambda data: nop(data["lesk"]["stopwords"].name()), final_data)), 
    list(map(lambda data: nop(data["sense"].name()), final_data))
))

'Stopwords Lesk Accuracy: 0.48'

In [14]:
"Extended Lesk Accuracy: " + str(accuracy_score(
    list(map(lambda data: nop(data["lesk"]["extended"].name()), final_data)), 
    list(map(lambda data: nop(data["sense"].name()), final_data))
))

'Extended Lesk Accuracy: 0.48'