In [55]:
import nltk
nltk.download(['punkt', 'wordnet', 'semcor', 'stopwords', 'averaged_perceptron_tagger'])

from nltk import word_tokenize
from nltk.corpus import semcor
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
import random
import numpy as np
from string import punctuation
from num2words import num2words
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /home/krishna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/krishna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package semcor to /home/krishna/nltk_data...
[nltk_data]   Package semcor is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krishna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/krishna/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/krishna/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [56]:
from gensim.models import KeyedVectors
W2V = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary = True)

In [57]:
EXTRA_SW = [
    "''",
    "'s",
    "``"
]

SW = stopwords.words("english")
SW += [p for p in punctuation]
SW += EXTRA_SW

In [58]:
lemmatizer = WordNetLemmatizer()

In [59]:
def cosineSimilarity(a, b):
    cs = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cs

In [60]:
def isNumber(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [61]:
def n2w(w):
    if isNumber(w) and w.lower() != "infinity" and w.lower() != "nan":
        w = num2words(w)
    return w

In [62]:
def lemmatize(w, tag):
    if tag is None:
        return lemmatizer.lemmatize(w)
    else:
        return lemmatizer.lemmatize(w, tag)

In [63]:
def treebank2wn(ttag):
    if ttag.startswith("N"):
        return wn.NOUN
    # elif ttag.startswith("V"):
    #     return wn.VERB
    # elif ttag.startswith("J"):
    #     return wn.ADJ
    # elif ttag.startswith("R"):
    #     return wn.ADV
    else:
        return None

In [64]:
def clean(tokens):
    tagged = nltk.pos_tag(tokens)
    lemmatized = [lemmatize(w, treebank2wn(tag)) for w, tag in tagged]
    cleaned = [n2w(w) for w in lemmatized if w.lower() not in SW]
    return cleaned

In [65]:
def getVec(w):
    try:
        v = W2V[w]
        return v
    except KeyError:
        return None

In [66]:
def syn2sense(syn):
    s = syn.name()
    return s

In [67]:
# def treebank2wn(ttag):
#     if ttag.startswith("J"):
#         return wn.ADJ
#     elif ttag.startswith("V"):
#         return wn.VERB
#     elif ttag.startswith("N"):
#         return wn.NOUN
#     elif ttag.startswith("R"):
#         return wn.ADV
#     else:
#         return None

In [68]:
def sent2vec(tokens):

    v = 0
    n = 0

    for w in tokens:

        tkns = word_tokenize(w)

        if len(tkns) > 1:
            for t in tkns:
                vt = getVec(t)
                if vt is not None:
                    n += 1
                    v += vt
        else:
            vw = getVec(w)
            if vw is not None:
                n += 1
                v += vw

    if n == 0:
        v = None
    else:
        v /= n

    return v

In [69]:
def parse(d):

    tokens = []
    senses = []

    for e in d:

        if isinstance(e, nltk.tree.Tree):

            lemma = e.label()
            
            if isinstance(lemma, nltk.corpus.reader.wordnet.Lemma):
                synset = lemma.synset()
                sense = syn2sense(synset)
            else:
                sense = None
            
            le = len(e)
            if le == 1:
                w = e[0]
                if isinstance(w, nltk.tree.Tree) or isinstance(w, list):
                    lw = len(w)
                    w = " ".join([w[i] for i in range(lw)])
            else:
                w = " ".join([e[i] for i in range(le)])

        elif isinstance(e, list):
            w = e[0]
            sense = None

        else:
            invtype = type(e)
            raise TypeError(f"Invalid type: {invtype}")

        if w:
            tokens.append(w)
            senses.append(sense)

    return tokens, senses

In [70]:
def getCandidates(w, tag):

    w = w.replace(".", "")
    w = w.replace("-", "")

    tkns = word_tokenize(w)
    if len(tkns) > 1:
        tagged = nltk.pos_tag(tkns)
        tags = [treebank2wn(p[1]) for p in tagged]
        ltkns = [lemmatize(w, t) for w, t in zip(tkns, tags)]
        w = "_".join(ltkns)

    syns = wn.synsets(w, tag)

    if len(syns) == 0:
        w = "_".join(tkns)
        syns = wn.synsets(w, tag)

    sense_vectors = []
    sense_labels = []

    for syn in syns:

        label = syn2sense(syn)

        defn = syn.definition()

        defn = defn.replace("_", " ")
        defn = defn.replace("-", " ")

        tkns = word_tokenize(defn)
        if len(tkns) == 0:
            raise ValueError(f"0 tokens found: {defn}")

        clnd = clean(tkns)
        if len(clnd) < 2:
            clnd = tkns

        sv = sent2vec(clnd)

        if sv is None:
            print(f"Empty sense vector. Word: {w}, Definition: {defn}, Cleaned: {clnd}. Using a random vector as sense.")
            sv = np.random.rand(300,)
        
        sense_vectors.append(sv)
        sense_labels.append(label)

    return sense_vectors, sense_labels

In [71]:
data = semcor.tagged_sents(tag = "sem")

In [72]:
n_total = 0
n_correct = 0
n_samples = 0

true = []
pred = []

for d in data:

    try:

        tokens, senses = parse(d)
        n_tokens = len(tokens)

        tagged = nltk.pos_tag(tokens)
        tags = [treebank2wn(p[1]) for p in tagged]
        tokens = [lemmatize(w, tag) for w, tag in zip(tokens, tags)]

        for i in range(n_tokens):

            w = tokens[i]
            tag = tags[i]
            s_true = senses[i]

            if not isinstance(w, str):
                raise TypeError(f"Invalid type: {type(w)} : {w} : {tokens}")

            if s_true is None:
                continue

            context = tokens.copy()
            del context[i]

            cleaned = clean(context)
            if len(cleaned) < 2:
                cleaned = context
                
            cv = sent2vec(cleaned)

            if cv is None:
                print(f"Empty context vector. Word: {w}, Cleaned: {cleaned}, Tokens: {tokens}. Using a random vector as context.")
                cv = np.random.rand(300,)

            sense_vectors, sense_labels = getCandidates(w, tag)
            n_candidates = len(sense_labels)

            s_pred = None
            if n_candidates == 0:
                sense_vectors, sense_labels = getCandidates(w, None)
                n_candidates = len(sense_labels)
                if n_candidates == 0:
                    s_pred = random.choice(["group.n.01", "person.n.01", "location.n.01"])
            
            best = -1 
            for j in range(n_candidates):
                sv = sense_vectors[j]
                cs = cosineSimilarity(cv, sv)
                if cs > best:
                    best = cs
                    s_pred = sense_labels[j]

            if s_true == s_pred:
                n_correct += 1
            n_total += 1

            true.append(s_true)
            pred.append(s_pred)

    except Exception as e:
        print(f"Error at: {n_samples}")
        print(str(e))
        raise ValueError("Error")

    n_samples += 1

    if n_samples%200 == 0:
        print(f"{n_samples} sentences processed")
        acc = (n_correct/n_total)*100
        print(f"Accuracy: {acc:.4f}")
        print()

200 sentences processed
Accuracy: 41.7108

400 sentences processed
Accuracy: 39.7835

600 sentences processed
Accuracy: 38.9822

800 sentences processed
Accuracy: 38.1238

1000 sentences processed
Accuracy: 38.2491

1200 sentences processed
Accuracy: 37.9501

1400 sentences processed
Accuracy: 37.9943

1600 sentences processed
Accuracy: 38.3436

1800 sentences processed
Accuracy: 38.2648

2000 sentences processed
Accuracy: 38.1458

2200 sentences processed
Accuracy: 38.1071

Empty context vector. Word: Cancer, Cleaned: ['``', "''", '!'], Tokens: ['``', 'Cancer', "''", '!']. Using a random vector as context.
Empty context vector. Word: By no means, Cleaned: ['.'], Tokens: ['By no means', '.']. Using a random vector as context.
Empty context vector. Word: For instance, Cleaned: [':'], Tokens: ['For instance', ':']. Using a random vector as context.
Empty context vector. Word: Death, Cleaned: ['!'], Tokens: ['Death', '!']. Using a random vector as context.
2400 sentences processed
Accurac

In [73]:
pred_sense_set = set(pred)
true_sense_set = set(true)
all_senses = sorted(list(true_sense_set.union(pred_sense_set)))
not_predicted = true_sense_set - pred_sense_set
extra_predicted = pred_sense_set - true_sense_set

In [74]:
acc = accuracy_score(true, pred)
prec = precision_score(true, pred, average = "macro")
rec = recall_score(true, pred, average = "macro")
f1 = f1_score(true, pred, average = "macro")

print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-Score: {f1:.4f}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.3478
Precision: 0.3837
Recall: 0.3739
F1-Score: 0.3488


In [75]:
def predict(sent):

    senses = []
    tokens = word_tokenize(sent)
    tagged = nltk.pos_tag(tokens)
    tags = [treebank2wn(p[1]) for p in tagged]
    tokens = [lemmatize(w, tag) for w, tag in zip(tokens, tags)]
    n_tokens = len(tokens)

    for i in range(n_tokens):

        w = tokens[i]
        tag = tags[i]

        context = tokens.copy()
        del context[i]

        cv = sent2vec(context)

        if cv is None:
            print(f"Empty context vector. Word: {w}, Tokens: {tokens}. Using a random vector as context.")
            cv = np.random.rand(300,)

        sense_vectors, sense_labels = getCandidates(w, tag)
        n_candidates = len(sense_labels)

        s_pred = None
        if n_candidates == 0:
            sense_vectors, sense_labels = getCandidates(w, None)
            n_candidates = len(sense_labels)
            if n_candidates == 0:
                # print(f"No synsets found: {w}")
                s_pred = None

        best = -1 
        for j in range(n_candidates):
            sv = sense_vectors[j]
            cs = cosineSimilarity(cv, sv)
            if cs > best:
                best = cs
                s_pred = sense_labels[j]

        senses.append(s_pred)

    return senses

In [83]:
sents = [
    "The boy crossed the river bank to go to the bank", 
 ]

for sent in sents:
    senses = predict(sent)
    for s in senses:
        if s is not None:
            print(s, ":", wn.synset(s).definition())
    print()

boy.n.02 : a friendly informal reference to a grown man
crossed.a.02 : (of a check) marked for deposit only as indicated by having two lines drawn across it
river.n.01 : a large natural stream of water (larger than a creek)
bank.n.01 : sloping land (especially the slope beside a body of water)
rifle.v.02 : go through in search of something; search through someone's belongings in an unauthorized way
bank.n.01 : sloping land (especially the slope beside a body of water)

