In [2]:
import nltk

In [3]:
import os
from itertools import chain
import re, math
from collections import Counter
import pandas as pd

[nltk_data] Downloading package semcor to /home/krishna/nltk_data...


True

In [4]:
#following codeblockas are adapted from the github repo https://github.com/alvations/pywsd

from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from nltk.corpus import stopwords
from string import punctuation
pywsd_stopwords = [u"'s", u"``", u"`"]
EN_STOPWORDS = set(stopwords.words('english') + list(punctuation) + pywsd_stopwords)

In [5]:
def penn2morphy(penntag) -> str:
    """
    Converts tags from Penn format (input: single string) to Morphy.
    """
    morphy_tag = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

In [6]:
wnl = WordNetLemmatizer()
porter = PorterStemmer()

def lemmatize(ambiguous_word: str, pos: str = None,
              lemmatizer=wnl, stemmer=porter) -> str:
    """
    Tries to convert a surface word into lemma, and if lemmatize word is not in
    wordnet then try and convert surface word into its stem.
    This is to handle the case where users input a surface word as an ambiguous
    word and the surface word is a not a lemma.
    """

    # Try to be a little smarter and use most frequent POS.
    pos = pos if pos else penn2morphy(pos_tag([ambiguous_word])[0][1])
    lemma = lemmatizer.lemmatize(ambiguous_word, pos=pos)
    stem = stemmer.stem(ambiguous_word)
    # Ensure that ambiguous word is a lemma.
    if not wn.synsets(lemma):
        if not wn.synsets(stem):
            return ambiguous_word
        else:
            return stem
    else:
        return lemma

In [7]:
def lemmatize_sentence(sentence: str,
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter) -> list:

    words, lemmas, poss = [], [], []
    for word, pos in postagger(tokenizer(sentence)):
        pos = penn2morphy(pos)
        lemmas.append(lemmatize(word.lower(), pos,
                                lemmatizer, stemmer))
        poss.append(pos)
        words.append(word)

    
    return words, lemmas, poss

In [8]:
def synset_signatures(ss: "wn.Synset") -> set:
    """
    Takes a Synset and returns its signature words.
    :param ss: An instance of wn.Synset.
    :return: A set of signature strings
    """

    # Collects the signatures from WordNet.
    signature = []

    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())

    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    
    hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() + ss.instance_hyponyms() + ss.instance_hypernyms())
    signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Lowercase.
    signature = set(s.lower() for s in signature) 

    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS) 

    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(s) 
                  for s in signature
                  if not (s.isdigit())]

    # Keep only the unique bag-of-words
    return set(signature)

In [10]:
def signatures(ambiguous_word: str, pos: str = None) -> dict:
    """
    Takes an ambiguous word and optionally its Part-Of-Speech and returns
    a dictionary where keys are the synsets and values are sets of signatures.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: dict(synset:{signatures}).
    """

    # Ensure that the POS is supported.
    # pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    pos = pos if pos in ['n', None] else None
    # If the POS specified isn't found but other POS is in wordnet.
    if not wn.synsets(ambiguous_word, pos) and wn.synsets(ambiguous_word):
        pos = None

    # Holds the synset->signature dictionary.
    ss_sign = {}
    for ss in wn.synsets(ambiguous_word, pos):
        ss_sign[ss] = synset_signatures(ss)

    return ss_sign

In [11]:
def cosine_similarity(sent1: str, sent2: str) -> float:
    """
    Calculates cosine similarity between 2 sentences/documents.
    Thanks to @vpekar, see http://goo.gl/ykibJY
    """

    WORD = re.compile(r'\w+')
    def get_cosine(vec1, vec2):
        intersection = set(vec1.keys()) & set(vec2.keys())
        numerator = sum([vec1[x] * vec2[x] for x in intersection])

        sum1 = sum([vec1[x]**2 for x in vec1.keys()])
        sum2 = sum([vec2[x]**2 for x in vec2.keys()])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)

        if not denominator:
            return 0.0
        else:
            return float(numerator) / denominator

    def text_to_vector(text):
        words = WORD.findall(text)
        return Counter(words)

    vector1 = text_to_vector(sent1)
    vector2 = text_to_vector(sent2)
    cosine = get_cosine(vector1, vector2)

    return cosine

In [12]:
def e_lesk(context_sentence: str, ambiguous_word: str,
                pos: str = None) -> "wn.Synset":
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = signatures(ambiguous_word, pos)

    surface_words, lemmas, morphy_poss = lemmatize_sentence(context_sentence)
    
    context_sentence = " ".join(lemmas)

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cosine_similarity(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores[0][1]

In [13]:
def disambiguate(sentence, algorithm=e_lesk,
                 tokenizer=word_tokenize):

    tagged_sentence = []
    # Pre-lemmatize the sentnece before WSD
    
    surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, tokenizer=tokenizer)
    lemma_sentence = " ".join(lemmas)

    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in EN_STOPWORDS: # Checks if it is a content word
            if wn.synsets(lemma):
                synset = algorithm(lemma_sentence, lemma, pos=pos)
            else: # In case the content word is not in WordNet.
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        
        tagged_sentence.append((word, synset))

    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    tagged_sentence = [(word, None) if str(tag).startswith('#')
                        else (word, tag, tag.definition()) for word, tag in tagged_sentence]
    return tagged_sentence

In [14]:
disambiguate("I went to the bank to deposit my money")

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/home/krishna/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
