In [12]:
import nltk
# Fetch the NLTK resources this notebook relies on:
#   punkt                      -> nltk.word_tokenize
#   averaged_perceptron_tagger -> nltk.pos_tag
#   wordnet                    -> nltk.WordNetLemmatizer (was missing; without it
#                                 the first lemmatize() call raises LookupError
#                                 on a machine that has never fetched the corpus)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/kevin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kevin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [24]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [14]:
from IPython.display import display
# Shared WordNet lemmatizer instance; normalise() below calls its lemmatize() method.
lemmatizer = nltk.WordNetLemmatizer()

In [43]:
# Sample definitions to run through the pipeline; point `document` at the one to analyse.
a="a member of the human race"
b="a complex system"
c="a person who shares their knowledge"
# NOTE(review): "who's" is probably a typo for "whose" -- the tokenizer splits it
# into 'who' + "'s", and "'s" then surfaces as its own feature further down.
d="a person who's occupation is teaching"
e="a common employee of a university"

# The text every following cell operates on.
document = d
document

"a person who's occupation is teaching"

In [44]:
# Tokenize the document; keep the result wrapped in a list so `tokens` is a
# list of token lists (one entry per sentence, here just one).
tokens = [nltk.word_tokenize(document)]
tokens

[['a', 'person', 'who', "'s", 'occupation', 'is', 'teaching']]

In [45]:
# POS-tag the first (and only) tokenized sentence -> list of (word, tag) pairs.
first_sentence = tokens[0]
postag = nltk.pos_tag(first_sentence)
postag

[('a', 'DT'),
 ('person', 'NN'),
 ('who', 'WP'),
 ("'s", 'VBZ'),
 ('occupation', 'NN'),
 ('is', 'VBZ'),
 ('teaching', 'VBG')]

In [46]:
# pos_tag takes a list of tokens, so wrapping the raw string in a list makes the
# whole sentence one "token" -- hence the single tag in the output below.
# NOTE(review): if real tagging was intended, tokenize first (as `postag` does above).
pos_tag([document])

[("a person who's occupation is teaching", 'VBG')]

In [47]:
# Two-stage chunk grammar:
#   NBAR - noun groups (optional adjectives/nouns ending in a noun) and verb groups
#   NP   - two NBARs joined by a preposition, or a lone NBAR
# Rules inside a stage run top to bottom and only match tokens that are not
# already chunked, so the longer <NBAR><IN><NBAR> pattern has to come before the
# bare <NBAR> rule; in the opposite order every NBAR is consumed first and the
# prepositional rule can never match.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        {<RB.?>*<VB.?>*<JJ>*<VB.?>+<VB>?} # Verbs and Verb Phrases

    NP:
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        {<NBAR>}
"""

cp = nltk.RegexpParser(grammar)
tree = cp.parse(postag)  # chunk the tagged sentence into a tree of NP/NBAR nodes
print(tree)

(S
  a/DT
  (NP (NBAR person/NN))
  who/WP
  (NP (NBAR 's/VBZ))
  (NP (NBAR occupation/NN))
  (NP (NBAR is/VBZ teaching/VBG)))


In [31]:
# tree.draw()

In [48]:
def leaves(tree):
    """Yield the leaf list of every subtree of `tree` whose label is 'NP'.

    Each yielded item is whatever `subtree.leaves()` returns for that chunk.
    """
    def has_np_label(node):
        return node.label() == 'NP'

    for np_chunk in tree.subtrees(filter=has_np_label):
        yield np_chunk.leaves()
        
def get_word_postag(word):
    """Return the WordNet POS constant matching the tagger's tag for `word`.

    The word is tagged on its own (as a one-token sentence). Tags starting
    with 'J' map to wordnet.ADJ and tags starting with 'V' to wordnet.VERB;
    noun tags and every other tag map to wordnet.NOUN.
    """
    # Tag once and reuse the result instead of re-running the tagger per branch.
    tag = pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    # Nouns and the catch-all case share the same answer.
    return wordnet.NOUN
    
def normalise(word):
    """Lowercase `word` and lemmatize it with the POS guessed by get_word_postag.

    Note: no stemming is applied; only lowercasing and lemmatization.
    """
    lowered = word.lower()
    return lemmatizer.lemmatize(lowered, get_word_postag(lowered))

def get_terms(tree):
    """Yield, for each NP chunk found by leaves(tree), its words passed through normalise().

    Each chunk is a sequence of (word, tag) pairs; the tag is ignored here.
    """
    for chunk in leaves(tree):
        normalised = []
        for token, _tag in chunk:
            normalised.append(normalise(token))
        yield normalised

# get_terms returns a one-shot generator, so it is created in the same cell
# that consumes it; re-running this cell always starts from a fresh one.
terms = get_terms(tree)

# One feature per chunk: its normalised words joined by single spaces.
# str.join replaces the manual `+=` accumulation and leaves no scratch
# variables behind in the notebook namespace.
features = [' '.join(term).strip() for term in terms]
features

['person', "'s", 'occupation', 'be teach']