In [2]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch only the NLTK resources this notebook relies on. A bare
# nltk.download() opens the interactive downloader, which stalls a
# "Restart Kernel -> Run All"; naming the packages keeps the cell unattended.
# NOTE(review): these ids match the classic NLTK data index; newer NLTK
# releases may request extra ids (e.g. "punkt_tab") - confirm against the
# installed version.
nltk.download(
    [
        "punkt",                       # tokenizer models used by word_tokenize
        "averaged_perceptron_tagger",  # model behind pos_tag
        "maxent_ne_chunker",           # model behind ne_chunk
        "words",                       # word list the NE chunker depends on
    ],
    quiet=True,
)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Parts of Speech (POS) Tagging

In [9]:
# import statements
from nltk import pos_tag, ne_chunk

In [4]:
# Example sentence in which the word "lie" appears twice, so the tagger has
# to pick a part of speech for each occurrence from its context.
text = "I always lie down to tell a lie."

In [7]:
# Split the raw string into a list of word and punctuation tokens.
sentence = word_tokenize(text)

# Tag each token with its part of speech. As the last expression in the cell,
# the resulting list of (token, tag) pairs is displayed as the cell output.
pos_tag(sentence)

[('I', 'PRP'),
 ('always', 'RB'),
 ('lie', 'VBP'),
 ('down', 'RP'),
 ('to', 'TO'),
 ('tell', 'VB'),
 ('a', 'DT'),
 ('lie', 'NN'),
 ('.', '.')]

## Named Entity Recognition (NER)

In [8]:
# Example sentence for entity recognition; it names a person, a company and
# a place. Note that this rebinds `text` from the POS tagging example.
text = "Antonio joined Udacity Inc. in California."

In [10]:
# Named entity recognition runs as a three-stage pipeline:
# 1. split the text into tokens
tokens = word_tokenize(text)
# 2. attach a part-of-speech tag to every token
tagged_tokens = pos_tag(tokens)
# 3. group the tagged tokens into named-entity chunks
tree = ne_chunk(tagged_tokens)
print(tree)

(S
  (PERSON Antonio/NNP)
  joined/VBD
  (ORGANIZATION Udacity/NNP Inc./NNP)
  in/IN
  (GPE California/NNP)
  ./.)


## Sentence Parsing
[CFG Methods](https://tedboy.github.io/nlps/generated/generated/nltk.CFG.html) <br>
[Formalizing Context Free Grammars](http://nltk.sourceforge.net/doc/en/ch07.html#formalizing-context-free-grammars)

In [11]:
# Production rules of a small context-free grammar in NLTK's string notation:
# the nonterminal being defined sits left of "->", alternatives are separated
# by "|", and quoted words are terminals.
grammar_rules = """
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
"""

# Build the CFG object from the rule text, then a chart parser that uses it.
my_grammar = nltk.CFG.fromstring(grammar_rules)
parser = nltk.ChartParser(my_grammar)

In [13]:
# Tokenize a sentence made up only of words the grammar defines.
sentence = word_tokenize("I shot an elephant in my pajamas")

# Print every parse tree the grammar allows for these tokens.
parses = parser.parse(sentence)
for tree in parses:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [None]:
# Show each parse of `sentence` as a graphical diagram instead of text.
# NOTE(review): draw() needs a desktop GUI and presumably blocks until the
# window is closed, so this cell is unlikely to work in a headless "Run All";
# tree.pretty_print() looks like a text-only alternative - confirm.
for tree in parser.parse(sentence):
    tree.draw() # opens the tree diagram in a pop-up window

Note: Assigning parts of speech and sentence structure with a hand-written grammar like this is a simple but limited solution. It becomes tedious and error-prone for a large corpus of text, since you have to account for every possible sentence structure and tag!

There are other more advanced forms of POS tagging that can learn sentence structures and tags from given data, including Hidden Markov Models (HMMs) and Recurrent Neural Networks (RNNs).