POS tagging

Named entity recognition

Dependency parsing

# NLTK

In [1]:
import nltk
from nltk import word_tokenize
from nltk import pos_tag

In [2]:
sentence = "The father didn't father natural language processing."

In [3]:
words = word_tokenize(sentence)
print(pos_tag(words))

[('The', 'DT'), ('father', 'NN'), ('did', 'VBD'), ("n't", 'RB'), ('father', 'VB'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.')]


POS tag list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [4]:
from nltk import ne_chunk

In [5]:
sentence = 'Jimmy studies at Northwestern University in IL'

In [6]:
words = word_tokenize(sentence)
pg = pos_tag(words)
print(ne_chunk(pg))

(S
  (GPE Jimmy/NNP)
  studies/NNS
  at/IN
  (FACILITY Northwestern/NNP University/NNP)
  in/IN
  IL/NNP)


In [7]:
from nltk.grammar import DependencyGrammar

In [8]:
groucho_grammar = nltk.CFG.fromstring("""
... S -> NP VP
... PP -> P NP
... NP -> Det N | Det N PP | 'I'
... VP -> V NP | VP PP
... Det -> 'an' | 'my'
... N -> 'elephant' | 'pajamas'
... V -> 'shot'
... P -> 'in'
... """)

In [9]:
sentence = 'I shot an elephant in my pajamas'
words = word_tokenize(sentence)

parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(words):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


Draw the trees ~ https://www.nltk.org/book/ch08.html

# Standford Tool kits

https://nlp.stanford.edu/software/

In [10]:
from nltk.tag.stanford import StanfordPOSTagger

In [11]:
english_postagger = StanfordPOSTagger('/Users/xiaofengzhu/Downloads/stanford_models/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger', '/Users/xiaofengzhu/Downloads/stanford_models/stanford-postagger-2018-02-27/stanford-postagger.jar')

In [12]:
english_postagger.tag('The father fathered natural language processing.'.split())

[('The', 'DT'),
 ('father', 'NN'),
 ('fathered', 'VBD'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing.', 'NN')]

POS tag list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [13]:
from nltk.tag.stanford import StanfordNERTagger

In [14]:
english_nertagger = StanfordNERTagger('/Users/xiaofengzhu/Downloads/stanford_models/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz', '/Users/xiaofengzhu/Downloads/stanford_models/stanford-ner-2018-02-27/stanford-ner.jar')

In [15]:
english_nertagger.tag('Jimmy studies at Northwestern University in IL'.split())

[('Jimmy', 'PERSON'),
 ('studies', 'O'),
 ('at', 'O'),
 ('Northwestern', 'ORGANIZATION'),
 ('University', 'ORGANIZATION'),
 ('in', 'O'),
 ('IL', 'O')]

In [16]:
from nltk.parse.stanford import StanfordParser

Online demo: http://nlp.stanford.edu:8080/parser/

In [17]:
path_to_jar = '/Users/xiaofengzhu/Downloads/stanford_models/stanford-parser-full-2014-08-27/stanford-parser.jar'
path_to_models_jar = '/Users/xiaofengzhu/Downloads/stanford_models/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models.jar'

In [18]:
english_parser = StanfordParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

In [19]:
results = english_parser.raw_parse_sents('I shot an elephant in my pajamas'.split())

In [20]:
for result in results:
    for list_thing in result:
        print(list_thing)

(ROOT (FRAG (NP (PRP I))))
(ROOT (NP (NN shot)))
(ROOT (FRAG (DT an)))
(ROOT (NP (NN elephant)))
(ROOT (X (IN in)))
(ROOT (X (NP (JJ my))))
(ROOT (NP (NNS pajamas)))


In [21]:
from nltk.parse.stanford import StanfordDependencyParser

dependency_parser = StanfordDependencyParser(path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

result = dependency_parser.raw_parse('I shot an elephant in my pajamas')
dep = next(result)

list(dep.triples())

[(('shot', 'VBD'), 'nsubj', ('I', 'PRP')),
 (('shot', 'VBD'), 'dobj', ('elephant', 'NN')),
 (('elephant', 'NN'), 'det', ('an', 'DT')),
 (('shot', 'VBD'), 'prep', ('in', 'IN')),
 (('in', 'IN'), 'pobj', ('pajamas', 'NNS')),
 (('pajamas', 'NNS'), 'poss', ('my', 'PRP$'))]