In [6]:
# Lab: POS tagging in NLTK
# This file has small examples that are meant to be run individually
#   in the Python shell

import nltk

In [4]:
## Part 1: Demo of POS-tagged corpora in NLTK: Brown and Penn Treebank
# the Brown corpus has its own set of POS tags

from nltk.corpus import brown

In [12]:
from nltk.corpus import brown

# Tag frequencies over the WHOLE Brown corpus, mapped to the universal tagset.
# No `categories=` argument is passed, so the words are not restricted to the
# news category; the variable is therefore named `brown_tagged` rather than
# `brown_news_tagged` (pass categories='news' to get news-only counts).
brown_tagged = brown.tagged_words(tagset='universal')
tag_fd = nltk.FreqDist(tag for (_word, tag) in brown_tagged)
# most_common() with no argument lists every tag, most frequent first
tag_fd.most_common()

[('NOUN', 275558),
 ('VERB', 182750),
 ('.', 147565),
 ('ADP', 144766),
 ('DET', 137019),
 ('ADJ', 83721),
 ('ADV', 56239),
 ('PRON', 49334),
 ('CONJ', 38151),
 ('PRT', 29829),
 ('NUM', 14874),
 ('X', 1386)]

In [3]:
# tagged_sents() returns the corpus as sentences, each a list of (word, tag) pairs.
# With no tagset argument the tags are the Brown corpus's own tags (e.g. 'AT', 'NN-TL').
# Slice to the first two sentences to keep the output short.
brown.tagged_sents()[:2]



[[('The', 'AT'),
  ('Fulton', 'NP-TL'),
  ('County', 'NN-TL'),
  ('Grand', 'JJ-TL'),
  ('Jury', 'NN-TL'),
  ('said', 'VBD'),
  ('Friday', 'NR'),
  ('an', 'AT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NP$'),
  ('recent', 'JJ'),
  ('primary', 'NN'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'AT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'CS'),
  ('any', 'DTI'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'AT'),
  ('jury', 'NN'),
  ('further', 'RBR'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'NN'),
  ('presentments', 'NNS'),
  ('that', 'CS'),
  ('the', 'AT'),
  ('City', 'NN-TL'),
  ('Executive', 'JJ-TL'),
  ('Committee', 'NN-TL'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'HVD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'AT'),
  ('praise', 'NN'),
  ('and', 

In [4]:
# tagged_words() is the flat, token-level view: one (word, tag) pair per token,
# with no sentence grouping. [:20] keeps the first 20 tokens.
brown.tagged_words()[:20]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS')]

In [6]:
# Each tagged word is a (word, tag) pair, which Python represents as a tuple:
#  it is indexed like a list, but its elements cannot be changed (immutable)
wordtag = brown.tagged_words()[0]
wordtag


('The', 'AT')

In [7]:
# confirm the type of wordtag (a plain Python tuple)
type(wordtag)


tuple

In [8]:
# index into the tuple to get one item at a time:
#  wordtag[0] is the word, wordtag[1] is its tag
wordtag[0]


'The'

In [9]:
# the Brown corpus is divided into named categories; list them
#  (a category name can be passed as categories=... to the corpus methods)
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [10]:
# restrict to the "humor" category and map the tags to the universal tagset,
# then show the first 20 tagged tokens
brown_humor_tagged = brown.tagged_words(categories='humor', tagset='universal')
brown_humor_tagged[:20]

[('It', 'PRON'),
 ('was', 'VERB'),
 ('among', 'ADP'),
 ('these', 'DET'),
 ('that', 'ADP'),
 ('Hinkle', 'NOUN'),
 ('identified', 'VERB'),
 ('a', 'DET'),
 ('photograph', 'NOUN'),
 ('of', 'ADP'),
 ('Barco', 'NOUN'),
 ('!', '.'),
 ('!', '.'),
 ('For', 'ADP'),
 ('it', 'PRON'),
 ('seems', 'VERB'),
 ('that', 'ADP'),
 ('Barco', 'NOUN'),
 (',', '.'),
 ('fancying', 'VERB')]

In [11]:
# Penn treebank
from nltk.corpus import treebank

In [12]:
# raw() returns the corpus file contents as one string (not a list of tokens);
# for this treebank sample that text is the bracketed parse annotation, as the output shows
treebank_text = treebank.raw() # a single raw string
print(treebank_text[:150])


( (S 
    (NP-SBJ 
      (NP (NNP Pierre) (NNP Vinken) )
      (, ,) 
      (ADJP 
        (NP (CD 61) (NNS years) )
        (JJ old) )
      (, ,) )


In [13]:
# treebank also provides tokenization: words() gives the word-level view
# show the first 20 tokens
treebank_tokens = treebank.words()
treebank_tokens[:20]


['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.',
 'Mr.',
 'Vinken']

In [14]:
# tagged_words() pairs every token with its Penn Treebank tag (flat list of (word, tag));
# print the first 20 pairs
treebank_tagged_words = treebank.tagged_words()
print(treebank_tagged_words[:20])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP')]


In [15]:
# the same tagging grouped by sentence: a list of sentences, each a list of (word, tag) pairs
# (treebank_tagged is reused below for the train/test split)
treebank_tagged = treebank.tagged_sents()
print(treebank_tagged[:2])

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]]


In [16]:
## Frequency distribution of tags in Penn Treebank
# count how often each tag occurs across all tagged words; keys() lists the distinct tags
# NOTE: this rebinds tag_fd, which an earlier cell used for the Brown corpus tag counts
tag_fd = nltk.FreqDist(tag for (word, tag) in treebank.tagged_words())
tag_fd.keys()


dict_keys(['NNP', ',', 'CD', 'NNS', 'JJ', 'MD', 'VB', 'DT', 'NN', 'IN', '.', 'VBZ', 'VBG', 'CC', 'VBD', 'VBN', '-NONE-', 'RB', 'TO', 'PRP', 'RBR', 'WDT', 'VBP', 'RP', 'PRP$', 'JJS', 'POS', '``', 'EX', "''", 'WP', ':', 'JJR', 'WRB', '$', 'NNPS', 'WP$', '-LRB-', '-RRB-', 'PDT', 'RBS', 'FW', 'UH', 'SYM', 'LS', '#'])

In [26]:
# List every Penn Treebank tag with its count, most frequent first
# (most_common() with no argument returns all (tag, count) pairs in descending order).
for tag_name, count in tag_fd.most_common():
    print(tag_name, count)

NN 13166
IN 9857
NNP 9410
DT 8165
-NONE- 6592
NNS 6047
JJ 5834
, 4886
. 3874
CD 3546
VBD 3043
RB 2822
VB 2554
CC 2265
TO 2179
VBN 2134
VBZ 2125
PRP 1716
VBG 1460
VBP 1321
MD 927
POS 824
PRP$ 766
$ 724
`` 712
'' 694
: 563
WDT 445
JJR 381
NNPS 244
WP 241
RP 216
JJS 182
WRB 178
RBR 136
-RRB- 126
-LRB- 120
EX 88
RBS 35
PDT 27
# 16
WP$ 14
LS 13
FW 4
UH 3
SYM 1


In [27]:
# Collapse tags into coarse classes keyed by their first character
# (e.g. NN, NNS and NNP all become 'N'), then count each class.
tag_classes_fd = nltk.FreqDist(tag[0] for (_word, tag) in treebank.tagged_words())
print(tag_classes_fd.keys(), '\n')  # the distinct first characters
# classes ranked from most to least frequent
for tag_class, count in tag_classes_fd.most_common():
    print(tag_class, count)

dict_keys(['N', ',', 'C', 'J', 'M', 'V', 'D', 'I', '.', '-', 'R', 'T', 'P', 'W', '`', 'E', "'", ':', '$', 'F', 'U', 'S', 'L', '#']) 

N 28867
V 12637
I 9857
D 8165
- 6838
J 6397
C 5811
, 4886
. 3874
P 3333
R 3209
T 2179
M 927
W 878
$ 724
` 712
' 694
: 563
E 88
# 16
L 13
F 4
U 3
S 1


## Part 2: POS Tagging


In [19]:
# Split the tagged sentences: the first 90% train the taggers,
# the remaining 10% are held out for evaluation.
TRAIN_FRACTION = 0.9
size = int(len(treebank_tagged) * TRAIN_FRACTION)  # number of training sentences
treebank_train, treebank_test = treebank_tagged[:size], treebank_tagged[size:]
size

3522

In [20]:
# DefaultTagger assigns the same tag ('NN') to every token, whatever the word is
# create the tagger
t0 = nltk.DefaultTagger('NN')
# show the effect of the tagger by tagging the first 50 tokens
print(t0.tag(treebank_tokens[:50]))



[('Pierre', 'NN'), ('Vinken', 'NN'), (',', 'NN'), ('61', 'NN'), ('years', 'NN'), ('old', 'NN'), (',', 'NN'), ('will', 'NN'), ('join', 'NN'), ('the', 'NN'), ('board', 'NN'), ('as', 'NN'), ('a', 'NN'), ('nonexecutive', 'NN'), ('director', 'NN'), ('Nov.', 'NN'), ('29', 'NN'), ('.', 'NN'), ('Mr.', 'NN'), ('Vinken', 'NN'), ('is', 'NN'), ('chairman', 'NN'), ('of', 'NN'), ('Elsevier', 'NN'), ('N.V.', 'NN'), (',', 'NN'), ('the', 'NN'), ('Dutch', 'NN'), ('publishing', 'NN'), ('group', 'NN'), ('.', 'NN'), ('Rudolph', 'NN'), ('Agnew', 'NN'), (',', 'NN'), ('55', 'NN'), ('years', 'NN'), ('old', 'NN'), ('and', 'NN'), ('former', 'NN'), ('chairman', 'NN'), ('of', 'NN'), ('Consolidated', 'NN'), ('Gold', 'NN'), ('Fields', 'NN'), ('PLC', 'NN'), (',', 'NN'), ('was', 'NN'), ('named', 'NN'), ('*-1', 'NN'), ('a', 'NN')]


In [21]:
# accuracy() strips the tags from the gold-standard sentences in treebank_test,
#   re-tags them with t0, and returns the fraction of tokens whose tag matches
print(t0.accuracy(treebank_test))

0.14697201017811704


## Part 3: n-gram tagger

In [22]:
# UnigramTagger learns, for each word, the tag it most often carries in the training data
# train the tagger on the training set; no backoff tagger is given here
t1 = nltk.UnigramTagger(treebank_train)

In [23]:
# show the effect of the tagger by tagging the first 20 tokens
# NOTE: these tokens open the corpus, so they fall inside the training split
#   (treebank_train is the first 90% of sentences); the held-out accuracy is the real measure
print(t1.tag(treebank_tokens[:20]))


[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP')]


In [24]:
# evaluate the unigram tagger on the held-out test set
t1.accuracy(treebank_test)

0.8627989821882952

In [28]:
# Bigram tagging with backoff to combine taggers
# build a backoff chain: bigram (t2) -> unigram (t1) -> default 'NN' (t0)
# NOTE: this rebinds t0 and t1; from here on t1 is the unigram tagger WITH backoff,
#   so re-running the earlier t1.accuracy cell after this one scores a different tagger
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(treebank_train, backoff=t0)
t2 = nltk.BigramTagger(treebank_train, backoff=t1)


In [29]:
# accuracy of the bigram tagger (backed off to unigram, then default 'NN') on the test set
t2.accuracy(treebank_test)

0.8905852417302799

In [30]:
# Task: run the bigram tagger on new, unseen text.
# The sample is two sentences, written as adjacent literals that Python joins into one string.
text = (
    "Three Calgarians have found a rather unusual way of leaving snow and ice behind. "
    "They set off this week on foot and by camels on a grueling trek across the burning Arabian desert."
)

In [31]:

# step 1: split the text into sentences
textsplit = nltk.sent_tokenize(text)
print(textsplit)

['Three Calgarians have found a rather unusual way of leaving snow and ice behind.', 'They set off this week on foot and by camels on a grueling trek across the burning Arabian desert.']


In [32]:
# step 2: word-tokenize every sentence, giving one token list per sentence
tokentext = list(map(nltk.word_tokenize, textsplit))
print(tokentext)

[['Three', 'Calgarians', 'have', 'found', 'a', 'rather', 'unusual', 'way', 'of', 'leaving', 'snow', 'and', 'ice', 'behind', '.'], ['They', 'set', 'off', 'this', 'week', 'on', 'foot', 'and', 'by', 'camels', 'on', 'a', 'grueling', 'trek', 'across', 'the', 'burning', 'Arabian', 'desert', '.']]


In [33]:
# step 3: tag each sentence's token list with the t2 bigram (backoff) tagger
taggedtext = list(map(t2.tag, tokentext))
print(taggedtext)

[[('Three', 'CD'), ('Calgarians', 'NN'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'IN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBN'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NN'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'NN'), ('desert', 'NN'), ('.', '.')]]


In [34]:
# different tagger: nltk.pos_tag applies NLTK's own default pre-trained tagger to each sentence
# NOTE(review): this is not the Stanford tagger, despite the variable name; the name is
#   left unchanged so any later use of taggedtextStanford still resolves
taggedtextStanford = [nltk.pos_tag(tokens) for tokens in tokentext]
print(taggedtextStanford)

[[('Three', 'CD'), ('Calgarians', 'NNPS'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'NN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBD'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NNS'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'JJ'), ('desert', 'NN'), ('.', '.')]]
