# POS tagging in NLTK

NLTK has a part-of-speech (POS) tagger that takes a list of tokens as input and outputs a list of tuples of the form (token, pos).


In [3]:
from collections import Counter

import nltk
# The tagger works on a list of tokens, so the raw sentence is tokenized first.
sentence = 'The quick brown fox jumped over the lazy dog.'  # try adding a comma ('quick, brown') to see 'brown' tagged JJ
tokens = nltk.word_tokenize(sentence)
# The tagger output is printed below as (token, tag) tuples.
tags = nltk.pos_tag(tokens)
print(tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


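Many nouns and verbs have the same form, which can make it difficult for POS taggers to distinguish them. In the example below, the POS tagger misses the main verb of the sentence: the 5th token, "senses", is tagged NNS (plural noun) instead of VBZ (verb, 3rd person singular present).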

In [4]:
# 'senses' occurs twice here: once as the main verb (5th token) and once as a
# plural noun (last word), so the tagger has to disambiguate the same form.
sentence = 'My sense of danger senses that he has lost his senses.'
# Tokenize and tag in a single expression; only the tagged result is kept.
tags = nltk.pos_tag(nltk.word_tokenize(sentence))
print(tags)

[('My', 'PRP$'), ('sense', 'NN'), ('of', 'IN'), ('danger', 'NN'), ('senses', 'NNS'), ('that', 'IN'), ('he', 'PRP'), ('has', 'VBZ'), ('lost', 'VBN'), ('his', 'PRP$'), ('senses', 'NNS'), ('.', '.')]


In [6]:
# A shorter sentence in which 'senses' is again the main verb.
sentence = 'My spidey sense senses danger.'
# Tokenize and tag in a single expression; only the tagged result is kept.
tags = nltk.pos_tag(nltk.word_tokenize(sentence))
print(tags)

[('My', 'PRP$'), ('spidey', 'NN'), ('sense', 'NN'), ('senses', 'VBZ'), ('danger', 'NN'), ('.', '.')]


Try out some ambiguous words.

In [10]:
# 'wind' appears twice in this sentence: first as a verb, then as a noun.
text = 'They wind back the clock while we chase the wind.'
# Tokenize, tag and print in one expression; nothing besides `text` is stored.
print(nltk.pos_tag(nltk.word_tokenize(text)))

[('They', 'PRP'), ('wind', 'VBP'), ('back', 'RB'), ('the', 'DT'), ('clock', 'NN'), ('while', 'IN'), ('we', 'PRP'), ('chase', 'VBP'), ('the', 'DT'), ('wind', 'NN'), ('.', '.')]


# Practice:

* use the following text
* perform POS tagging
* build a dictionary that maps each POS tag to the number of times it occurs
* print the dictionary entries sorted from highest to lowest count

In [7]:
# Practice passage for the exercise. The trailing backslashes inside the
# triple-quoted string suppress the newlines, so `text` is one single line.
# NOTE(review): looks like the opening of Dostoevsky's "Crime and Punishment" —
# confirm the source/translation and record it here.
text = """On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in \
S. Place and walked slowly, as though in hesitation, towards K. bridge. \
He had successfully avoided meeting his landlady on the staircase. His \
garret was under the roof of a high, five-storied house and was more \
like a cupboard than a room. The landlady who provided him with garret, \
dinners, and attendance, lived on the floor below, and every time \
he went out he was obliged to pass her kitchen, the door of which \
invariably stood open. And each time he passed, the young man had a \
sick, frightened feeling, which made him scowl and feel ashamed. He was \
hopelessly in debt to his landlady, and was afraid of meeting her."""

# Bare expression: the notebook echoes the string so the joined text can be checked.
text

'On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in S. Place and walked slowly, as though in hesitation, towards K. bridge. He had successfully avoided meeting his landlady on the staircase. His garret was under the roof of a high, five-storied house and was more like a cupboard than a room. The landlady who provided him with garret, dinners, and attendance, lived on the floor below, and every time he went out he was obliged to pass her kitchen, the door of which invariably stood open. And each time he passed, the young man had a sick, frightened feeling, which made him scowl and feel ashamed. He was hopelessly in debt to his landlady, and was afraid of meeting her.'

In [8]:
# Split the practice passage into tokens, then tag each token with its POS.
tokens = nltk.word_tokenize(text)
tags = nltk.pos_tag(tokens)
# Bare expression: the notebook displays the list of (token, tag) tuples.
tags

[('On', 'IN'),
 ('an', 'DT'),
 ('exceptionally', 'RB'),
 ('hot', 'JJ'),
 ('evening', 'VBG'),
 ('early', 'JJ'),
 ('in', 'IN'),
 ('July', 'NNP'),
 ('a', 'DT'),
 ('young', 'JJ'),
 ('man', 'NN'),
 ('came', 'VBD'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('garret', 'NN'),
 ('in', 'IN'),
 ('which', 'WDT'),
 ('he', 'PRP'),
 ('lodged', 'VBD'),
 ('in', 'IN'),
 ('S.', 'NNP'),
 ('Place', 'NNP'),
 ('and', 'CC'),
 ('walked', 'VBD'),
 ('slowly', 'RB'),
 (',', ','),
 ('as', 'IN'),
 ('though', 'IN'),
 ('in', 'IN'),
 ('hesitation', 'NN'),
 (',', ','),
 ('towards', 'NNS'),
 ('K.', 'NNP'),
 ('bridge', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('had', 'VBD'),
 ('successfully', 'RB'),
 ('avoided', 'VBN'),
 ('meeting', 'VBG'),
 ('his', 'PRP$'),
 ('landlady', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('staircase', 'NN'),
 ('.', '.'),
 ('His', 'PRP$'),
 ('garret', 'NN'),
 ('was', 'VBD'),
 ('under', 'IN'),
 ('the', 'DT'),
 ('roof', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('high', 'JJ'),
 (',', ','),
 ('five-stori

In [9]:
# Tally how many tokens received each POS tag. `tags` is the list of
# (token, tag) tuples produced by nltk.pos_tag; only the tag is counted.
# Counter is a dict subclass, so pos_dict[tag] lookups keep working.
pos_dict = Counter(pos for _, pos in tags)

# most_common() yields (tag, count) pairs from highest to lowest count;
# tags with equal counts stay in the order they were first encountered.
for pos, count in pos_dict.most_common():
    print(pos, ':', count)

NN : 25
IN : 21
VBD : 17
DT : 15
, : 12
JJ : 8
PRP : 8
CC : 7
RB : 6
. : 6
PRP$ : 5
NNP : 4
VBG : 3
WDT : 3
VBN : 3
NNS : 2
TO : 2
RBR : 1
WP : 1
VB : 1
