# Chapter 05: Categorzing and Tagging Words

In [22]:
import nltk

In [23]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [24]:
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [25]:
import nltk

text = "I will book the appointment and brush my hair."
words = nltk.word_tokenize(text)
tagged = nltk.pos_tag(words)
print(tagged)

[('I', 'PRP'), ('will', 'MD'), ('book', 'NN'), ('the', 'DT'), ('appointment', 'NN'), ('and', 'CC'), ('brush', 'VB'), ('my', 'PRP$'), ('hair', 'NN'), ('.', '.')]


In [26]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year car moment world house family child country boy
state job place way war girl work word


In [27]:
text.similar('bought')

made said done put had seen found given left heard was been brought
set got that took in told felt


In [28]:
text.similar('beauty')

that and time the children place in is all more back men feet af one
money night education death women


In [29]:
text.similar('over')

in on to of and for with from at by that into as up out down through
is all about


In [30]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


## Tagged Corpora

### Representing Tagged Tokens

In [31]:
tagged_tokens = nltk.tag.str2tuple('fly/NN')
tagged_tokens

('fly', 'NN')

In [32]:
tagged_tokens[0]

'fly'

In [33]:
tagged_tokens[1]

'NN'

In [34]:
#List of taggedtokkens
sent = '''
    The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
    other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
    Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
    said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
    accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
    interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''
[nltk.tag.str2tuple(t) for t in sent.split()]

[('The', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.')]

### Reading taggled corpora

In [35]:
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [36]:
print(nltk.corpus.brown.tagged_words())

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]


In [37]:
nltk.corpus.brown.conll2000.tagged_words()

AttributeError: 'CategorizedTaggedCorpusReader' object has no attribute 'conll2000'

In [None]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\netra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [None]:
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [None]:
nltk.download('sinica_treebank')

[nltk_data] Downloading package sinica_treebank to
[nltk_data]     C:\Users\netra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\sinica_treebank.zip.


True

In [None]:
nltk.corpus.sinica_treebank.tagged_words()

[('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]

In [None]:
nltk.download('indian')

[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\netra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\indian.zip.


True

In [None]:
nltk.corpus.indian.tagged_words()

[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]

![image.png](attachment:image.png)

In [None]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()

LookupError: 
**********************************************************************
  Resource [93muniversal_tagset[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('universal_tagset')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/universal_tagset/en-brown.map[0m

  Searched in:
    - 'C:\\Users\\netra/nltk_data'
    - 'e:\\Python\\myenv\\nltk_data'
    - 'e:\\Python\\myenv\\share\\nltk_data'
    - 'e:\\Python\\myenv\\lib\\nltk_data'
    - 'C:\\Users\\netra\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
wsj = nltk.corpus.treebank.tagged_words(simplify_tags = True)

word_tag_fd = nltk.FreqDist(wsj)
[wt for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']

TypeError: SyntaxCorpusReader.tagged_words() got an unexpected keyword argument 'simplify_tags'

In [None]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()

LookupError: 
**********************************************************************
  Resource [93muniversal_tagset[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('universal_tagset')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/universal_tagset/en-brown.map[0m

  Searched in:
    - 'C:\\Users\\netra/nltk_data'
    - 'e:\\Python\\myenv\\nltk_data'
    - 'e:\\Python\\myenv\\share\\nltk_data'
    - 'e:\\Python\\myenv\\lib\\nltk_data'
    - 'C:\\Users\\netra\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
tag_fd.plot(cumulative=True)

NameError: name 'tag_fd' is not defined