In [8]:
import nltk

In [9]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Goal
 - What are lexical categories, and how are they used in natural language processing?
 - What is a good Python data structure for storing words and their categories?
 - How can we automatically tag each word of a text with its word class?

# Using a Tagger

In [10]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [11]:
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [12]:
text = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [13]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year moment car world family house child country boy job
state girl war way place case room


In [14]:
text.similar('bought')

made done said put had seen found given left heard set been brought
was got felt that called took told


In [15]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


# Tagged Corpora
## Representing Tagged Tokens

In [16]:
tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [17]:
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''
[nltk.tag.str2tuple(t) for t in sent.split()]

[('The', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.')]

## Reading Tagged Corpora

In [18]:
t_words = nltk.corpus.brown.tagged_words()
print(t_words)
print(t_words[10])

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
("Atlanta's", 'NP$')


In [19]:
t_words = nltk.corpus.brown.tagged_words(tagset='universal')
print(t_words)
print(t_words[10])

[('The', 'DET'), ('Fulton', 'NOUN'), ...]
("Atlanta's", 'NOUN')


In [20]:
nltk.corpus.nps_chat.tagged_words()

[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]

## A Simplified Part-of-Speech Tagset
![tagset](tagset.png)

In [21]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()

dict_keys(['.', 'NUM', 'NOUN', 'VERB', 'DET', 'ADV', 'CONJ', 'ADJ', 'PRON', 'PRT', 'ADP', 'X'])

## Nouns

Nouns generally refer to people, places, things, or concepts

Nouns can appear after determiners and adjectives, and can be the subject of object of the verb

In [23]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN'))

['.',
 'NUM',
 'PRON',
 'VERB',
 'DET',
 'ADV',
 'CONJ',
 'ADJ',
 'NOUN',
 'PRT',
 'ADP',
 'X']

## Verbs

Verbs are words that describe events and actions

In the context of a sentence, verbs typically express a relationi involving the referents of one or more noun phrases.

In [24]:
wsj = nltk.corpus.treebank.tagged_words()
word_tag_fd = nltk.FreqDist(wsj)
[word + "/" + tag for (word, tag) in word_tag_fd if tag.startswith('V')]

['surfaced/VBD',
 'prompted/VBN',
 'note/VBP',
 'counts/VBZ',
 'hailed/VBN',
 'building/VBG',
 'do/VB',
 'shed/VBD',
 'subpoenaed/VBN',
 'depended/VBD',
 'attack/VB',
 'echoed/VBN',
 'stake/VB',
 'buying/VBG',
 'swallow/VB',
 'pleaded/VBD',
 'define/VB',
 'defeat/VB',
 'matter/VB',
 'preapproved/VBN',
 'undercut/VBP',
 'moved/VBD',
 'undercutting/VBG',
 'soaring/VBG',
 'surviving/VBG',
 'emphasized/VBD',
 'claim/VB',
 'coupled/VBN',
 'considering/VBG',
 'spending/VBG',
 'feeling/VBG',
 'firmed/VBD',
 'endorse/VB',
 'blocked/VBN',
 'suspects/VBZ',
 'edged/VBN',
 'spurred/VBN',
 'inching/VBG',
 'encircling/VBG',
 'withdrawn/VBN',
 'license/VB',
 'written/VBN',
 'zoomed/VBN',
 'favor/VB',
 'shoot/VB',
 'become/VBN',
 'squeezed/VBN',
 'concerned/VBN',
 'enact/VB',
 'transformed/VBD',
 'succeed/VBP',
 'contradict/VB',
 'maintained/VBN',
 'adjusting/VBG',
 'skip/VBP',
 'invested/VBD',
 'discontinued/VBN',
 'changed/VBN',
 'pins/VBZ',
 'say/VB',
 'blamed/VBN',
 'file/VB',
 'elaborate/VB',
 'h

In [25]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['cut']

FreqDist({'NN': 3, 'VB': 12, 'VBD': 10, 'VBN': 3})

In [26]:
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
cfd2['VB']

FreqDist({'Buy': 3,
          "C'mon": 1,
          'Choose': 1,
          'Compare': 1,
          'Consider': 1,
          'Do': 1,
          'Eliminate': 1,
          'Forget': 1,
          'Hold': 1,
          'Kill': 1,
          'Make': 1,
          'Note': 1,
          'Pick': 1,
          'Put': 2,
          'Remember': 1,
          'Send': 1,
          'Sit': 1,
          'Take': 6,
          'Think': 2,
          'abandon': 2,
          'abide': 1,
          'accept': 2,
          'accommodate': 3,
          'accompany': 1,
          'account': 1,
          'accrue': 1,
          'achieve': 2,
          'acknowledge': 1,
          'acquire': 10,
          'act': 6,
          'add': 5,
          'address': 2,
          'administer': 1,
          'adopt': 1,
          'advance': 1,
          'advertise': 3,
          'affect': 1,
          'afford': 1,
          'agree': 1,
          'aid': 1,
          'aim': 1,
          'alleviate': 1,
          'allow': 5,
          'amend':

## Adjectives and Adverbs

Two other important word classes are adjectives and adverbs. Adjectives describe
nouns, and can be used as modifiers (e.g., large in the large pizza), or as predicates (e.g.,
the pizza is large).

Adjectives describe nouns, and can be used a modifiers(e.g., large in the large pizza), or as predicated(e.g., the pizza is large). English adjectives can have internal structure(e.g., fall+ing in the falling stocks).

Adverbs modify verbs to specify the time, manner, place, or direction of the event described by the verb(e.g., quickly in the stocks fell quickly). Adverbs my also modify adjectives(e.g., really in Mary's teacher was really nice).

## Exploring Tagged Corpora

In [27]:
brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))

[',',
 '.',
 'accomplished',
 'analytically',
 'appear',
 'apt',
 'associated',
 'assuming',
 'became',
 'become',
 'been',
 'began',
 'call',
 'called',
 'carefully',
 'chose',
 'classified',
 'colorful',
 'composed',
 'contain',
 'differed',
 'difficult',
 'encountered',
 'enough',
 'equate',
 'extremely',
 'found',
 'happens',
 'have',
 'ignored',
 'in',
 'involved',
 'more',
 'needed',
 'nightly',
 'observed',
 'of',
 'on',
 'out',
 'quite',
 'represent',
 'responsible',
 'revamped',
 'seclude',
 'set',
 'shortened',
 'sing',
 'sounded',
 'stated',
 'still',
 'sung',
 'supported',
 'than',
 'to',
 'when',
 'work']

In [28]:
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate() 

VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 


In [29]:
# Searching for three-word phrases usign POS tags.
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print(w1, w2, w3)
            
            
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
designed to provide
get to hear
expects to tell
expected to give
prefer to pay
required to obtain
permitted to teach
designed to reduce
Asked to elaborate
got to go
raised to pay
scheduled to go
cut to meet
needed to meet
hastened to add
found to prevent
continue to insist
compelled to make
made to remove
revamped to give
want to risk
appear to spark
fails to consider
plans to call
going to examine
plans to name
come to pass
voted to accept
happens to hold
authorized to adopt
hesitated to prosecute
try to make
decided to spend
taken to preserve
left to preserve
stand to bring
decided to seek
trying to induce
proposing to make
decided to run
directed to investigate
expected to pass
expected to make
expected to encounter
hopes to pass
came to pay
expected to receive
understood to follow
wanted to vote
decide

In [30]:
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist((word.lower(), tag)
                                for (word, tag) in brown_news_tagged)

for word in data.conditions():
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))

best NOUN VERB ADV ADJ
open NOUN VERB ADV ADJ
present NOUN VERB ADV ADJ
that PRON ADP DET ADV
close NOUN VERB ADV ADJ


# Mapping Words to Properties Using Python Dictionaries
## Dictionaries in Python

In [31]:
pos = {}
pos['colorless'] = 'ADJ'
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

In [32]:
pos['ideas']

'N'

In [33]:
pos.keys()

dict_keys(['furiously', 'colorless', 'sleep', 'ideas'])

In [34]:
pos.values()

dict_values(['ADV', 'ADJ', 'V', 'N'])

In [35]:
pos.items()

dict_items([('furiously', 'ADV'), ('colorless', 'ADJ'), ('sleep', 'V'), ('ideas', 'N')])

## Defining Dictionaries

In [36]:
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

In [37]:
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

## Default Dictionaries

In [38]:
frequency = nltk.defaultdict(int)
frequency['a'] = 0

In [39]:
pos = nltk.defaultdict(list)
pos['sleep'] = ['N', 'V']
pos['ideas']

[]

In [40]:
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = list(vocab)[:1000]
mapping = nltk.defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v

alice2 = [mapping[v] for v in alice]
alice2[:100]

['[',
 'UNK',
 'UNK',
 's',
 'UNK',
 'in',
 'UNK',
 'UNK',
 'Lewis',
 'UNK',
 '1865',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'the',
 'UNK',
 'UNK',
 'Hole',
 'UNK',
 'was',
 'UNK',
 'UNK',
 'get',
 'UNK',
 'tired',
 'of',
 'sitting',
 'UNK',
 'her',
 'UNK',
 'UNK',
 'the',
 'bank',
 'UNK',
 'UNK',
 'of',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 ':',
 'UNK',
 'or',
 'twice',
 'she',
 'UNK',
 'UNK',
 'UNK',
 'the',
 'UNK',
 'her',
 'UNK',
 'was',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'or',
 'conversations',
 'in',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'what',
 'UNK',
 'the',
 'UNK',
 'of',
 'a',
 'UNK',
 ",'",
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'or',
 'conversation',
 'UNK',
 'UNK',
 'she',
 'was',
 'UNK',
 'in',
 'her',
 'UNK',
 'mind',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'she',
 'could',
 'UNK']

## Incrementally Updating a Dictionary

In [41]:
counts = nltk.defaultdict(int)

for (word, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1

counts['BEZ']

730

In [42]:
list(counts)

['FW-*',
 'WPS+BEZ',
 'PP$$',
 'ABN-HL',
 'ABN',
 'NPS-TL',
 'JJ',
 'ABL',
 'RB',
 'RB$',
 'NR$-TL',
 'HVD-HL',
 ')-HL',
 'FW-IN+AT-TL',
 'JJ-HL',
 'NP-TL',
 'BER-HL',
 'VBZ',
 'DO*',
 'NP-HL',
 'JJS',
 'BEDZ',
 '.',
 'IN-TL',
 'FW-NN',
 'PPSS',
 'PN$',
 'BER*',
 'JJS-TL',
 'MD+HV',
 'CS-HL',
 'BEZ-HL',
 'NP$',
 'WPO',
 'CD$',
 'JJR-TL',
 'NR-TL',
 'NNS-TL',
 'EX',
 'BED*',
 'WP$',
 'NNS$-TL',
 'DOZ',
 'PPL',
 'CD-TL',
 'FW-IN+NN',
 'IN-HL',
 'FW-VB',
 'VBN-TL',
 'NN',
 'VBG-HL',
 'MD',
 'AT-TL',
 'HV',
 'VBD-TL',
 'AT-HL',
 'VB-TL',
 'VBG-TL',
 'JJ-NC',
 'PPS+HVZ',
 '--',
 'FW-NN-TL',
 'HVN',
 'ABX',
 'MD-TL',
 'OD-TL',
 'JJT-HL',
 'DOD',
 'JJ-TL',
 'BE',
 'NPS$',
 'VB+PPO',
 'AP$',
 'PP$-TL',
 'NPS-HL',
 'WDT',
 'VBN-HL',
 'RP-HL',
 'FW-IN+NN-TL',
 '``',
 'NPS$-TL',
 'NR',
 'NNS-HL',
 'PPS+BEZ',
 'DT-HL',
 'RB+BEZ',
 'DT',
 'OD',
 'NN-NC',
 'BEDZ*',
 'VBG',
 ')',
 'FW-JJ-TL',
 'NNS$',
 'PN-HL',
 'DTI',
 'VB-HL',
 'TO',
 'NP+BEZ',
 'FW-WDT',
 'MD*-HL',
 'QLP',
 'PPS+MD',
 'PPSS+BER',


## Complex Keys and Values

In [43]:
pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1

pos[('NN', 'being')]

defaultdict(int, {'BEG': 7})

## Inverting a Dctionary

In [44]:
counts = nltk.defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
    
[key for (key, value) in counts.items() if value == 32]

['mortal',
 'every',
 'virtue',
 'Him',
 'been',
 'thine',
 'Against',
 'King',
 'There',
 'brought']

# Automatic Tagging
## The Default Tagger

In [45]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max() 

'NN'

In [46]:
tags

['AT',
 'NP-TL',
 'NN-TL',
 'JJ-TL',
 'NN-TL',
 'VBD',
 'NR',
 'AT',
 'NN',
 'IN',
 'NP$',
 'JJ',
 'NN',
 'NN',
 'VBD',
 '``',
 'AT',
 'NN',
 "''",
 'CS',
 'DTI',
 'NNS',
 'VBD',
 'NN',
 '.',
 'AT',
 'NN',
 'RBR',
 'VBD',
 'IN',
 'NN',
 'NNS',
 'CS',
 'AT',
 'NN-TL',
 'JJ-TL',
 'NN-TL',
 ',',
 'WDT',
 'HVD',
 'JJ',
 'NN',
 'IN',
 'AT',
 'NN',
 ',',
 '``',
 'VBZ',
 'AT',
 'NN',
 'CC',
 'NNS',
 'IN',
 'AT',
 'NN-TL',
 'IN-TL',
 'NP-TL',
 "''",
 'IN',
 'AT',
 'NN',
 'IN',
 'WDT',
 'AT',
 'NN',
 'BEDZ',
 'VBN',
 '.',
 'AT',
 'NP',
 'NN',
 'NN',
 'HVD',
 'BEN',
 'VBN',
 'IN',
 'NP-TL',
 'JJ-TL',
 'NN-TL',
 'NN-TL',
 'NP',
 'NP',
 'TO',
 'VB',
 'NNS',
 'IN',
 'JJ',
 '``',
 'NNS',
 "''",
 'IN',
 'AT',
 'JJ',
 'NN',
 'WDT',
 'BEDZ',
 'VBN',
 'IN',
 'NN-TL',
 'NP',
 'NP',
 'NP',
 '.',
 '``',
 'RB',
 'AT',
 'JJ',
 'NN',
 'IN',
 'JJ',
 'NNS',
 'BEDZ',
 'VBN',
 "''",
 ',',
 'AT',
 'NN',
 'VBD',
 ',',
 '``',
 'IN',
 'AT',
 'JJ',
 'NN',
 'IN',
 'AT',
 'NN',
 ',',
 'AT',
 'NN',
 'IN',
 'NNS',
 'CC',


In [47]:
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens) 

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

## The Regular Expression Tagger

In [90]:
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
] 

In [91]:
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news') 

regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3]) 

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

## The Lookup Tagger

In [92]:
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = list(fd.keys())[:100]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents) 

0.004335978678123197

In [93]:
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)

[('``', None),
 ('Only', None),
 ('a', None),
 ('relative', None),
 ('handful', None),
 ('of', None),
 ('such', None),
 ('reports', None),
 ('was', None),
 ('received', None),
 ("''", None),
 (',', None),
 ('the', None),
 ('jury', None),
 ('said', None),
 (',', None),
 ('``', None),
 ('considering', None),
 ('the', None),
 ('widespread', None),
 ('interest', None),
 ('in', None),
 ('the', None),
 ('election', None),
 (',', None),
 ('the', None),
 ('number', None),
 ('of', None),
 ('voters', None),
 ('and', None),
 ('the', None),
 ('size', None),
 ('of', None),
 ('this', None),
 ('city', None),
 ("''", None),
 ('.', None)]

In [56]:
def performance(cfd, wordlist):    
    lt = dict((word, cfd[word].max()) for word in wordlist)    
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))    
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():    
    import pylab    
    words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))    
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))    
    sizes = 2 ** pylab.arange(15)    
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]    
    pylab.plot(sizes, perfs, '-bo')    
    pylab.title('Lookup Tagger Performance with Varying Model Size')    
    pylab.xlabel('Model Size')    
    pylab.ylabel('Performance')    
    pylab.show() 
    
display()

## Evaluation
![lookup tagger](lookup_tagger.PNG)

# N-Gram Tagging
## Unigram Tagging

In [22]:
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007]) 

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

## Separating the Training and Testing Data

In [97]:
size = int(len(brown_tagged_sents) * 0.9)
size

4160

In [98]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents) 

0.8125186883285159

## General N-Gram Tagging
![tagger context](tagger_context.PNG)

In [30]:
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007]) 

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [32]:
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [34]:
bigram_tagger.evaluate(test_sents) 

0.10305990232233629

## Combining Taggers

In [99]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents) 

0.8464068573706768

## Storing Taggers

In [113]:
from pickle import dump
output = open('tagger.pkl', 'wb')
dump(t2, output, 0)
output.close()

In [114]:
from pickle import load
input = open('tagger.pkl', 'rb')
s_tagger = load(input)
input.close()
s_tagger.tag('This is test')

[('T', 'NN'),
 ('h', 'NN'),
 ('i', 'NN'),
 ('s', 'NN'),
 (' ', 'NN'),
 ('i', 'NN'),
 ('s', 'NN'),
 (' ', 'NN'),
 ('t', 'NN'),
 ('e', 'NN'),
 ('s', 'NN'),
 ('t', 'NN')]

## How to Determine the Category of a Word
- Morphological Clues
- Syntactic Clues
- Semantic Clues
- New Words
- Morphology in Part-of-Speech Tagsets