In [1]:
import nltk, re, pprint
from nltk import word_tokenize

# 1. Using a Tagger

In [2]:
text = word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [4]:
# Including homonyms
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [5]:
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')

man time day year moment car world family house country boy child job
state war place girl way case work


In [6]:
text.similar('bought')

made put done said found seen had left given heard set been brought
was got told in that took felt


In [7]:
text.similar('over')

in on to of and for with from at by that into as up out down through
is all about


In [8]:
text.similar('the')

a his this their its her an that our any all one these my in your no
some other and


# 2 Tagged Corpora

## 2.1 Representing Tagged Tokens

In [10]:
tagged_token = nltk.tag.str2tuple('fly/NN')
tagged_token

('fly', 'NN')

In [11]:
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
'''
[nltk.tag.str2tuple(t) for t in sent.split()]

[('The', 'AT'),
 ('grand', 'JJ'),
 ('jury', 'NN'),
 ('commented', 'VBD'),
 ('on', 'IN'),
 ('a', 'AT'),
 ('number', 'NN'),
 ('of', 'IN'),
 ('other', 'AP'),
 ('topics', 'NNS'),
 (',', ','),
 ('AMONG', 'IN'),
 ('them', 'PPO'),
 ('the', 'AT'),
 ('Atlanta', 'NP'),
 ('and', 'CC'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('purchasing', 'VBG'),
 ('departments', 'NNS'),
 ('which', 'WDT'),
 ('it', 'PPS'),
 ('said', 'VBD'),
 ('``', '``'),
 ('ARE', 'BER'),
 ('well', 'QL'),
 ('operated', 'VBN'),
 ('and', 'CC'),
 ('follow', 'VB'),
 ('generally', 'RB'),
 ('accepted', 'VBN'),
 ('practices', 'NNS'),
 ('which', 'WDT'),
 ('inure', 'VB'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('best', 'JJT'),
 ('interest', 'NN'),
 ('of', 'IN'),
 ('both', 'ABX'),
 ('governments', 'NNS'),
 ("''", "''"),
 ('.', '.')]

## 2.2 Reading Tagged Corpora

In [12]:
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [13]:
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [15]:
print(nltk.corpus.nps_chat.tagged_words())

[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]


In [16]:
nltk.corpus.conll2000.tagged_words()

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ...]

In [17]:
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

In [18]:
nltk.corpus.brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [19]:
nltk.corpus.treebank.tagged_words(tagset='universal')

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]

In [23]:
nltk.corpus.sinica_treebank.tagged_words()

[('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]

In [24]:
nltk.corpus.indian.tagged_words()

[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]

In [29]:
# nltk.corpus.mac_morpho.tagged_words()
nltk.corpus.conll2002.tagged_words()
# nltk.corpus.cess_cat.tagged_words()

[('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]

## 2.3 A Universal Part-of-Speech Tagset

In [30]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

## 2.4 Nouns

In [31]:
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceders = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

['NOUN',
 'DET',
 'ADJ',
 'ADP',
 '.',
 'VERB',
 'CONJ',
 'NUM',
 'ADV',
 'PRT',
 'PRON',
 'X']

## 2.5 Verbs

In [33]:
wsj = nltk.corpus.treebank.tagged_words(tagset='universal')
word_tag_fd = nltk.FreqDist(wsj)
[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']

['is',
 'said',
 'was',
 'are',
 'be',
 'has',
 'have',
 'will',
 'says',
 'would',
 'were',
 'had',
 'been',
 'could',
 "'s",
 'can',
 'do',
 'say',
 'make',
 'may',
 'did',
 'rose',
 'made',
 'does',
 'expected',
 'buy',
 'take',
 'get',
 'might',
 'sell',
 'added',
 'sold',
 'help',
 'including',
 'should',
 'reported',
 'according',
 'compared',
 'pay',
 'being',
 'fell',
 'began',
 'based',
 'closed',
 'used',
 'want',
 'see',
 "'re",
 'took',
 'yield',
 'set',
 'offered',
 'priced',
 'noted',
 'come',
 'cut',
 'approved',
 'ended',
 'think',
 'increased',
 'found',
 'become',
 'go',
 'named',
 'trying',
 'declined',
 'proposed',
 'received',
 'growing',
 'held',
 'give',
 'put',
 'came',
 'use',
 'called',
 'paid',
 'continue',
 'going',
 'designed',
 'estimated',
 'raise',
 'making',
 'must',
 'plans',
 'seeking',
 'expects',
 'wo',
 'saying',
 'acquired',
 'increasing',
 'got',
 'fined',
 'owns',
 'gained',
 'trading',
 'included',
 'holding',
 'announced',
 'became',
 'reached

In [34]:
cfd1 = nltk.ConditionalFreqDist(wsj)
cfd1['yield'].most_common()

[('VERB', 28), ('NOUN', 20)]

In [35]:
cfd1['cut'].most_common()

[('VERB', 25), ('NOUN', 3)]

In [36]:
wsj = nltk.corpus.treebank.tagged_words()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
list(cfd2['VBN'])

['raised',
 'adopted',
 'linked',
 'charged',
 'spun',
 'attached',
 'stopped',
 'implemented',
 'alarmed',
 'attributed',
 'agreed',
 'assumed',
 'headlined',
 'engaged',
 'laid',
 'known',
 'afflicted',
 'drafted',
 'set',
 'Guaranteed',
 'fed',
 'finalized',
 'diluted',
 'expelled',
 'invested',
 'extended',
 'accelerated',
 'selected',
 'triggered',
 'expanded',
 'failed',
 'enjoyed',
 'frightened',
 'faded',
 'tracked',
 'spread',
 'purchased',
 'spurred',
 'represented',
 'jumped',
 'targeted',
 'carried',
 'refunded',
 'milked',
 'founded',
 'staid',
 'served',
 'Funded',
 'repaid',
 'reaped',
 'cluttered',
 'experienced',
 'documented',
 'suspended',
 'been',
 'condemned',
 'permitted',
 'subpoenaed',
 'fallen',
 'convicted',
 'opposed',
 'outdistanced',
 'retired',
 'built',
 'answered',
 'skyrocketed',
 'priced',
 'found',
 'advertised',
 'gotten',
 'prolonged',
 'obtained',
 'crowded',
 'removed',
 'clamped',
 'confused',
 'forced',
 'designed',
 'stabbed',
 'scrapped',
 'cl

In [37]:
[ w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]

[]

In [47]:
idx1 = wsj.index(('kicked', 'VBD'))
wsj[idx1-4:idx1+1]

[('While', 'IN'),
 ('program', 'NN'),
 ('trades', 'NNS'),
 ('swiftly', 'RB'),
 ('kicked', 'VBD')]

In [48]:
idx2 = wsj.index(('kicked', 'VBN'))
wsj[idx2-4:idx2+1]

[('head', 'NN'),
 ('of', 'IN'),
 ('state', 'NN'),
 ('has', 'VBZ'),
 ('kicked', 'VBN')]

## 2.7 Unsimplified Tags

In [50]:
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                  if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())

tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print(tag, tagdict[tag])

NN [('year', 137), ('time', 97), ('state', 88), ('week', 85), ('man', 72)]
NN$ [("year's", 13), ("world's", 8), ("state's", 7), ("nation's", 6), ("city's", 6)]
NN$-HL [("Navy's", 1), ("Golf's", 1)]
NN$-TL [("President's", 11), ("Army's", 3), ("League's", 3), ("University's", 3), ("Administration's", 3)]
NN-HL [('sp.', 2), ('condition', 2), ('cut', 2), ('war', 2), ('business', 2)]
NN-NC [('aya', 1), ('eva', 1), ('ova', 1)]
NN-TL [('President', 88), ('House', 68), ('State', 59), ('University', 42), ('City', 41)]
NN-TL-HL [('Fort', 2), ('Street', 1), ('City', 1), ('Grove', 1), ('House', 1)]
NNS [('years', 101), ('members', 69), ('people', 52), ('sales', 51), ('men', 46)]
NNS$ [("children's", 7), ("women's", 5), ("men's", 3), ("janitors'", 3), ("years'", 2)]
NNS$-HL [("Dealers'", 1), ("Idols'", 1)]
NNS$-TL [("Women's", 4), ("States'", 3), ("Giants'", 2), ("Officers'", 1), ("Braves'", 1)]
NNS-HL [('payments', 1), ('strings', 1), ('successes', 1), ('troubles', 1), ('Legislators', 1)]
NNS-TL 

## 2.8 Exploring Tagged Corpora

In [51]:
brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))

[',',
 '.',
 'accomplished',
 'analytically',
 'appear',
 'apt',
 'associated',
 'assuming',
 'became',
 'become',
 'been',
 'began',
 'call',
 'called',
 'carefully',
 'chose',
 'classified',
 'colorful',
 'composed',
 'contain',
 'differed',
 'difficult',
 'encountered',
 'enough',
 'equate',
 'extremely',
 'found',
 'happens',
 'have',
 'ignored',
 'in',
 'involved',
 'more',
 'needed',
 'nightly',
 'observed',
 'of',
 'on',
 'out',
 'quite',
 'represent',
 'responsible',
 'revamped',
 'seclude',
 'set',
 'shortened',
 'sing',
 'sounded',
 'stated',
 'still',
 'sung',
 'supported',
 'than',
 'to',
 'when',
 'work']

In [53]:
brown_lrnd_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()

VERB  ADV  ADP  ADJ    .  PRT 
  37    8    7    6    4    2 


In [54]:
from nltk.corpus import brown
def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):
            print(w1, w2, w3)
            
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
designed to provide
get to hear
expects to tell
expected to give
prefer to pay
required to obtain
permitted to teach
designed to reduce
Asked to elaborate
got to go
raised to pay
scheduled to go
cut to meet
needed to meet
hastened to add
found to prevent
continue to insist
compelled to make
made to remove
revamped to give
want to risk
appear to spark
fails to consider
plans to call
going to examine
plans to name
come to pass
voted to accept
happens to hold
authorized to adopt
hesitated to prosecute
try to make
decided to spend
taken to preserve
left to preserve
stand to bring
decided to seek
trying to induce
proposing to make
decided to run
directed to investigate
expected to pass
expected to make
expected to encounter
hopes to pass
came to pay
expected to receive
understood to follow
wanted to vote
decide

seem to lend
allowed to mix
starts to swing
seems to stop
ordered to hold
fitted to endure
trying to follow
threatening to explode
rides to break
managed to make
dancing to display
hasten to report
seems to come
reassuring to see
profess to know
help to explain
forced to rely
attempt to homogenize
like to put
continue to try
try to come
seem to deal
decide to let
tries to take
trying to forget
trying to acquire
threatens to linger
decided to forego
managed to hold
intended to illustrate
tried to get
learn to live
helping to move
striving to hold
choose to work
tried to see
trying to create
made to appear
failed to make
seemed to deserve
managed to mix
want to hurt
liked to nip
manages to acquire
widened to enchant
serve to contradict
dare to experiment
tried to humanize
tries to preserve
helps to rebut
seems to make
began to play
cares to remember
serve to show
want to collect
designed to invite
attempt to make
designed to belong
seem to come
wanted to identify
neglect to cultivate
cho

failed to post
refused to permit
encouraged to beget
obliged to obey
united to push
try to oppose
made to impose
wanted to clarify
proposed to sail
determined to catch
forced to turn
seemed to sense
seemed to know
tried to brush
turning to repeat
tried to persuade
wanted to turn
preparing to pacify
forced to retreat
contracted to supply
forced to leave
offering to bring
attempt to bring
decided to cast
liked to tease
going to buy
com to sea
drilled to follow
born to command
come to recognize
allowed to account
created to fan
come to mean
trying to make
refusing to keep
wishes to discuss
want to ask
want to tap
said to use
employed to see
shoot to kill
refused to touch
threatened to shoot
said to let
begin to roll
held to assure
going to make
managed to get
wanted to play
prepared to counterattack
failed to rally
tried to rape
refused to speak
called to look
refused to say
mean to suggest
prepared to carry
designed to overthrow
trying to put
needed to work
disposed to exploit
fail to se

tailored to fit
obliged to describe
tried to block
chosen to edit
plotted to take
tried to halt
wanted to die
returned to make
like to believe
bother to look
used to go
seemed to thaw
came to give
wanted to see
used to look
meant to help
like to straighten
hope to give
bark to let
dash to get
tried to talk
decided to leave
used to tell
continue to reflect
appear to preach
intend to let
need to test
learned to meet
said to give
serves to reduce
thought to provide
tends to give
wish to deny
expect to find
seek to capture
allowed to claim
seeks to recapture
determined to bulldoze
sought to run
needed to make
hurry to catch
planned to bolt
fit to nominate
intend to support
refusing to abandon
begun to parallel
help to give
fail to convey
tends to lose
aimed to write
granted to serve
tends to underestimate
permitted to cross
demanding to know
obliged to remain
delighted to make
seem to shake
assigned to check
volunteered to advance
went to hurry
refused to notice
began to select
began to sp

begun to translate
besieged to serve
help to create
assumed to originate
used to describe
expected to cause
taken to study
required to ensure
used to measure
used to start
started to strike
allowed to pull
used to measure
generalized to include
eliminated to obtain
extended to include
used to derive
adjusted to minimize
required to make
taken to prevent
attempts to present
evaporate to leave
tend to stick
helps to float
acts to remove
tend to accelerate
appear to offer
allowed to take
allowed to distil
undertaken to see
allowed to stand
thought to contribute
made to characterize
expect to find
known to cause
allowed to stand
allowed to stand
used to test
allowed to clot
need to make
try to key
manage to keep
seem to prefer
continues to add
begin to play
begin to appear
begin to dig
cease to lay
manages to slip
seem to recognize
like to burrow
like to think
estimated to contain
love to visit
fail to show
attempts to weigh
given to complete
required to reach
prove to belong
varied to all

used to select
used to select
used to specify
used to specify
expected to serve
used to eliminate
designed to handle
made to take
tended to float
began to decrease
began to build
used to provide
tend to ensure
seems to strive
stated to emphasize
channeled to produce
expected to replace
developed to attack
needed to translate
needed to make
used to deny
required to localize
found to protect
used to demonstrate
shown to undergo
extended to include
shown to undergo
allowed to go
used to classify
found to keep
made to group
continued to supply
serves to inactivate
needed to inactivate
thought to offer
serves to extend
served to extend
required to accomplish
found to compare
required to remove
required to cut
tends to push
required to cut
seen to correlate
beginning to advance
managed to grow
started to open
seem to justify
claimed to give
used to denote
required to cause
combined to attain
used to slit
hope to compete
helped to alleviate
calculated to expand
lowered to permit
appears to pr

trying to escape
began to thrash
get to work
come to work
want to see
wanted to get
want to go
managed to swallow
threatened to fire
happen to see
going to eat
began to weep
happens to ask
want to rent
Try to imagine
promised to pay
planning to remarry
determined to get
seemed to swell
surprised to meet
trying to tell
known to run
seem to preserve
got to understand
got to know
came to ask
asked to see
began to abuse
hurry to hang
seemed to mind
started to cross
started to curse
offering to buy
began to feel
seem to wink
continued to stare
beginning to feel
like to listen
want to study
relieved to see
seemed to notice
decided to stay
startled to see
began to feel
bothered to ask
beginning to take
began to wish
want to encourage
began to talk
paused to moisten
went to join
got to go
trying to get
want to know
want to take
want to leave
got to assume
got to keep
burning to light
decided to leave
set to stay
going to kill
go to bat
stopped to cherish
struggling to bridge
learning to think


started to carry
began to look
going to happen
began to walk
trying to say
trying to say
started to say
wants to take
started to take
began to fascinate
left to spend
seemed to work
work to grow
trying to talk
lied to shorten
trying to make
wanted to force
refused to take
seemed to please
grew to depend
meant to tell
trying to pull
seemed to shiver
trying to remember
worked to recall
wished to call
seemed to stare
began to tremble
refusing to think
refusing to think
hoping to frighten
wanted to run
began to ache
began to bother
tried to take
wanted to kill
stopped to see
going to tell
going to push
wanted to slap
started to type
helped to build
refused to drive
wanted to go
likes to play
like to rise
tried to push
trying to run
tried to make
used to say
come to exist
fit to put
stoop to lift
wanted to draw
going to pick
intended to wait
determined to foil
strode to answer
trusted to carry
seemed to help
chose to come
tried to ignore
liked to break
began to aid
going to tear
like to exh

In [56]:
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist((word.lower(), tag)
                             for (word, tag) in brown_news_tagged)
for word in sorted(data.conditions()):
    if len(data[word]) > 3:
        tags = [tag for (tag, _) in data[word].most_common()]
        print(word, ', '.join(tags))

best ADJ, NOUN, VERB, ADV
close ADV, ADJ, VERB, NOUN
open ADJ, VERB, NOUN, ADV
present ADJ, ADV, VERB, NOUN
that ADP, DET, PRON, ADV


# 3 Mapping Words to Properties Using Python Dictionaries

## 3.2 Dictionaries in Python

In [57]:
pos = {}
pos

{}

In [59]:
pos['colorless'] = 'ADJ'
pos

{'colorless': 'ADJ'}

In [60]:
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
pos

{'colorless': 'ADJ', 'furiously': 'ADV', 'ideas': 'N', 'sleep': 'V'}

In [61]:
pos['ideas']

'N'

In [62]:
list(pos)

['ideas', 'colorless', 'furiously', 'sleep']

In [63]:
sorted(pos)

['colorless', 'furiously', 'ideas', 'sleep']

In [64]:
[w for w in pos if w.endswith('s')]

['ideas', 'colorless']

In [65]:
for word in sorted(pos):
    print(word + ".", pos[word])

colorless. ADJ
furiously. ADV
ideas. N
sleep. V


In [66]:
list(pos.keys())

['ideas', 'colorless', 'furiously', 'sleep']

In [67]:
list(pos.values())

['N', 'ADJ', 'ADV', 'V']

In [68]:
list(pos.items())

[('ideas', 'N'), ('colorless', 'ADJ'), ('furiously', 'ADV'), ('sleep', 'V')]

In [69]:
for key, val in sorted(pos.items()):
    print(key + ".", val)

colorless. ADJ
furiously. ADV
ideas. N
sleep. V


In [70]:
pos['sleep'] = 'V'
print(pos['sleep'])
pos['sleep'] = 'N'
print(pos['sleep'])

V
N


## 3.3 Defining Dictionaries

In [72]:
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
# pos = {['ideas', 'blogs', 'adventures']: 'N'}

## 3.4 Default Dictionaries

In [73]:
from collections import defaultdict
frequency = defaultdict(int)
frequency['colorless'] = 4
frequency['ideas']

0

In [75]:
pos = defaultdict(list)
pos['sleep'] = ['NOUN', 'VERB']
pos['ideas']

[]

In [76]:
pos = defaultdict(lambda: 'NOUN')
pos['colorless'] = 'ADJ'
pos['blog']

'NOUN'

In [77]:
list(pos.items())

[('colorless', 'ADJ'), ('blog', 'NOUN')]

In [78]:
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for (word, _) in vocab.most_common(1000)]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
    
alice2 = [mapping[v] for v in alice]
alice2[:100]

['UNK',
 'Alice',
 "'",
 's',
 'UNK',
 'in',
 'UNK',
 'by',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'CHAPTER',
 'I',
 '.',
 'UNK',
 'the',
 'Rabbit',
 '-',
 'UNK',
 'Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'UNK',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'UNK',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'UNK',
 ',',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'UNK',
 'in',
 'it',
 ',',
 "'",
 'and',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 'book',
 ",'",
 'thought',
 'Alice',
 "'",
 'without',
 'pictures',
 'or',
 'conversation',
 "?'",
 'So',
 'she',
 'was',
 'UNK',
 'in',
 'her',
 'own',
 'mind',
 '(',
 'as',
 'well',
 'as',
 'she',
 'could',
 ',']

In [79]:
len(set(alice2))

1001

## 3.5 Incrementally Updating a Dictionary

In [80]:
from collections import defaultdict
counts = defaultdict(int)
from nltk.corpus import brown
for (word, tag) in brown.tagged_words(categories='news', tagset='universal'):
    counts[tag] += 1
counts['NOUN']

30654

In [82]:
sorted(counts)

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

In [83]:
from operator import itemgetter
sorted(counts.items(), key=itemgetter(1), reverse=True)

[('NOUN', 30654),
 ('VERB', 14399),
 ('ADP', 12355),
 ('.', 11928),
 ('DET', 11389),
 ('ADJ', 6706),
 ('ADV', 3349),
 ('CONJ', 2717),
 ('PRON', 2535),
 ('PRT', 2264),
 ('NUM', 2166),
 ('X', 92)]

In [84]:
pair = ('NP', 8336)
pair[1]

8336

In [86]:
itemgetter(0)(pair)

'NP'

In [87]:
last_letters = defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words: 
    key = word[-2:]
    last_letters[key].append(word)
    
last_letters['ly']

['abactinally',
 'abandonedly',
 'abasedly',
 'abashedly',
 'abashlessly',
 'abbreviately',
 'abdominally',
 'abhorrently',
 'abidingly',
 'abiogenetically',
 'abiologically',
 'abjectly',
 'ableptically',
 'ably',
 'abnormally',
 'abominably',
 'aborally',
 'aboriginally',
 'abortively',
 'aboundingly',
 'abridgedly',
 'abruptedly',
 'abruptly',
 'abscondedly',
 'absently',
 'absentmindedly',
 'absolutely',
 'absolutistically',
 'absorbedly',
 'absorbingly',
 'absorptively',
 'abstemiously',
 'abstinently',
 'abstractedly',
 'abstractively',
 'abstractly',
 'abstrusely',
 'absurdly',
 'abundantly',
 'abusedly',
 'abusefully',
 'abusively',
 'abysmally',
 'academically',
 'acceleratedly',
 'accentually',
 'acceptably',
 'acceptedly',
 'accessarily',
 'accessibly',
 'accessively',
 'accessorily',
 'accidentally',
 'accidently',
 'accommodately',
 'accommodatingly',
 'accordantly',
 'accordingly',
 'accountably',
 'accumulatively',
 'accurately',
 'accursedly',
 'accusably',
 'accusative

In [88]:
last_letters['zy']

['blazy',
 'bleezy',
 'blowzy',
 'boozy',
 'breezy',
 'bronzy',
 'buzzy',
 'Chazy',
 'cozy',
 'crazy',
 'dazy',
 'dizzy',
 'dozy',
 'enfrenzy',
 'fezzy',
 'fizzy',
 'floozy',
 'fozy',
 'franzy',
 'frenzy',
 'friezy',
 'frizzy',
 'frowzy',
 'furzy',
 'fuzzy',
 'gauzy',
 'gazy',
 'glazy',
 'groszy',
 'hazy',
 'heezy',
 'Izzy',
 'jazzy',
 'Jozy',
 'lawzy',
 'lazy',
 'mazy',
 'mizzy',
 'muzzy',
 'nizy',
 'oozy',
 'quartzy',
 'quizzy',
 'refrenzy',
 'ritzy',
 'Shortzy',
 'sizy',
 'sleazy',
 'sneezy',
 'snoozy',
 'squeezy',
 'Suzy',
 'tanzy',
 'tizzy',
 'topazy',
 'trotcozy',
 'twazzy',
 'unbreezy',
 'unfrizzy',
 'wheezy',
 'woozy',
 'wuzzy',
 'yezzy']

In [89]:
anagrams = defaultdict(list)
for word in words: 
    key = ''.join(sorted(word))
    anagrams[key].append(word)

anagrams['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

In [90]:
anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
anagrams['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

## 3.6 Complex Keys and Values

In [91]:
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1

pos[('DET', 'right')]

defaultdict(int, {'ADJ': 11, 'NOUN': 5})