In [None]:
import nltk
# Fetch every NLTK resource the notebook relies on.
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases; 'punkt' is kept so older installations keep working too.
for resource in ('brown', 'punkt', 'punkt_tab', 'universal_tagset'):
    nltk.download(resource)

# (word, tag) pairs for the 'news' category, with tags mapped to the universal tagset
brownwords = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')

In [None]:
# number of (word, tag) tokens in the 'news' category
len(brownwords)

In [None]:
# spot-check a single item; each item unpacks as a (word, tag) pair
brownwords[5425]

In [None]:
# tally how often each tag occurs, ignoring the words themselves
fd = nltk.FreqDist(tag for _, tag in brownwords)

In [None]:
# fd is keyed by tag, so indexing with a tag gives that tag's token count
print('Number of nouns:',fd['NOUN'])
print('Number of adjectives:',fd['ADJ'])

In [None]:
# we can create our own tagger; start with some baselines!
# baseline: a default tagger that is configured with the single tag 'NOUN'
bad_tagger = nltk.DefaultTagger('NOUN')
# sample sentence, tokenized once here and reused when trying the rule-based tagger
text = nltk.word_tokenize('It could be that it rained, or is raining heavily.')
bad_tagger.tag(text)

In [None]:
# nltk can evaluate the tagger, but we need it represented as tagged sentences:
brownsentences = nltk.corpus.brown.tagged_sents(categories='news', tagset='universal')
# NOTE(review): newer NLTK releases appear to deprecate .evaluate() in favour of
# .accuracy() — confirm against the installed version before switching.
bad_tagger.evaluate(brownsentences)

In [None]:
# rule based tagger!
# Each entry is (regular expression, tag). The tags must be spelled exactly as
# the universal tagset spells them, otherwise evaluation against the corpus can
# never give credit for those tokens.
patterns = [
    (r'.*ly$', 'ADV'),
    (r'.*ing$', 'VERB'),
    (r'.*ed$', 'VERB'),
    (r'.*ould$', 'VERB'),
    (r'^[.]$', '.'),     # the universal tagset labels punctuation '.', not 'PUNCT'
    (r'^[,]$', '.'),
    (r'.*$', 'NOUN'),    # catch-all default; it matches everything, so it stays last
]
rule_tagger = nltk.RegexpTagger(patterns)
rule_tagger.tag(text)

In [None]:
# score the regexp tagger on the tagged news sentences
rule_tagger.evaluate(brownsentences)

In [None]:
# built from the (word, tag) pairs, so lookups below take the form cfd[word][tag]
cfd = nltk.ConditionalFreqDist(brownwords)

In [None]:
# count stored for the word 'the' with the tag DET
print(cfd['the']['DET'])

In [None]:
# every tag recorded for the word 'pay'
print(cfd['pay'].keys())

In [None]:
# compare how often 'pay' was tagged NOUN versus VERB
print(cfd['pay']['NOUN'])
print(cfd['pay']['VERB'])

In [None]:
# distinct word forms occurring in the 'news' category
justwords = nltk.FreqDist(nltk.corpus.brown.words(categories='news')).keys()
# for each word, keep the tag that cfd reports as its maximum
best_tags = {w: cfd[w].max() for w in justwords}

In [None]:
# look up the stored best tag for two sample words
print('Most frequent tag for "the":', best_tags['the'])
print('Most frequent tag for "pay":', best_tags['pay'])

In [None]:
# train on the first 90% of the news sentences and hold out the remainder for testing
split = int(0.9 * len(brownsentences))
train, test = brownsentences[:split], brownsentences[split:]
unigram_tagger = nltk.UnigramTagger(train)
unigram_tagger.evaluate(test)

In [None]:
# same unigram training data, but with the regexp rule tagger supplied as backoff
better_tagger = nltk.UnigramTagger(train, backoff=rule_tagger)
better_tagger.evaluate(test)

In [None]:
# swap every training (word, tag) pair to (tag, word) so lookups read wordgiventag[tag][word]
flipped = [(tag, word) for sentence in train for (word, tag) in sentence]
wordgiventag = nltk.ConditionalFreqDist(flipped)
# same lookups as with cfd earlier, but restricted to the training data:
print(wordgiventag['DET']['the'])
print(wordgiventag['VERB']['pay'])

In [None]:
# this is P(w|t), unsmoothed!
def P(w,t):
    """Return the unsmoothed emission probability P(w|t).

    Computed as count(t, w) / count(t) from the module-level ``wordgiventag``
    table.

    Args:
        w: the word.
        t: the tag.

    Returns:
        A float. It is 0.0 when ``w`` was never recorded with ``t``, and also
        0.0 when ``t`` has no recorded words at all (previously that case
        raised ZeroDivisionError).
    """
    counts_for_tag = wordgiventag[t]
    total = counts_for_tag.N()
    if total == 0:
        # nothing recorded for this tag: no evidence, so probability 0
        return 0.0
    return counts_for_tag[w] / total

# spot-check a few word-given-tag probabilities
print('P(the|DET) =', P('the','DET'))
print('P(is|VERB) =', P('is','VERB'))
print('P(pay|VERB) =', P('pay','VERB'))

In [None]:
# adjacent tag pairs, built one training sentence at a time so no pair spans two sentences
tag_bigrams = [pair for sent in train for pair in nltk.bigrams([tag for _, tag in sent])]

In [None]:
# show only the first ten pairs rather than the whole list
tag_bigrams[:10]

In [None]:
# lookups read tag_bigram_counts[first_tag][second_tag]
tag_bigram_counts = nltk.ConditionalFreqDist(tag_bigrams)
# this is count of noun tags following adjective tags (normal order in English)
print(tag_bigram_counts['ADJ']['NOUN'])
# this is count of adjective tags following noun tags
print(tag_bigram_counts['NOUN']['ADJ'])

In [None]:
# this is P(t2|t1), unsmoothed again!
def tagP(t2,t1):
    """Return the unsmoothed transition probability P(t2|t1).

    Computed as count(t1 followed by t2) / count(t1 followed by anything)
    from the module-level ``tag_bigram_counts`` table.

    Args:
        t2: the tag being predicted.
        t1: the preceding tag.

    Returns:
        A float. It is 0.0 when the pair was never recorded, and also 0.0 when
        ``t1`` has no recorded successors at all (previously that case raised
        ZeroDivisionError).
    """
    successors = tag_bigram_counts[t1]
    total = successors.N()
    if total == 0:
        # t1 was never followed by another tag: no evidence, so probability 0
        return 0.0
    return successors[t2] / total
# spot-check two tag-given-previous-tag probabilities
print('P(NOUN|ADJ) =',tagP('NOUN','ADJ'))
print('P(NOUN|DET) =',tagP('NOUN','DET'))

In [None]:
# count the tag of the first token of every training sentence
sentence_start = nltk.FreqDist(sent[0][1] for sent in train)
def initP(t):
    """Return the unsmoothed probability that a sentence starts with tag ``t``.

    This is the count stored for ``t`` in ``sentence_start`` divided by the
    total count held by ``sentence_start``.
    """
    starts_with_t = sentence_start[t]
    all_starts = sentence_start.N()
    return starts_with_t / all_starts
# spot-check the sentence-initial probability of a few tags
print('initP(DET) =', initP('DET'))
print('initP(PRON) =', initP('PRON'))  # he, she, it, etc.
print('initP(NOUN) =', initP('NOUN'))
print('initP(VERB) =', initP('VERB'))

In [None]:
def argmax(V,tag_list,t,i):
    """Find the best predecessor tag for tag ``t`` at position ``i``.

    Every candidate ``s`` in ``tag_list`` is scored as
    ``V[(s, i-1)] * tagP(t, s)``.

    Returns:
        A ``(tag, score)`` tuple for the highest-scoring candidate; on ties the
        earliest candidate in ``tag_list`` wins. If no candidate scores above
        -1 (for example when ``tag_list`` is empty) the result is ``(None, -1)``.
    """
    best_prev, best_score = None, -1
    for prev in tag_list:
        score = V[(prev, i - 1)] * tagP(t, prev)
        if score > best_score:
            best_prev, best_score = prev, score
    return (best_prev, best_score)

In [None]:
def printV(sentence,tag_list,V,B):
    """Print the non-zero cells of the table ``V``, one sentence position at a time.

    For each position a header line ``i=<index> [<word>]`` is printed, followed
    by one indented ``<tag>=<value>`` line per tag whose entry in ``V`` is
    non-zero. From the second position on, each such line also names the
    backpointer stored in ``B`` for that cell.
    """
    for pos, word in enumerate(sentence):
        print('i=' + str(pos) + ' [' + word + ']')
        for tag in tag_list:
            score = V[(tag, pos)]
            if score == 0:
                # zero cells are left out of the printout
                continue
            line = '  ' + tag + '=' + str(score)
            if pos > 0:
                line += ' (from ' + B[(tag, pos)] + ')'
            print(line)
    

In [None]:
def viterbi(sentence):
    """Fill in and print the Viterbi table for a tokenized sentence.

    ``V[(t, i)]`` holds the score of the best tag sequence that ends with tag
    ``t`` at position ``i``; ``B[(t, i)]`` holds the predecessor tag that
    achieved it. The first column is ``initP(t) * P(word, t)``; every later
    column multiplies the best predecessor score found by ``argmax`` with
    ``P(word, t)``. The finished table is printed with ``printV``.

    Args:
        sentence: a sequence of word strings.

    Returns:
        None. An empty sentence prints nothing (previously it raised
        IndexError on ``sentence[0]``).
    """
    if len(sentence) == 0:
        # nothing to tag, and there is no first word to initialise the table with
        return
    V = dict()    # keys are (t,i) where t is a tag (row label) and i is position in sentence (column label)
    B = dict()    # same keys as V; this stores the "backpointers" to remember best tag sequence
    # NOTE(review): the tag set is taken from sentence_start, so a tag that never
    # begins a training sentence is never considered — confirm this is intended.
    tag_list = sentence_start.keys()
    # first column: start probability times emission probability of the first word
    for t in tag_list:
        V[(t,0)] = initP(t)*P(sentence[0],t)
    # remaining columns: best predecessor score times emission probability
    for i in range(1,len(sentence)):
        for t in tag_list:
            best_prev, best_score = argmax(V,tag_list,t,i)
            B[(t,i)] = best_prev
            V[(t,i)] = best_score*P(sentence[i],t)
    printV(sentence,tag_list,V,B)

In [None]:
# note how best tag for past changes to (correct) ADP when we see "the" at i=8
# (every row printed for i>0 also names the predecessor tag it came from)
viterbi('like one little flat near work well past the last right turn'.split())

In [None]:
# a shorter sentence that ends in the words 'little' and 'flat'
viterbi('the beer was a little flat'.split())

In [None]:
# '... like high interest rates' with 'difficulties' as the first word
viterbi('difficulties like high interest rates'.split())

In [None]:
# '... like high interest rates' with 'banks' as the first word
viterbi('banks like high interest rates'.split())