### Setup: in the NLTK downloader, get "brown" from Corpora, and "tagsets" and "averaged_perceptron_tagger" from Models

In [1]:
# import nltk

# nltk.download() # corpora, brown, ptb, models tagsets

### The string must be tokenized first

In [2]:
from nltk import pos_tag,word_tokenize, help
        
# Deliberate misuse: pos_tag is handed the raw string instead of a token list,
# so every single character (spaces included) is tagged as if it were a word.
untokenized_tags = pos_tag('The man ate the apple')
print(untokenized_tags)

[('T', 'NNP'), ('h', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('m', 'VBZ'), ('a', 'DT'), ('n', 'JJ'), (' ', 'NN'), ('a', 'DT'), ('t', 'NN'), ('e', 'NN'), (' ', 'NNP'), ('t', 'NN'), ('h', 'NN'), ('e', 'NN'), (' ', 'VBZ'), ('a', 'DT'), ('p', 'NN'), ('p', 'NN'), ('l', 'NN'), ('e', 'NN')]


### Using the built-in pos_tag function

In [3]:
# 1)a  Tokenize first, then tag: each word is now handled as one token.
cake_tokens = word_tokenize('The boy ate the delicious cake')
print(pos_tag(cake_tokens))

# 1)b  Print the description of a tag together with example words.
# NOTE(review): the tags printed by pos_tag above (e.g. DT for "The") look like
# Penn Treebank tags, while this call looks the tag up in the Brown tagset --
# help.upenn_tagset('DT') may be the intended lookup; confirm.
help.brown_tagset('DT')

[('The', 'DT'), ('boy', 'NN'), ('ate', 'VB'), ('the', 'DT'), ('delicious', 'JJ'), ('cake', 'NN')]
DT: determiner/pronoun, singular
    this each another that 'nother


### Concatenating two lists

In [4]:
l1 = [1, 2, 3]
l2 = [3, 4, 5]

# Unpacking both lists into a new list is equivalent to l1 + l2;
# neither input list is modified.
l = [*l1, *l2]
print(l)

[1, 2, 3, 3, 4, 5]


### Ex. 2

In [10]:
import os
from nltk import FreqDist
from collections import defaultdict

# All (word, tag) pairs collected from the sampled documents.
tagged = []

# Walk the document tree and tag up to 10 files from every directory visited
# (the slice is applied per directory, not to the corpus as a whole).
for root, dirs, files in os.walk('datasets/simple-wiki/single-docs'):
    for filename in files[:10]:
        # Decode explicitly as UTF-8: the articles contain non-ASCII text and the
        # platform default encoding is not UTF-8 on every system.
        with open(os.path.join(root, filename), encoding='utf-8') as f:
            tagged.extend(pos_tag(word_tokenize(f.read())))

# Print the distinct words whose tag starts with NN
print({word for (word, tag) in tagged if tag.startswith('NN')})

{'Rhetoric', 'Farmville', 'fiction', 'District', 'Uvea', 'Supernatural', 'dividing', 'child', 'Often', 'Dracula', 'Freud', 'river', 'paper', 'coincidence', 'hour', 'miles', 'tragedy', 'language', 'purpose', 'November', 'Hurricane', 'television', 'Futuna', 'group', 'activity', 'radio', 'history', 'deals', 'Morales', 'causes', 'decision', 'events', 'work', 'Wallis', 'writing', 'Reality', 'perihelion', 'antagonist', 'climate', 'Nine', 'facts', 'circle', 'list', 'Joan', 'world', 'point', 'radiation', 'struggle', 'Prince', 'pity', 'Person', 'places', 'ability', 'Vishal', 'August', 'things', 'superego', 'system', 'Christabel', 'names', 'Cauca', 'danger', 'June', 'action', 'department', 's/he', 'id', 'Frankenstein', 'plot', 'acts', 'stage', 'confusion', 'repeat', 'types', 'cycles', 'aphelion', 'width', 'peoples', 'people', 'fictum', 'movies', 'opposite', 'Judah', 'Wuthering', 'storm', 'Freytag', 'psyche', 'One', 'dates', 'somebody', 'Brontë', 'year', 'Conflict', 'person', 'district', 'Pakista

In [6]:
test_dict = {}
# test_dict['plays'] += 1  # Uncommenting this raises a KeyError: the key has not been initialized

# With a plain dict the key must be initialized before it can be incremented;
# setdefault only stores 0 when 'plays' is not already present.
test_dict.setdefault('plays', 0)

### Using a default dict, no need to initialize key

In [7]:
from collections import defaultdict

# Missing keys are created on first access with int() == 0, so += works right away.
test_dict = defaultdict(int)
test_dict['plays'] += 1

# 'plays' was incremented once; merely reading 'eats' creates it with the value 0.
for key in ('plays', 'eats'):
    print(test_dict[key])

1
0


### Dictionary of FreqDist objects, to count the parts of speech assigned to every word

In [8]:
from nltk import FreqDist
# Each new word key starts out with its own empty FreqDist of tag counts.
tags = defaultdict(FreqDist)

for tag, count in (('VB', 3), ('NN', 2), ('DT', 1)):
    tags['play'][tag] += count

top_two = tags['play'].most_common(2)
print(top_two)        # The 2 most common (tag, count) pairs
print(top_two[0])     # The most common pair, in the form (tag, count)
print(top_two[0][0])  # Just the tag of the most common pair

[('VB', 3), ('NN', 2)]
('VB', 3)
VB


### Loading the brown corpus

In [11]:
from nltk.corpus import brown

# 3)a  Tagged Brown corpus: a list of sentences, each a list of (word, tag) pairs.
all_tagged = brown.tagged_sents()
print(all_tagged[:2])  # The first two tagged sentences
print('Number of sents in brown corpus', len(all_tagged))
token_count = sum(len(sent) for sent in all_tagged)
print('Number of tokens in brown corpus', token_count)

# 3)b  First 50000 sentences for training, everything after them for testing.
train, test = all_tagged[:50000], all_tagged[50000:]


[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

### Building the unigram dictionary

In [12]:
# Target shape of "words": {'play': {'vb': 200, 'nn': 100}, 'eat': {'vb': 100}}

# 3)c  Count, for every word in the training sentences, how often it carries each tag.
words = defaultdict(FreqDist)

for sent in train:
    for word, pos in sent:
        words[word][pos] += 1  # one more occurrence of this word with this tag

# 3)d  Map every word to its single most frequent tag.
# (Despite the name, nothing is sorted here: it is a plain word -> tag lookup.)
sorted_words = {
    word: tag_counts.most_common(1)[0][0]
    for word, tag_counts in words.items()
}

print(list(sorted_words.items())[:10])

[('Rexroth', 'NP'), ('bitch', 'NN'), ('unsuspecting', 'JJ'), ('dignitaries', 'NNS'), ('Tractor', 'NN'), ('petty', 'JJ'), ('sumac', 'NN'), ('Erhart', 'NP'), ('Peiping', 'NP'), ('groundless', 'JJ')]


### 3) e,f

In [13]:
correct = 0
total = 0
for sent in test:
    # Every token of the test set counts towards the total.
    total += len(sent)
    # A prediction is correct only when the word is known to the unigram
    # dictionary AND its stored tag equals the true tag; unknown words count as wrong.
    correct += sum(
        1
        for word, pos in sent
        if word in sorted_words and sorted_words[word] == pos
    )

print('Unigram model accuracy:', correct/total)

Unigram model accuracy: 0.8835427798667458


### Build a DefaultTagger that predicts every part of speech as a noun

In [14]:
from nltk import DefaultTagger

# Baseline: a tagger that assigns the tag NN to every token.
default_tagger = DefaultTagger('NN')
print(default_tagger.evaluate(test))  # score on the held-out test sentences

apple_tokens = word_tokenize('The boy ate the apple')
print(default_tagger.tag(apple_tokens))

0.1091925588759153
[('The', 'NN'), ('boy', 'NN'), ('ate', 'NN'), ('the', 'NN'), ('apple', 'NN')]


### Unigram Tagger

In [19]:
from nltk import UnigramTagger

unigram_tagger = UnigramTagger(train)

print(unigram_tagger.evaluate(test))

for sentence in (
    'He watched the play',          # "play" is tagged as a verb (VB)
    'The kids play in the garden',
    'I saw a green spider',
    'Salah scored the goal',        # "Salah" is tagged None: not found in the training sentences
):
    print(unigram_tagger.tag(word_tokenize(sentence)))

0.8835427798667458
[('He', 'PPS'), ('watched', 'VBD'), ('the', 'AT'), ('play', 'VB')]
[('The', 'AT'), ('kids', 'NNS'), ('play', 'VB'), ('in', 'IN'), ('the', 'AT'), ('garden', 'NN')]
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', 'NN')]
[('Salah', None), ('scored', 'VBD'), ('the', 'AT'), ('goal', 'NN')]


### Bigram Tagger

In [16]:
from nltk import BigramTagger

bigram_tagger = BigramTagger(train)
print(bigram_tagger.evaluate(test))

for sentence in (
    'He watched the play',          # "play" is now correctly tagged as a noun (NN)
    'The kids play in the garden',
    'I saw a green spider',         # "spider" is tagged None: this bigram context was not found in training
):
    print(bigram_tagger.tag(word_tokenize(sentence)))

0.34665875057721485
[('He', 'PPS'), ('watched', 'VBD'), ('the', 'AT'), ('play', 'NN')]
[('The', 'AT'), ('kids', 'NNS'), ('play', 'VB'), ('in', 'IN'), ('the', 'AT'), ('garden', 'NN')]
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', None)]


### Backoff tagger, from Unigram to DefaultTagger

In [20]:
# Unigram tagger that hands words it cannot tag over to default_tagger.
backoff_unigram_tagger = UnigramTagger(train, backoff=default_tagger)
print(backoff_unigram_tagger.evaluate(test))

# "Salah" is not found by the unigram model, so it backs off to the default tag.
salah_tokens = word_tokenize('Salah scored the goal')
print(backoff_unigram_tagger.tag(salah_tokens))

0.8901230292235636
[('Salah', 'NN'), ('scored', 'VBD'), ('the', 'AT'), ('goal', 'NN')]


### From Bigram to Unigram (which backs off to DefaultTagger)

In [22]:
# Bigram tagger whose backoff is the unigram tagger (which has its own backoff).
backoff_bigram_tagger = BigramTagger(train, backoff=backoff_unigram_tagger)
print(backoff_bigram_tagger.evaluate(test))

spider_tokens = word_tokenize('I saw a green spider')
print(backoff_bigram_tagger.tag(spider_tokens))

0.9117603403918464
[('I', 'PPSS'), ('saw', 'VBD'), ('a', 'AT'), ('green', 'JJ'), ('spider', 'NN')]
