In [1]:
from nlp import get_word_tag, preprocess
import pandas as pd
from collections import defaultdict
import math
import numpy as np

# Part 0: Data Sources

In [2]:
# Training corpus: keep the raw lines of the tagged file, one list entry per
# line (readlines() leaves the trailing newline on each entry).
with open("WSJ_02-21.pos", 'r') as f:
    training_corpus = f.readlines()

# Vocabulary: split on '\n' so entries carry no newline. When the file ends
# with a newline this leaves one empty string as the last list element.
with open("hmm_vocab.txt", 'r') as f:
    voc_l = f.read().split('\n')

In [3]:
# Train corpus: peek at five consecutive entries.
# Iterate over the slice itself instead of rebuilding it on every pass and
# indexing it with a separately hard-coded count.
# Each entry keeps its trailing newline, so print() leaves a blank line after it.
for line in training_corpus[60:65]:
    print(line)

said	VBD

it	PRP

expects	VBZ

its	PRP$

U.S.	NNP



In [4]:
# Tail of the vocabulary list; the final '' entry comes from splitting the
# file contents on '\n'.
print(voc_l[-10:])

['zeros', 'zinc', 'zip', 'zombie', 'zone', 'zones', 'zoning', '{', '}', '']


In [5]:
# Map every vocabulary entry to its position in the sorted vocabulary.
vocab = {entry: rank for rank, entry in enumerate(sorted(voc_l))}

In [6]:
# Test corpus: raw lines of the tagged file. predict_pos reads the gold tag
# for each word out of these lines.
with open("WSJ_24.pos", 'r') as f:
    y = f.readlines()

In [7]:
# Preprocess the untagged test words against the vocabulary.
# Later cells use pre[0] as the word sequence that is zipped against y;
# NOTE(review): the rest of the return value is not used here -- confirm its
# shape in nlp.preprocess.
pre = preprocess(vocab,"test.words")

# Part 1: Parts-of-speech tagging

## Part 1.1 - Training

In this section, you will find the words that are not ambiguous.

For example, the word `is` is a verb and it is not ambiguous.
In the WSJ corpus, $86\%$ of the tokens are unambiguous (meaning they have only one tag).
About $14\%$ are ambiguous (meaning that they have more than one tag).


In [8]:
def create_dictionaries(training_corpus, vocab):
    """Count emissions, transitions and tag occurrences over a tagged corpus.

    Args:
        training_corpus: iterable of raw corpus lines; each line is handed to
            get_word_tag together with vocab.
        vocab: vocabulary passed through unchanged to get_word_tag
            (presumably used there to handle unknown words -- confirm in
            nlp.get_word_tag).

    Returns:
        A tuple of three defaultdict(int) objects, in this order:
            emission_counts   -- (tag, word) -> count
            transition_counts -- (previous tag, tag) -> count
            tag_counts        -- tag -> count
    """
    emission_counts, transition_counts, tag_counts = (
        defaultdict(int) for _ in range(3)
    )

    # The first line has no predecessor, so it transitions from the start marker.
    last_seen = '--s--'

    for line in training_corpus:
        token, label = get_word_tag(line, vocab)

        transition_counts[last_seen, label] += 1
        emission_counts[label, token] += 1
        tag_counts[label] += 1

        # The tag just seen becomes the predecessor of the next line.
        last_seen = label

    return emission_counts, transition_counts, tag_counts

In [9]:
# Build the three count tables from the training corpus; the cells below
# read from them.
emission_counts, transition_counts, tag_counts = create_dictionaries(training_corpus, vocab)

In [10]:
# Spot-check two entries of tag_counts: the 'NN' tag and the '--s--' marker.
print("Example states: \n\tNN:{0}\n\tStart:{1}".format(tag_counts["NN"], tag_counts["--s--"]))

Example states: 
	NN:132935
	Start:39832


The 'states' are the Parts-of-speech designations found in the training data. They will also be referred to as 'tags' or POS in this assignment.

    - "NN" is noun, singular,
    - 'NNS' is noun, plural.
    - In addition, there are helpful tags such as '--s--', which indicates the start of a sentence.

In [11]:
print("ambiguous word example: ")
# List every (tag, word) emission whose word is 'back', with its count.
for key, freq in emission_counts.items():
    if key[1] != 'back':
        continue
    print(key, freq)

ambiguous word example: 
('RB', 'back') 304
('VB', 'back') 20
('RP', 'back') 84
('JJ', 'back') 25
('NN', 'back') 29
('VBP', 'back') 4


## Part 1.2 - Testing 

Now you will test the accuracy of your parts-of-speech tagger using your emission_counts dictionary.

- Given your preprocessed test corpus prep, you will assign a parts-of-speech tag to every word in that corpus.
- Using the original tagged test corpus y, you will then compute what percent of the tags you got correct.


In [12]:
# Print the first few preprocessed words next to their raw gold lines to
# check that the two sequences line up.
for word, y_tup in zip(pre[0][:4], y[:4]):
    print(word, y_tup)

The The	DT

economy economy	NN

's 's	POS

temperature temperature	NN



In [13]:
def predict_pos(prep, y, emission_counts, vocab, states):
    """Tag each word with its most frequent training tag and score the result.

    Args:
        prep: preprocessed test corpus; only prep[0], the word sequence
            aligned with y, is used.
        y: raw lines of the tagged test corpus; a line contributes a gold
            label only when it splits into exactly two fields (word, tag).
        emission_counts: mapping (tag, word) -> count.
        vocab: container of known words; a word outside it is never counted
            as correct.
        states: iterable of candidate tags, tried in order. On equal counts
            the earliest tag wins.

    Returns:
        The number of correct predictions divided by len(y), or 0.0 when y
        is empty.
    """
    words = prep[0]
    total = len(y)

    # Nothing to score: avoid dividing by zero below.
    if total == 0:
        return 0.0

    num_correct = 0

    for word, y_tup in zip(words, y):
        y_tup_l = y_tup.split()

        # Lines that are not exactly "word tag" carry no gold label. Skip them
        # so the word is not compared against the previous line's label (or
        # against an unassigned name when the very first line is malformed).
        # They still count in the denominator, as before.
        if len(y_tup_l) != 2:
            continue
        true_label = y_tup_l[1]

        if word not in vocab:
            continue

        count_final = 0
        pos_final = ''

        for pos in states:
            # .get() reads without inserting, so a defaultdict is not grown
            # by lookups of unseen (tag, word) pairs.
            count = emission_counts.get((pos, word), 0)

            # Strict comparison keeps the first tag among equal counts.
            if count > count_final:
                count_final = count
                pos_final = pos

        if pos_final == true_label:
            num_correct += 1

    return num_correct / total

In [14]:
# Candidate tags in sorted order; predict_pos only replaces its current best
# tag on a strictly larger count, so ties go to the earliest tag in this list.
states = sorted(tag_counts.keys())
predict_pos(pre, y, emission_counts, vocab, states)

0.8658147899061376

### Missing --> Viterbi Algorithm