In [1]:
from nlp import get_word_tag, preprocess
import pandas as pd
from collections import defaultdict
import math
import numpy as np

# Part 0: Data Sources

In [2]:
# Tagged training corpus, kept as raw lines (each line keeps its trailing newline).
with open("WSJ_02-21.pos", 'r') as f:
    training_corpus = f.readlines()

# Vocabulary words, one per line. Splitting on '\n' leaves a final '' entry
# (presumably because the file ends with a newline); it is kept as-is.
with open("hmm_vocab.txt", 'r') as f:
    voc_l = f.read().split('\n')

In [3]:
# Train corpus: show five sample lines. Each line still ends with its own
# newline, so print() leaves a blank line after every entry.
# Iterating the slice directly avoids re-slicing on every pass and does not
# fail with IndexError when the corpus holds fewer than 65 lines.
for line in training_corpus[60:65]:
    print(line)

said	VBD

it	PRP

expects	VBZ

its	PRP$

U.S.	NNP



In [4]:
# Peek at the last ten raw vocabulary entries (note the trailing '' left by split('\n')).
print(voc_l[-10:])

['zeros', 'zinc', 'zip', 'zombie', 'zone', 'zones', 'zoning', '{', '}', '']


In [5]:
# Map every vocabulary word to its position in the sorted word list.
vocab = {word: i for i, word in enumerate(sorted(voc_l))}

In [6]:
# Test corpus: raw tagged lines, used further down as the gold labels `y`.
with open("WSJ_24.pos", 'r') as f:
    y = f.readlines()

In [7]:
# Preprocess the untagged test words with the project helper `preprocess`.
# Later cells read `pre[0]` as the word sequence aligned line-by-line with `y`.
# NOTE(review): the exact return structure of `preprocess` is not visible here - confirm.
pre = preprocess(vocab,"test.words")

# Part 1: Parts-of-speech tagging

## Part 1.1 - Training

In this section, you will find the words that are not ambiguous.

For example, the word `is` is a verb and it is not ambiguous.
In the WSJ corpus, $86\%$ of the tokens are unambiguous (meaning they have only one tag).
About $14\%$ are ambiguous (meaning that they have more than one tag).


In [8]:
def create_dictionaries(training_corpus, vocab):
    '''
    Count tag transitions, word emissions and tag occurrences in a tagged corpus.

    Input:
        training_corpus: iterable of raw corpus lines; each one is handed to
            get_word_tag together with vocab to obtain a (word, tag) pair
        vocab: vocabulary passed through to get_word_tag
    Output:
        emission_counts: defaultdict mapping (tag, word) to its count
        transition_counts: defaultdict mapping (previous tag, tag) to its count
        tag_counts: defaultdict mapping tag to its count
    '''
    tag_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    emission_counts = defaultdict(int)

    # The tag sequence is seeded with the start marker, so the first line's
    # transition is counted as ('--s--', first tag).
    previous = '--s--'

    for line in training_corpus:
        token, current = get_word_tag(line, vocab)

        tag_counts[current] += 1
        emission_counts[(current, token)] += 1
        transition_counts[(previous, current)] += 1

        # The current tag becomes the context for the next line.
        previous = current

    return emission_counts, transition_counts, tag_counts

In [9]:
# Build the three count dictionaries from the training corpus.
emission_counts, transition_counts, tag_counts = create_dictionaries(training_corpus, vocab)

In [10]:
# Show the counts of two example tags. tag_counts is a defaultdict(int), so a
# tag that never occurred would print 0 here instead of raising KeyError.
print("Example states: \n\tNN:{0}\n\tStart:{1}".format(tag_counts["NN"], tag_counts["--s--"]))

Example states: 
	NN:132935
	Start:39832


The 'states' are the Parts-of-speech designations found in the training data. They will also be referred to as 'tags' or POS in this assignment.

    - "NN" is noun, singular,
    - 'NNS' is noun, plural.
    - In addition, there are helpful tags like '--s--' which indicate a start of a sentence.

In [11]:
print("ambiguous word example: ")
# Keys of emission_counts are (tag, word); list every tag seen with 'back'.
for (tag, word), count in emission_counts.items():
    if word == 'back':
        print((tag, word), count)

ambiguous word example: 
('RB', 'back') 304
('VB', 'back') 20
('RP', 'back') 84
('JJ', 'back') 25
('NN', 'back') 29
('VBP', 'back') 4


## Part 1.2 - Testing 

Now you will test the accuracy of your parts-of-speech tagger using your emission_counts dictionary.

- Given your preprocessed test corpus `prep` (stored in the variable `pre` in this notebook), you will assign a part-of-speech tag to every word in that corpus.
- Using the original tagged test corpus `y`, you will then compute what percent of the tags you got correct.


In [12]:
# Check that the preprocessed words line up with the gold lines of the test corpus.
# Each y_tup keeps its trailing newline, hence the blank line after every row.
for word, y_tup in zip(pre[0][:4], y[:4]):
    print(word, y_tup)

The The	DT

economy economy	NN

's 's	POS

temperature temperature	NN



In [13]:
def predict_pos(prep, y, emission_counts, vocab, states):
    '''
    Baseline tagger: give every word its most frequent training tag and score it.

    Input:
        prep: container whose first element is the sequence of preprocessed
            test words, aligned position-by-position with y
        y: list of raw tagged lines ("word<whitespace>tag") from the test corpus
        emission_counts: a dictionary where the keys are (tag, word) and the values are the counts
        vocab: collection of known words (only membership is tested)
        states: iterable of all candidate tags
    Output:
        accuracy: number of correctly tagged words divided by len(y)
            (0.0 when y is empty)
    '''
    words = prep[0]
    total = len(y)

    # Nothing to score; avoid dividing by zero below.
    if total == 0:
        return 0.0

    num_correct = 0

    for word, y_tup in zip(words, y):
        fields = y_tup.split()

        # Lines without exactly a word and a tag (e.g. blank lines) carry no
        # label. Skip them so that a label left over from the previous line is
        # never reused; they still count towards `total`.
        if len(fields) != 2:
            continue
        true_label = fields[1]

        # Out-of-vocabulary words get no prediction and therefore count as wrong.
        if word not in vocab:
            continue

        count_final = 0
        pos_final = ''
        for pos in states:
            # .get() instead of indexing so a defaultdict is not grown by lookups.
            count = emission_counts.get((pos, word), 0)
            # Strict '>' keeps the first tag (in `states` order) among ties.
            if count > count_final:
                count_final = count
                pos_final = pos

        if pos_final == true_label:
            num_correct += 1

    return num_correct / total

In [14]:
# Sorted tag list; the matrix builders below sort the tags the same way, so
# states[k] names row/column k of those matrices.
states = sorted(tag_counts.keys())
# Accuracy of the most-frequent-tag baseline on the test corpus.
predict_pos(pre, y, emission_counts, vocab, states)

0.8658147899061376

# Part 2: Hidden Markov Models for POS
## Part 2.1 Generating Matrices
#### Creating the 'A' transition probabilities matrix

The smoothing was done as follows: 

$$ P(t_i | t_{i-1}) = \frac{C(t_{i-1}, t_{i}) + \alpha }{C(t_{i-1}) +\alpha * N}\tag{3}$$

- $N$ is the total number of tags
- $C(t_{i-1}, t_{i})$ is the count of the tuple (previous POS, current POS) in `transition_counts` dictionary.
- $C(t_{i-1})$ is the count of the previous POS in the `tag_counts` dictionary.
- $\alpha$ is a smoothing parameter.

In [15]:
def create_transition_matrix(alpha, tag_counts, transition_counts):
    ''' 
    Build the smoothed tag-to-tag transition probability matrix.

    Input: 
        alpha: number used for smoothing
        tag_counts: a dictionary mapping each tag to its respective count
        transition_counts: a dictionary where the keys are (previous tag, tag) and the values are the counts
    Output:
        A: matrix of dimension (num_tags,num_tags); rows (previous tag) and
           columns (current tag) follow the sorted tag order
    '''
    tags = sorted(tag_counts.keys())
    num_tags = len(tags)

    A = np.zeros((num_tags, num_tags))

    for row, prev_tag in enumerate(tags):
        # Denominator of the smoothing formula; it depends on the previous tag only.
        denominator = tag_counts[prev_tag] + alpha*num_tags
        for col, next_tag in enumerate(tags):
            # Transitions never seen in training contribute a raw count of zero.
            pair_count = transition_counts.get((prev_tag, next_tag), 0)
            A[row, col] = (pair_count + alpha) / denominator

    return A

In [39]:
# Smoothing parameter; the emission matrix cell further down reuses this value.
alpha = 0.001
A = create_transition_matrix(alpha, tag_counts, transition_counts)
print("View a subset of transition matrix A")
# Label rows/columns 5..9 with the matching entries of the sorted tag list.
A_sub = pd.DataFrame(A[5:10,5:10], index=states[5:10], columns = states[5:10] )
print(A_sub)

View a subset of transition matrix A
                  ,         --s--             .         :        CC
,      2.052248e-08  1.231554e-04  2.052248e-08  0.000103  0.092680
--s--  2.510541e-08  2.510541e-08  2.510541e-08  0.002561  0.056964
.      7.601693e-05  9.299599e-01  1.773391e-04  0.000203  0.000025
:      6.288707e-04  6.265677e-02  1.886004e-02  0.001677  0.067267
CC     7.683662e-03  4.175880e-08  4.175880e-08  0.000292  0.000209


#### Create the 'B' emission probabilities matrix

$$P(w_i | t_i) = \frac{C(t_i, word_i)+ \alpha}{C(t_{i}) +\alpha * N}\tag{4}$$

- $C(t_i, word_i)$ is the number of times $word_i$ was associated with $tag_i$ in the training data (stored in `emission_counts` dictionary).
- $C(t_i)$ is the number of times $tag_i$ was in the training data (stored in `tag_counts` dictionary).
- $N$ is the number of words in the vocabulary
- $\alpha$ is a smoothing parameter. 

In [25]:
def create_emission_matrix(alpha, tag_counts, emission_counts, vocab):
    '''
    Build the smoothed word-given-tag emission probability matrix.

    Input: 
        alpha: tuning parameter used in smoothing 
        tag_counts: a dictionary mapping each tag to its respective count
        emission_counts: a dictionary where the keys are (tag, word) and the values are the counts
        vocab: either a dictionary where keys are words in vocabulary and value is an index
               (columns then follow increasing index), or a sequence of words
               whose position is the column index
    Output:
        B: a matrix of dimension (num_tags, len(vocab)); rows follow the sorted tag order
    '''
    # Accept the documented word -> index dictionary as well as a plain word
    # sequence; integer indexing into a dictionary would raise KeyError.
    if isinstance(vocab, dict):
        words = sorted(vocab, key=vocab.get)
    else:
        words = list(vocab)

    all_tags = sorted(tag_counts.keys())
    num_tags = len(all_tags)
    num_words = len(words)

    B = np.zeros((num_tags, num_words))

    for i, tag in enumerate(all_tags):
        # Denominator of the smoothing formula; it depends on the tag only.
        denominator = tag_counts[tag] + alpha*num_words
        for j, word in enumerate(words):
            # Pairs never seen in training contribute a raw count of zero.
            count = emission_counts.get((tag, word), 0)
            B[i, j] = (count + alpha) / denominator

    return B

In [41]:
# list(vocab) yields the words in insertion order, which is the sorted order
# used to assign indices, so column j of B belongs to the word with vocab[word] == j.
B = create_emission_matrix(alpha, tag_counts, emission_counts, list(vocab))
# Sample words (columns) and tags (rows) to display.
cidx  = ['725','adroitly','engineers', 'promoted', 'synergy']
cols = [vocab[a] for a in cidx]
rvals =['--s--','NN','NNS', 'VB','RB','RP']
rows = [states.index(a) for a in rvals]
# np.ix_ selects the full rows x cols sub-grid rather than paired elements.
B_sub = pd.DataFrame(B[np.ix_(rows,cols)], index=rvals, columns = cidx )
print(B_sub)

                725      adroitly     engineers      promoted       synergy
--s--  2.509047e-08  2.509047e-08  2.509047e-08  2.509047e-08  2.509047e-08
NN     7.521128e-09  7.521128e-09  7.521128e-09  7.521128e-09  2.257091e-05
NNS    1.670013e-08  1.670013e-08  4.676203e-04  1.670013e-08  1.670013e-08
VB     3.779036e-08  3.779036e-08  3.779036e-08  3.779036e-08  3.779036e-08
RB     3.226454e-08  6.456135e-05  3.226454e-08  3.226454e-08  3.226454e-08
RP     3.723317e-07  3.723317e-07  3.723317e-07  3.723317e-07  3.723317e-07


### Viterbi 