## POS tagging using modified Viterbi

### Data Preparation

In [54]:
#Importing libraries
import nltk
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from IPython.display import display

In [8]:
# Load the Treebank sample bundled with NLTK as a list of tagged sentences,
# requesting the universal tagset for the tags.
nltk_data = list(
    nltk.corpus.treebank.tagged_sents(tagset="universal")
)

In [9]:
# Keep 95% of the sentences for training and hold out 5% for validation.
# The fixed random_state makes the split repeatable across runs.
train_set, test_set = train_test_split(
    nltk_data,
    test_size=0.05,
    random_state=100,
)

In [10]:
# Flatten the training sentences into one list of (token, tag) pairs.
taggedWords = [pair for sentence in train_set for pair in sentence]
print("List of training words")
print(len(taggedWords))

# Tokens only, in corpus order (duplicates kept).
tokens = [pair[0] for pair in taggedWords]

# Vocabulary: the distinct tokens seen in training.
V = set(tokens)

# Tag set: the distinct tags seen in training.
T = {pair[1] for pair in taggedWords}


List of training words
95949


In [11]:
# Show the distinct tags collected from the training pairs
print(T)

{'.', 'ADJ', 'DET', 'X', 'CONJ', 'ADV', 'ADP', 'PRON', 'NUM', 'PRT', 'VERB', 'NOUN'}


### Build the vanilla Viterbi based POS tagger

In [12]:
# Sizes of the tag set and the vocabulary.
t, v = len(T), len(V)
# Zero-filled (tags x vocabulary) array. It is only allocated here; none of the
# later cells in this notebook read or fill it.
w_given_t = np.zeros(shape=(t, v))

# Emission counts for a (word, tag) pair
def WordGivenTag(word,tag,trainData = taggedWords):
    """Count how often `tag` occurs and how often it occurs on `word`.

    Parameters:
        word: token to look up.
        tag: tag to look up.
        trainData: iterable of (token, tag) pairs to count over. The default
            is whatever `taggedWords` referred to when this function was defined.

    Returns:
        A tuple `(wordOccurWithTagCount, tagCount)`: the number of pairs equal
        to (word, tag) and the number of pairs carrying `tag`. The caller
        divides the first by the second to get the emission probability.
    """
    tagCount = 0
    wordOccurWithTagCount = 0
    for pair in trainData:
        if pair[1] != tag:
            continue
        tagCount += 1
        if pair[0] == word:
            wordOccurWithTagCount += 1

    return (wordOccurWithTagCount, tagCount)

# Transition counts for a (previous tag, next tag) pair
def T2GivenT1(t2,t1,trainData = taggedWords):
    """Count occurrences of `t1` and of `t1` immediately followed by `t2`.

    Parameters:
        t2: the following tag.
        t1: the preceding tag.
        trainData: sequence of (token, tag) pairs. The default is whatever
            `taggedWords` referred to when this function was defined.

    Returns:
        A tuple `(count_t2_t1, count_t1)`. The caller divides the first by the
        second to get the transition probability.

    The tags are read as one flat sequence, so with the flattened training
    pairs an adjacent pair that straddles two sentences is counted as well.
    """
    tags = [pair[1] for pair in trainData]
    count_t1 = tags.count(t1)
    count_t2_t1 = sum(
        1
        for current, following in zip(tags, tags[1:])
        if current == t1 and following == t2
    )
    return (count_t2_t1, count_t1)

# Build the transition matrix: rows are the previous tag, columns the next tag.
tag_order = list(T)
transMatrix = np.zeros((len(tag_order), len(tag_order)), dtype="float32")
for row, prev_tag in enumerate(tag_order):
    for col, next_tag in enumerate(tag_order):
        pair_count, prev_count = T2GivenT1(next_tag, prev_tag)
        transMatrix[row, col] = pair_count / prev_count

# Label both axes with the tags so entries can be looked up as df.loc[prev, next].
df = pd.DataFrame(transMatrix, index=tag_order, columns=tag_order)
df

Unnamed: 0,.,ADJ,DET,X,CONJ,ADV,ADP,PRON,NUM,PRT,VERB,NOUN
.,0.092923,0.043681,0.173558,0.026908,0.058032,0.052292,0.092206,0.065208,0.081353,0.002511,0.088708,0.222531
ADJ,0.063882,0.067158,0.004914,0.020311,0.016052,0.004914,0.078624,0.000491,0.020803,0.010156,0.011794,0.700901
DET,0.017913,0.204977,0.005771,0.045323,0.000481,0.012623,0.009618,0.003727,0.02164,0.00024,0.040394,0.637293
X,0.162831,0.016505,0.055229,0.074433,0.010316,0.025393,0.144898,0.055705,0.002857,0.184891,0.204571,0.062371
CONJ,0.035698,0.118683,0.118683,0.008809,0.000464,0.053778,0.053778,0.058414,0.042188,0.003709,0.155308,0.350487
ADV,0.135153,0.13016,0.069907,0.023302,0.006991,0.07723,0.119507,0.015646,0.031624,0.014314,0.344541,0.031624
ADP,0.039754,0.107389,0.323969,0.034984,0.000848,0.013357,0.017492,0.069119,0.06191,0.001484,0.008481,0.321213
PRON,0.040613,0.072031,0.009195,0.09272,0.004981,0.0341,0.023372,0.007663,0.00728,0.012261,0.484291,0.211494
NUM,0.118835,0.033571,0.003862,0.211824,0.013072,0.002674,0.035056,0.001485,0.184195,0.026144,0.016934,0.352347
PRT,0.043635,0.083661,0.10105,0.013123,0.002297,0.010171,0.019357,0.017717,0.056102,0.001969,0.405184,0.245735


In [13]:
# Viterbi heuristic (baseline)
def VanillaViterbi(words, train_bag = taggedWords):
    """Tag `words` left to right, picking the best-scoring tag for each word.

    For every word, each candidate tag is scored as
    emission(word | tag) * transition(previous tag -> tag), and the tag with
    the highest score is kept. The decision is greedy: only the tag chosen for
    the previous word is carried forward, and earlier choices are never revised.

    Parameters:
        words: sequence of tokens to tag.
        train_bag: (token, tag) pairs used only to collect the candidate tags.
            Emission counts come from `WordGivenTag` with its own default data.

    Returns:
        A list of (word, tag) tuples, one per input word.
    """
    tags = list(set(pair[1] for pair in train_bag))
    chosen = []

    for word in words:
        # The first word is scored as if it followed a '.' tag.
        prev_tag = chosen[-1] if chosen else '.'

        scores = []
        for tag in tags:
            transition_p = df.loc[prev_tag, tag]
            seen, total = WordGivenTag(word, tag)
            emission_p = seen / total
            scores.append(emission_p * transition_p)

        # index() returns the first maximum, so when all scores tie (for
        # example all zero) the first tag in `tags` is selected.
        best_score = max(scores)
        chosen.append(tags[scores.index(best_score)])

    return list(zip(words, chosen))

In [14]:
# Flatten the held-out sentences into (word, tag) pairs plus a parallel word list.
# NOTE(review): this rebinds `taggedWords`, which the earlier cells used for the
# training pairs. Functions defined after this cell with `train_bag = taggedWords`
# capture the held-out pairs as their default value.
taggedWords = [pair for sentence in test_set for pair in sentence]
untaggedWords = [pair[0] for pair in taggedWords]

# Tag the held-out words and compute the share of exact (word, tag) matches.
vitterbi = VanillaViterbi(untaggedWords)
check = [predicted for predicted, gold in zip(vitterbi, taggedWords) if predicted == gold]
accuracy = len(check) / len(vitterbi)
accuracy

0.9028982441294691

In [15]:
# Mis-tagged words from the vanilla tagger as (word, predicted tag, actual tag)
incorrect_tagged_words = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(vitterbi, taggedWords)
    if predicted[1] != gold[1]
]
print(len(incorrect_tagged_words))
incorrect_tagged_words

459


[('book', 'NOUN', 'VERB'),
 ('stocks', 'NOUN', 'ADV'),
 ('up', 'PRT', 'ADP'),
 ('over', 'ADP', 'PRT'),
 ('ignored', '.', 'VERB'),
 ('mine', 'NOUN', 'ADJ'),
 ('Palestinian', 'ADJ', 'NOUN'),
 ('first', 'ADJ', 'ADV'),
 ('Preston', '.', 'NOUN'),
 ('Birmingham', '.', 'NOUN'),
 ('Ala', '.', 'NOUN'),
 ('clamped', '.', 'VERB'),
 ('ankle', '.', 'NOUN'),
 ('third-largest', '.', 'ADJ'),
 ('fifth-largest', '.', 'ADJ'),
 ('Z.', '.', 'NOUN'),
 ('Wick', '.', 'NOUN'),
 ('89.7', '.', 'NUM'),
 ('141.9', '.', 'NUM'),
 ('94.8', '.', 'NUM'),
 ('149.9', '.', 'NUM'),
 ('argues', '.', 'VERB'),
 ('Sit', '.', 'VERB'),
 ('British', 'ADJ', 'NOUN'),
 ('halt', 'NOUN', 'VERB'),
 ('slides', 'NOUN', 'VERB'),
 ('most', 'ADJ', 'ADV'),
 ('athletic', '.', 'ADJ'),
 ('to', 'PRT', 'ADJ'),
 ('better', 'ADJ', 'ADV'),
 ('illustrates', '.', 'VERB'),
 ('attempt', 'VERB', 'NOUN'),
 ('usurp', '.', 'VERB'),
 ('executive', 'NOUN', 'ADJ'),
 ('609', '.', 'NUM'),
 ('executive-office', '.', 'NOUN'),
 ('administer', '.', 'VERB'),
 ('disap

### Solve the problem of unknown words

#### Modification 1 - using transition probability in case of zero emission probability

In [16]:
# Viterbi heuristic with a transition-only fallback
def VitterbiMod(words, train_bag = taggedWords):
    """Greedy tagger that falls back to transition probabilities alone.

    Each candidate tag is scored as emission(word | tag) *
    transition(previous tag -> tag). If the best score is positive, that tag
    is chosen; otherwise (no tag has a positive score for this word) the tag
    with the highest transition probability from the previous tag is chosen.

    Parameters:
        words: sequence of tokens to tag.
        train_bag: (token, tag) pairs used only to collect the candidate tags.
            Emission counts come from `WordGivenTag` with its own default data.
            NOTE(review): the default is bound when this `def` runs; if
            `taggedWords` was rebound to held-out data by then, the candidate
            tags come from that data. Confirm this is intended.

    Returns:
        A list of (word, tag) tuples, one per input word.
    """
    tags = list(set(pair[1] for pair in train_bag))
    chosen = []

    for word in words:
        # The first word is scored as if it followed a '.' tag.
        prev_tag = chosen[-1] if chosen else '.'

        scores = []
        transitions = []
        for tag in tags:
            transition_p = df.loc[prev_tag, tag]
            seen, total = WordGivenTag(word, tag)
            emission_p = seen / total
            scores.append(emission_p * transition_p)
            transitions.append(transition_p)

        # Rank by the combined score when it carries information, otherwise
        # by the transition probability only.
        ranking = scores if max(scores) > 0 else transitions
        chosen.append(tags[ranking.index(max(ranking))])

    return list(zip(words, chosen))

In [19]:
# Accuracy of Modification 1 on the held-out words: share of exact (word, tag) matches
vitterbiMod = VitterbiMod(untaggedWords)
check = [predicted for predicted, gold in zip(vitterbiMod, taggedWords) if predicted == gold]
accuracy = len(check) / len(vitterbiMod)
accuracy

0.9331499894224666

#### Accuracy increased by ~ 3% compared to vanilla Viterbi

In [20]:
# Mis-tagged words after the first modification as (word, predicted tag, actual tag)
incorrect_tagged_words = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(vitterbiMod, taggedWords)
    if predicted[1] != gold[1]
]
print(len(incorrect_tagged_words))
incorrect_tagged_words

316


[('book', 'NOUN', 'VERB'),
 ('stocks', 'NOUN', 'ADV'),
 ('up', 'PRT', 'ADP'),
 ('over', 'ADP', 'PRT'),
 ('mine', 'NOUN', 'ADJ'),
 ('Palestinian', 'ADJ', 'NOUN'),
 ('first', 'ADJ', 'ADV'),
 ('clamped', 'X', 'VERB'),
 ('ankle', 'VERB', 'NOUN'),
 ('third-largest', 'NOUN', 'ADJ'),
 ('fifth-largest', 'NOUN', 'ADJ'),
 ('89.7', 'NOUN', 'NUM'),
 ('141.9', 'NOUN', 'NUM'),
 ('94.8', 'NOUN', 'NUM'),
 ('149.9', 'NOUN', 'NUM'),
 ('British', 'ADJ', 'NOUN'),
 ('halt', 'NOUN', 'VERB'),
 ('slides', 'NOUN', 'VERB'),
 ('most', 'ADJ', 'ADV'),
 ('athletic', 'DET', 'ADJ'),
 ('to', 'PRT', 'ADJ'),
 ('better', 'ADJ', 'ADV'),
 ('attempt', 'VERB', 'NOUN'),
 ('609', 'NOUN', 'NUM'),
 ('administer', 'NOUN', 'VERB'),
 ('disapproved', 'X', 'VERB'),
 ('*-58', 'VERB', 'X'),
 ('disapproval', 'DET', 'NOUN'),
 ('accordance', 'DET', 'NOUN'),
 ('applicable', 'NOUN', 'ADJ'),
 ('Five', 'NOUN', 'NUM'),
 ('Proper', 'NOUN', 'ADJ'),
 ('English', 'NOUN', 'ADJ'),
 ('highest-pitched', 'NOUN', 'ADJ'),
 ('descending', 'NOUN', 'VERB'),

#### Modification 2 -  using a backoff tagger

When a word is unknown, fall back to a backoff tagger.
Here the backoff tagger is a regex tagger.

In [21]:
# (regex, tag) rules for the rule-based fallback tagger. Order matters:
# presumably the first matching pattern decides the tag (see nltk.RegexpTagger) - confirm.
patterns = [
    (r'[\w+]+ing$', 'VERB'),              # gerund; note the character class also admits a literal '+'
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # 3rd singular present
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),        # trace-like tokens such as *-58 or *T*-1 are tagged X
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers; NOTE(review): the '.' is unescaped, so it matches any character
    (r'.*', 'NOUN')                    # catch-all: anything not matched above is tagged NOUN
]
# Regex tagger built from the rules above
rule_based_tagger = nltk.RegexpTagger(patterns)


In [22]:
def VitterbiMod2(words,train_bag = taggedWords):
    """Greedy tagger that defers to the regex tagger when scoring fails.

    Each candidate tag is scored as emission(word | tag) *
    transition(previous tag -> tag). The regex tagger's tag for the word is
    used when the best score is zero, or when the regex tagger says "X";
    otherwise the best-scoring tag is used.

    Parameters:
        words: sequence of tokens to tag.
        train_bag: (token, tag) pairs used only to collect the candidate tags.
            Emission counts come from `WordGivenTag` with its own default data.

    Returns:
        A list of (word, tag) tuples, one per input word.
    """
    tags = list(set(pair[1] for pair in train_bag))
    chosen = []

    for word in words:
        # The first word is scored as if it followed a '.' tag.
        prev_tag = chosen[-1] if chosen else '.'

        scores = []
        for tag in tags:
            transition_p = df.loc[prev_tag, tag]
            seen, total = WordGivenTag(word, tag)
            emission_p = seen / total
            scores.append(emission_p * transition_p)

        best_score = max(scores)
        fallback_tag = rule_based_tagger.tag([word])[0][1]

        if best_score > 0 and fallback_tag != "X":
            chosen.append(tags[scores.index(best_score)])
        else:
            chosen.append(fallback_tag)

    return list(zip(words, chosen))

In [23]:
# Accuracy of Modification 2 on the held-out words: share of exact (word, tag) matches

vitterbiMod2 = VitterbiMod2(untaggedWords)
check = [predicted for predicted, gold in zip(vitterbiMod2, taggedWords) if predicted == gold]
accuracy = len(check) / len(vitterbiMod2)
accuracy

0.9494393907340808

#### Accuracy increased by ~ 5% compared to vanilla viterbi

In [24]:
# Mis-tagged words after Modification 2 as (word, predicted tag, actual tag)
incorrect_tagged_words = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(vitterbiMod2, taggedWords)
    if predicted[1] != gold[1]
]
print(len(incorrect_tagged_words))
incorrect_tagged_words

239


[('book', 'NOUN', 'VERB'),
 ('stocks', 'NOUN', 'ADV'),
 ('up', 'PRT', 'ADP'),
 ('over', 'ADP', 'PRT'),
 ('mine', 'NOUN', 'ADJ'),
 ('Palestinian', 'ADJ', 'NOUN'),
 ('first', 'ADJ', 'ADV'),
 ('third-largest', 'NOUN', 'ADJ'),
 ('fifth-largest', 'NOUN', 'ADJ'),
 ('Sit', 'NOUN', 'VERB'),
 ('down', 'ADP', 'ADV'),
 ('British', 'ADJ', 'NOUN'),
 ('halt', 'NOUN', 'VERB'),
 ('slides', 'NOUN', 'VERB'),
 ('most', 'ADJ', 'ADV'),
 ('athletic', 'NOUN', 'ADJ'),
 ('to', 'PRT', 'ADJ'),
 ('better', 'ADJ', 'ADV'),
 ('attempt', 'VERB', 'NOUN'),
 ('usurp', 'NOUN', 'VERB'),
 ('executive', 'NOUN', 'ADJ'),
 ('administer', 'NOUN', 'VERB'),
 ('applicable', 'NOUN', 'ADJ'),
 ('Five', 'NOUN', 'NUM'),
 ('Proper', 'NOUN', 'ADJ'),
 ('English', 'NOUN', 'ADJ'),
 ('highest-pitched', 'VERB', 'ADJ'),
 ('as', 'ADP', 'ADV'),
 ('62-year-old', 'NOUN', 'ADJ'),
 ('executive', 'NOUN', 'ADJ'),
 ('unsolicited', 'VERB', 'ADJ'),
 ('bid', 'VERB', 'NOUN'),
 ('that', 'ADP', 'DET'),
 ('diversified', 'ADJ', 'VERB'),
 ('that', 'ADP', 'DET')

#### Modification 3 -  using a unigram tagger and regex tagger as backoff tagger

In [25]:
# Unigram tagger built from the training sentences, with the regex tagger passed as its backoff
unigramTagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

In [26]:
def VitterbiMod3(words,train_bag = taggedWords):
    """Greedy tagger that defers to the unigram tagger when scoring fails.

    Each candidate tag is scored as emission(word | tag) *
    transition(previous tag -> tag). The tag from `unigramTagger` is used when
    the best score is zero, or when that tagger says "X"; otherwise the
    best-scoring tag is used.

    Parameters:
        words: sequence of tokens to tag.
        train_bag: (token, tag) pairs used only to collect the candidate tags.
            Emission counts come from `WordGivenTag` with its own default data.

    Returns:
        A list of (word, tag) tuples, one per input word.
    """
    tags = list(set(pair[1] for pair in train_bag))
    chosen = []

    for word in words:
        # The first word is scored as if it followed a '.' tag.
        prev_tag = chosen[-1] if chosen else '.'

        scores = []
        for tag in tags:
            transition_p = df.loc[prev_tag, tag]
            seen, total = WordGivenTag(word, tag)
            emission_p = seen / total
            scores.append(emission_p * transition_p)

        best_score = max(scores)
        fallback_tag = unigramTagger.tag([word])[0][1]

        if best_score > 0 and fallback_tag != "X":
            chosen.append(tags[scores.index(best_score)])
        else:
            chosen.append(fallback_tag)

    return list(zip(words, chosen))

In [27]:
# Accuracy of Modification 3 on the held-out words: share of exact (word, tag) matches
vitterbiMod3 = VitterbiMod3(untaggedWords)
check = [predicted for predicted, gold in zip(vitterbiMod3, taggedWords) if predicted == gold]
accuracy = len(check) / len(vitterbiMod3)
accuracy

0.9494393907340808

#### Accuracy is unchanged when the unigram tagger (with regex backoff) is used instead of the regex tagger alone

In [28]:
# Mis-tagged words after Modification 3 as (word, predicted tag, actual tag)
incorrect_tagged_words = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(vitterbiMod3, taggedWords)
    if predicted[1] != gold[1]
]
print(len(incorrect_tagged_words))
incorrect_tagged_words

239


[('book', 'NOUN', 'VERB'),
 ('stocks', 'NOUN', 'ADV'),
 ('up', 'PRT', 'ADP'),
 ('over', 'ADP', 'PRT'),
 ('mine', 'NOUN', 'ADJ'),
 ('Palestinian', 'ADJ', 'NOUN'),
 ('first', 'ADJ', 'ADV'),
 ('third-largest', 'NOUN', 'ADJ'),
 ('fifth-largest', 'NOUN', 'ADJ'),
 ('Sit', 'NOUN', 'VERB'),
 ('down', 'ADP', 'ADV'),
 ('British', 'ADJ', 'NOUN'),
 ('halt', 'NOUN', 'VERB'),
 ('slides', 'NOUN', 'VERB'),
 ('most', 'ADJ', 'ADV'),
 ('athletic', 'NOUN', 'ADJ'),
 ('to', 'PRT', 'ADJ'),
 ('better', 'ADJ', 'ADV'),
 ('attempt', 'VERB', 'NOUN'),
 ('usurp', 'NOUN', 'VERB'),
 ('executive', 'NOUN', 'ADJ'),
 ('administer', 'NOUN', 'VERB'),
 ('applicable', 'NOUN', 'ADJ'),
 ('Five', 'NOUN', 'NUM'),
 ('Proper', 'NOUN', 'ADJ'),
 ('English', 'NOUN', 'ADJ'),
 ('highest-pitched', 'VERB', 'ADJ'),
 ('as', 'ADP', 'ADV'),
 ('62-year-old', 'NOUN', 'ADJ'),
 ('executive', 'NOUN', 'ADJ'),
 ('unsolicited', 'VERB', 'ADJ'),
 ('bid', 'VERB', 'NOUN'),
 ('that', 'ADP', 'DET'),
 ('diversified', 'ADJ', 'VERB'),
 ('that', 'ADP', 'DET')

#### Modification 4 -  using a regex tagger and unigram tagger as backoff tagger

In [32]:
# Unigram tagger trained on the training sentences. A word it has never seen
# would come back untagged (None), which the tagging loop cannot use as the
# previous tag, so it backs off to a default NOUN tag. This is the same final
# fallback that the catch-all rule provides in `patterns`.
unigramTagger1 = nltk.UnigramTagger(train_set, backoff=nltk.DefaultTagger('NOUN'))

# Same rules as `patterns` but WITHOUT the catch-all (r'.*', 'NOUN'), so a word
# that matches none of them falls through to the unigram tagger.
patterns1 = [
    (r'[\w+]+ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*es$', 'VERB'),               # 3rd singular present
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),        # X
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers
]

# Build the regex tagger from patterns1. The previous code passed `patterns`,
# whose catch-all rule matches every word, so the unigram backoff was never
# consulted and patterns1 was unused.
# NOTE: accuracy figures recorded from earlier runs may change after this fix.
rule_based_tagger1 = nltk.RegexpTagger(patterns1,backoff=unigramTagger1)

In [33]:
def VitterbiMod4(words,train_bag = taggedWords):
    """Greedy tagger that defers to `rule_based_tagger1` when scoring fails.

    Each candidate tag is scored as emission(word | tag) *
    transition(previous tag -> tag). The tag from `rule_based_tagger1` is used
    when the best score is zero, or when that tagger says "X"; otherwise the
    best-scoring tag is used.

    Parameters:
        words: sequence of tokens to tag.
        train_bag: (token, tag) pairs used only to collect the candidate tags.
            Emission counts come from `WordGivenTag` with its own default data.

    Returns:
        A list of (word, tag) tuples, one per input word.
    """
    tags = list(set(pair[1] for pair in train_bag))
    chosen = []

    for word in words:
        # The first word is scored as if it followed a '.' tag.
        prev_tag = chosen[-1] if chosen else '.'

        scores = []
        for tag in tags:
            transition_p = df.loc[prev_tag, tag]
            seen, total = WordGivenTag(word, tag)
            emission_p = seen / total
            scores.append(emission_p * transition_p)

        best_score = max(scores)
        fallback_tag = rule_based_tagger1.tag([word])[0][1]

        if best_score > 0 and fallback_tag != "X":
            chosen.append(tags[scores.index(best_score)])
        else:
            chosen.append(fallback_tag)

    return list(zip(words, chosen))

In [34]:
# Accuracy of Modification 4 on the held-out words: share of exact (word, tag) matches
vitterbiMod4 = VitterbiMod4(untaggedWords)
check = [predicted for predicted, gold in zip(vitterbiMod4, taggedWords) if predicted == gold]
accuracy = len(check) / len(vitterbiMod4)
accuracy

0.9494393907340808

In [35]:
# Mis-tagged words after Modification 4 as (word, predicted tag, actual tag)
incorrect_tagged_words = [
    (predicted[0], predicted[1], gold[1])
    for predicted, gold in zip(vitterbiMod4, taggedWords)
    if predicted[1] != gold[1]
]
print(len(incorrect_tagged_words))
incorrect_tagged_words

239


[('book', 'NOUN', 'VERB'),
 ('stocks', 'NOUN', 'ADV'),
 ('up', 'PRT', 'ADP'),
 ('over', 'ADP', 'PRT'),
 ('mine', 'NOUN', 'ADJ'),
 ('Palestinian', 'ADJ', 'NOUN'),
 ('first', 'ADJ', 'ADV'),
 ('third-largest', 'NOUN', 'ADJ'),
 ('fifth-largest', 'NOUN', 'ADJ'),
 ('Sit', 'NOUN', 'VERB'),
 ('down', 'ADP', 'ADV'),
 ('British', 'ADJ', 'NOUN'),
 ('halt', 'NOUN', 'VERB'),
 ('slides', 'NOUN', 'VERB'),
 ('most', 'ADJ', 'ADV'),
 ('athletic', 'NOUN', 'ADJ'),
 ('to', 'PRT', 'ADJ'),
 ('better', 'ADJ', 'ADV'),
 ('attempt', 'VERB', 'NOUN'),
 ('usurp', 'NOUN', 'VERB'),
 ('executive', 'NOUN', 'ADJ'),
 ('administer', 'NOUN', 'VERB'),
 ('applicable', 'NOUN', 'ADJ'),
 ('Five', 'NOUN', 'NUM'),
 ('Proper', 'NOUN', 'ADJ'),
 ('English', 'NOUN', 'ADJ'),
 ('highest-pitched', 'VERB', 'ADJ'),
 ('as', 'ADP', 'ADV'),
 ('62-year-old', 'NOUN', 'ADJ'),
 ('executive', 'NOUN', 'ADJ'),
 ('unsolicited', 'VERB', 'ADJ'),
 ('bid', 'VERB', 'NOUN'),
 ('that', 'ADP', 'DET'),
 ('diversified', 'ADJ', 'VERB'),
 ('that', 'ADP', 'DET')

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [45]:
def _tagging_accuracy(predicted, gold):
    """Return the fraction of predicted (word, tag) pairs equal to the gold pair at the same position."""
    matches = [pair for pair, truth in zip(predicted, gold) if pair == truth]
    return len(matches) / len(predicted)


# Baseline tagger
vitterbi = VanillaViterbi(untaggedWords)
accuracy = _tagging_accuracy(vitterbi, taggedWords)
print("Accuracy of vanilla viterbi")
print(accuracy)

# Modification 1: transition-only fallback
vitterbiMod = VitterbiMod(untaggedWords)
accuracy = _tagging_accuracy(vitterbiMod, taggedWords)
print("Accuracy of viterbi Modification 1 (using transition probability for unknown words)")
print(accuracy)

# Modification 2: regex backoff tagger. This previously called VitterbiMod4
# even though the result is stored in `vitterbiMod2` and reported as
# Modification 2; it now calls the tagger that the label names.
vitterbiMod2 = VitterbiMod2(untaggedWords)
accuracy = _tagging_accuracy(vitterbiMod2, taggedWords)
print("Accuracy of viterbi Modification 2 (using backoff tagger for unknown words)")
print(accuracy)

Accuracy of vanilla viterbi
0.9028982441294691
Accuracy of viterbi Modification 1 (using transition probability for unknown words)
0.9331499894224666
Accuracy of viterbi Modification 2 (using backoff tagger for unknown words)
0.9494393907340808


### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

#### Preparing the evaluation data

In [57]:
# Read the evaluation sentences. The `with` block closes the file as soon as
# the lines are read; the original left the handle open.
with open("Test_sentences.txt") as a:
    testSentences = a.readlines()

# The first three lines of the file are the sentences to tag.
t1 = testSentences[0].strip()
t2 = testSentences[1].strip()
t3 = testSentences[2].strip()

# Split each sentence into word tokens for the taggers.
words1 = nltk.word_tokenize(t1)
words2 = nltk.word_tokenize(t2)
words3 = nltk.word_tokenize(t3)

#### comparing vanilla and modified viterbi

In [58]:
# First test sentence: baseline tags, then tags from the modified tagger.
testTags = VanillaViterbi(words1)
display("Vanilla", testTags)
testTags = VitterbiMod4(words1)
display("Modified", testTags)

'Vanilla'

[('Android', '.'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google', '.'),
 ('.', '.')]

'Modified'

[('Android', 'NOUN'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('mobile', 'ADJ'),
 ('operating', 'NOUN'),
 ('system', 'NOUN'),
 ('developed', 'VERB'),
 ('by', 'ADP'),
 ('Google', 'NOUN'),
 ('.', '.')]

Following are correctly identified by modified and incorrectly by vanilla Viterbi: <br>

1. Android as NOUN
2. Google as NOUN

Both taggers label "operating" and "system" as NOUN, so those two tags did not change.


In [59]:
# Second test sentence: baseline tags, then tags from the modified tagger.
testTags = VanillaViterbi(words2)
display("Vanilla", testTags)
testTags = VitterbiMod4(words2)
display("Modified", testTags)

'Vanilla'

[('Android', '.'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', '.'),
 ('worldwide', '.'),
 ('on', 'ADP'),
 ('smartphones', '.'),
 ('since', 'ADP'),
 ('2011', '.'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', '.'),
 ('.', '.')]

'Modified'

[('Android', 'NOUN'),
 ('has', 'VERB'),
 ('been', 'VERB'),
 ('the', 'DET'),
 ('best-selling', 'ADJ'),
 ('OS', 'NOUN'),
 ('worldwide', 'NOUN'),
 ('on', 'ADP'),
 ('smartphones', 'VERB'),
 ('since', 'ADP'),
 ('2011', 'NUM'),
 ('and', 'CONJ'),
 ('on', 'ADP'),
 ('tablets', 'NOUN'),
 ('since', 'ADP'),
 ('2013', 'NUM'),
 ('.', '.')]

Following are correctly identified by modified and incorrectly by vanilla viterbi: <br>

1. Android as NOUN
2. OS as NOUN
3. worldwide as NOUN
4. 2011 as NUM
5. 2013 as NUM


In [60]:
# Third test sentence: baseline tags, then tags from the modified tagger.
testTags = VanillaViterbi(words3)
display("Vanilla", testTags)
testTags = VitterbiMod4(words3)
display("Modified", testTags)

'Vanilla'

[('Google', '.'),
 ('and', 'CONJ'),
 ('Twitter', '.'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', '.'),
 ('that', 'DET'),
 ('gave', 'VERB'),
 ('Google', '.'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', '.'),
 ("'s", 'VERB'),
 ('firehose', '.'),
 ('.', '.')]

'Modified'

[('Google', 'NOUN'),
 ('and', 'CONJ'),
 ('Twitter', 'NOUN'),
 ('made', 'VERB'),
 ('a', 'DET'),
 ('deal', 'NOUN'),
 ('in', 'ADP'),
 ('2015', 'NUM'),
 ('that', 'ADP'),
 ('gave', 'VERB'),
 ('Google', 'NOUN'),
 ('access', 'NOUN'),
 ('to', 'PRT'),
 ('Twitter', 'NOUN'),
 ("'s", 'PRT'),
 ('firehose', 'NOUN'),
 ('.', '.')]

Following are correctly identified by modified and incorrectly by vanilla viterbi: <br>

1. Google as NOUN
2. Twitter as NOUN
3. 2015 as NUM

"that" in the sentence has been incorrectly tagged as ADP by the modified Viterbi algorithm