## POS tagging using modified Viterbi

In [2]:
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

### Data Preparation

In [3]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
#reading few sentences from the dataset
nltk_data[:10]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')],
 [('Rudolph', 'NOUN'),
  ('Agnew', 'NOUN'),
  (',', '.'),
  ('55', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  ('and', 'CONJ'),
  ('former', 'ADJ'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Consolidated', 'NOUN'),
  ('Gold', 'NOUN'),
  ('Fields', 'NOUN'),
  ('PLC', 'NOUN'),
  (',', '.'),
  ('was', 'VERB'),
  ('named', 'VERB'),
  ('*-1', 'X'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
 

In [5]:
# Splitting into train and test sets
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,train_size=0.95,test_size=0.05)

print(len(train_set))
print(len(test_set))
print(train_set[:10])

3718
196
[[('James', 'NOUN'), ('L.', 'NOUN'), ('Pate', 'NOUN'), (',', '.'), ('54-year-old', 'ADJ'), ('executive', 'NOUN'), ('vice', 'NOUN'), ('president', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-44', 'X'), ('a', 'DET'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('oil', 'NOUN'), ('concern', 'NOUN'), (',', '.'), ('*', 'X'), ('expanding', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('to', 'PRT'), ('14', 'NUM'), ('members', 'NOUN'), ('.', '.')], [('Garbage', 'NOUN'), ('magazine', 'NOUN'), (',', '.'), ('billed', 'VERB'), ('*', 'X'), ('as', 'ADP'), ('``', '.'), ('The', 'NOUN'), ('Practical', 'NOUN'), ('Journal', 'NOUN'), ('for', 'ADP'), ('the', 'DET'), ('Environment', 'NOUN'), (',', '.'), ("''", '.'), ('is', 'VERB'), ('about', 'ADP'), ('*-1', 'X'), ('to', 'PRT'), ('find', 'VERB'), ('out', 'PRT'), ('0', 'X'), ('*?*', 'X'), ('.', '.')], [('December', 'NOUN'), ('delivery', 'NOUN'), ('gold', 'NOUN'), ('fell', 'VERB'), ('$', '.'), ('3.20', 'NUM'), ('*U*', 'X'), ('

In [6]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95373

In [7]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['James',
 'L.',
 'Pate',
 ',',
 '54-year-old',
 'executive',
 'vice',
 'president',
 ',',
 'was']

In [8]:
# vocabulary
V = set(tokens)
print(len(V))

12011


In [9]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [10]:
print(T)

{'ADJ', 'ADV', '.', 'X', 'NOUN', 'DET', 'PRON', 'CONJ', 'NUM', 'PRT', 'ADP', 'VERB'}


### Build the vanilla Viterbi based POS tagger

In [11]:
#Emission probabilities 
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [12]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [14]:
#Transition Probabilty
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [16]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [17]:
tags_matrix

array([[6.67545721e-02, 4.94478317e-03, 6.39525279e-02, 2.02736109e-02,
        7.00016499e-01, 4.77995723e-03, 3.29652219e-04, 1.69770885e-02,
        2.14273948e-02, 1.07136974e-02, 7.77979195e-02, 1.20323058e-02],
       [1.26895189e-01, 7.87738934e-02, 1.35464728e-01, 2.24126559e-02,
        3.06526031e-02, 6.85563609e-02, 1.45023074e-02, 6.26236014e-03,
        3.23005915e-02, 1.48319053e-02, 1.20632827e-01, 3.48714560e-01],
       [4.36547324e-02, 5.32154776e-02, 9.37133580e-02, 2.75096968e-02,
        2.19626591e-01, 1.74258143e-01, 6.75565973e-02, 5.86272217e-02,
        7.98232183e-02, 2.52548023e-03, 9.19996426e-02, 8.73996541e-02],
       [1.64958369e-02, 2.62652151e-02, 1.63356826e-01, 7.46316463e-02,
        6.27802685e-02, 5.49327359e-02, 5.57335056e-02, 9.60922521e-03,
        2.56246002e-03, 1.82575271e-01, 1.45579755e-01, 2.05477253e-01],
       [1.21633997e-02, 1.72925442e-02, 2.39787504e-01, 2.85766628e-02,
        2.63638020e-01, 1.34090493e-02, 4.65286663e-03, 4.24

In [18]:
# converting to a data frame for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))
tags_df

Unnamed: 0,ADJ,ADV,.,X,NOUN,DET,PRON,CONJ,NUM,PRT,ADP,VERB
ADJ,0.066755,0.004945,0.063953,0.020274,0.700016,0.00478,0.00033,0.016977,0.021427,0.010714,0.077798,0.012032
ADV,0.126895,0.078774,0.135465,0.022413,0.030653,0.068556,0.014502,0.006262,0.032301,0.014832,0.120633,0.348715
.,0.043655,0.053215,0.093713,0.02751,0.219627,0.174258,0.067557,0.058627,0.079823,0.002525,0.092,0.0874
X,0.016496,0.026265,0.163357,0.074632,0.06278,0.054933,0.055734,0.009609,0.002562,0.182575,0.14558,0.205477
NOUN,0.012163,0.017293,0.239788,0.028577,0.263638,0.013409,0.004653,0.042425,0.009049,0.044221,0.177029,0.147756
DET,0.205017,0.012542,0.017969,0.045827,0.637241,0.005186,0.003859,0.000482,0.02219,0.000241,0.009527,0.039918
PRON,0.07318,0.0341,0.03908,0.090805,0.209962,0.009195,0.006513,0.004981,0.006513,0.012644,0.023372,0.489655
CONJ,0.115691,0.055738,0.03466,0.008899,0.348946,0.119906,0.059485,0.000468,0.043091,0.004684,0.055269,0.153162
NUM,0.033573,0.003297,0.119904,0.209832,0.350719,0.002998,0.001199,0.013789,0.186451,0.027278,0.034472,0.016487
PRT,0.087114,0.009862,0.043064,0.013807,0.247863,0.102235,0.017751,0.002301,0.056542,0.001972,0.019724,0.397765


In [20]:
len(train_tagged_words)

95373

In [21]:
#Vannila Flavor
# Viterbi Heuristic (no modifications)
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [22]:
#evaluating on test set
# list of tagged words
test_run_base = [tup for sent in test_set for tup in sent]
# list of untagged words
test_tagged_words = [tup[0] for sent in test_set for tup in sent]

In [87]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [90]:
print("Time taken in seconds: ", difference)
#print(tagged_seq)

Time taken in seconds:  738.6486480236053


In [91]:
# calculating accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

In [92]:
accuracy_vanilla = len(check)/len(tagged_seq)
accuracy_vanilla

0.9045823118989251

In [93]:
#no of incorrectly tagged words
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
len(incorrect_tagged_cases)

506

In [94]:
## Testing
sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 13th June 2013.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq = Viterbi(words)
end = time.time()
difference = end-start

In [95]:
print(tagged_seq)
print(difference)

[('Android', 'ADJ'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'ADJ'), ('.', '.'), ('Android', 'ADJ'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'ADJ'), ('worldwide', 'ADJ'), ('on', 'ADP'), ('smartphones', 'ADJ'), ('since', 'ADP'), ('2011', 'ADJ'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('13th', 'ADJ'), ('June', 'NOUN'), ('2013', 'ADJ'), ('.', '.')]
3.826481342315674


### Solve the problem of unknown words

##### By Viterbi Heuristic(vannila), the algorithm will choose the tag having max (emission_p * transition_p) probability. However, there are situations where algorithm can not predict the tag because the max probability is 0. In those scenarios we are going to route the alogorithm to with modifications.

In [34]:
#lets try to solve the problem of unknown words using rule based tagging (method 1)

regex_pattern = [
    (r'[aA-zZ]+(ed|ing|es)$', 'VERB'), #words ending with 'ing' or 'ed' or 'es' like deciding, decided, decides is a verb
    (r'.*ly$', 'ADV'),  #words ending with 'ly' like accidentally are marked as adverb
    (r'^([0-9]|[aA-zZ])+\-[aA-zZ]*$','ADJ'),  # words like 'best-selling', 'adorable', 'beautiful', 'fabulous' marked as adjective 
    (r'.*able$', 'ADJ'), 
    (r'.*ful$', 'ADJ'),
    (r'.*ous$', 'ADJ'),
    (r'^[aA-zZ].*[0-9]+','NOUN'),     # Alpha Numeric words marked as noun
    (r'.*ness$', 'NOUN'),
    (r'.*\'s$', 'NOUN'),              # possessive nouns - words ending with 's
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'.*ers$', 'NOUN'),              # eg.- kinderganteners, government, hazeltown
    (r'.*ment$', 'NOUN'),
    (r'.*town$', 'NOUN'),
    (r'^(0|([*|-|$].*))','X'), # Any special character combination
    (r'(The|the|A|a|An|an|That|that|This|this|Those|those|These|these)$', 'DET'), # That/this/these/those belong to the category of Demonstrative determiners
    (r'[0-9].?[,\/]?[0-9]*','NUM'), # Numbers  
    (r'.*', 'NOUN')
    ]

In [35]:
#creating regex pattern
regexp_tagger = nltk.RegexpTagger(regex_pattern)

In [37]:
# viterbi heuristic Rule based  (Modified version1)
def ViterbiRuleBased(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        #modification to route to regex tagger in case of pmax is zero
        if pmax == 0.0:
            dummyp = []
            dummyp.append(word)
            [state.append(x[1]) for x in regexp_tagger.tag(dummyp)]          
        # getting state for which probability is maximum
        else:
            state_max = T[p.index(pmax)] 
            state.append(state_max)
    return list(zip(words, state))

In [38]:
# tagging the test sentences
start = time.time()
tagged_seq2 = ViterbiRuleBased(test_tagged_words)
end = time.time()
difference = end-start

In [96]:
#calculating accuracy
check2 = [i for i, j in zip(tagged_seq2, test_run_base) if i == j] 
accuracy_rulebased = len(check2)/len(tagged_seq2)
accuracy_rulebased

0.9566283235904205

In [79]:
#testing and evaluating with few test sentences

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 11th June 2013.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq =  ViterbiRuleBased(words)
end = time.time()
difference = end-start

In [80]:
print(tagged_seq)
print(difference)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'VERB'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('11th', 'ADJ'), ('June', 'NOUN'), ('2013', 'NUM'), ('.', '.')]
3.9932961463928223


In [44]:
# to solve the problem of unknown words lets use another technique known as lexicon based tagging with rule based as backoff
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=regexp_tagger)

In [45]:
# viterbi heuristic lexicon based  (Modified version2)
def ViterbiLexiconBased(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        #modification to route to lexicon based/rule based
        if pmax == 0.0:
            dummyp = []
            dummyp.append(word)
            [state.append(x[1]) for x in lexicon_tagger.tag(dummyp)]          
        # getting state for which probability is maximum
        else:
            state_max = T[p.index(pmax)] 
            state.append(state_max)
    return list(zip(words, state))

In [46]:
# tagging the test sentences
start = time.time()
tagged_seq3 = ViterbiLexiconBased(test_tagged_words)
end = time.time()
difference = end-start

In [47]:
print("Time taken in seconds: ", difference)

Time taken in seconds:  821.1825449466705


In [53]:
#calculating accuracy
accuracy_lexicon_based = lexicon_tagger.evaluate(test_set)
accuracy_lexicon_based

0.9547425985291345

In [77]:
#testing and evaluating with few test sentences

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 11th June 2013.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq =  ViterbiLexiconBased(words)
end = time.time()
difference = end-start

In [78]:
print(tagged_seq)
print(difference)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'VERB'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('11th', 'ADJ'), ('June', 'NOUN'), ('2013', 'NUM'), ('.', '.')]
4.0856451988220215


In [103]:
# viterbi heuristic noun word tag for unknown words (Modified version3)
#lets see if a common tag given to all unknown words is better or worse than a rule based or lexicon tagger
def ViterbiCommonTag(words, train_bag = train_tagged_words):
    tagged_seq = Viterbi(words)
    V = list(set([pair[0] for pair in train_bag]))
    words_list = [pair[0] for pair in tagged_seq] #list of words in WORDS
    Viterbi_tags = [pair[1] for pair in tagged_seq]
    for key, word in enumerate(words_list):
        if word not in V: #unknown word
            Viterbi_tags[key] = 'NOUN'
    return list(zip(words_list, Viterbi_tags))

In [105]:
# tagging the test sentences
start = time.time()
tagged_seq4 = ViterbiCommonTag(test_tagged_words)
end = time.time()
difference = end-start

In [106]:
# calculating accuracy
check4 = [i for i, j in zip(tagged_seq4, test_run_base) if i == j] 
accuracy_commontag = len(check4)/len(tagged_seq4)
accuracy_commontag

0.937205355459174

In [107]:
#testing and evaluating with few test sentences

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 11th June 2013.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq =  ViterbiCommonTag(words)
end = time.time()
difference = end-start
print(tagged_seq)
print(difference)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'NOUN'), ('since', 'ADP'), ('2011', 'NOUN'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('11th', 'ADJ'), ('June', 'NOUN'), ('2013', 'NOUN'), ('.', '.')]
3.7758898735046387


In [72]:
# viterbi heuristic probablistic approach (Modified version4)

def ViterbiProbabilistic(words):
    state = []
    T = list(set([pair[1] for pair in train_tagged_words]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            #compute emission Probabilities and state Probabilities
            if(words[key] in V):
                emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
                state_probability = emission_p * transition_p
            else:
                state_probability = transition_p
                                   
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [73]:
# tagging the test sentences
start = time.time()
tagged_seq5 = ViterbiProbabilistic(test_tagged_words)
end = time.time()
difference = end-start

In [97]:
#calculating accuracy
check5 = [i for i, j in zip(tagged_seq5, test_run_base) if i == j] 
accuracy_probablistic = len(check5)/len(tagged_seq5)
accuracy_probablistic

0.935319630397888

In [108]:
#testing and evaluating with few test sentences

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 11th June 2013.'
words = word_tokenize(sentence_test)

start = time.time()
tagged_seq =  ViterbiProbabilistic(words)
end = time.time()
difference = end-start
print(tagged_seq)
print(difference)
#probablistic approch gives almost same accuracy as the common tagger

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'DET'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'DET'), ('since', 'ADP'), ('2011', 'DET'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('11th', 'ADJ'), ('June', 'NOUN'), ('2013', 'NOUN'), ('.', '.')]
3.3922619819641113


### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [116]:
accuracy_vanilla

0.9045823118989251

In [117]:
accuracy_commontag

0.937205355459174

In [118]:
accuracy_lexicon_based

0.9547425985291345

In [119]:
accuracy_rulebased

0.9566283235904205

In [120]:
accuracy_probablistic

0.935319630397888

In [None]:
# from the above approches, i would conclude that rule based modifications provided the best results for tagging unknown words

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [109]:
#testing and evaluating on Vanilla Flavor

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013. Google and Twitter made a deal in 2015 that gave Google access to Twitter''s firehose. Before entering politics, Donald Trump was a domineering businessman and a television personality. The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.'
words = word_tokenize(sentence_test)
tagged_seq =  Viterbi(words)
print(tagged_seq)


[('Android', 'ADJ'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'ADJ'), ('.', '.'), ('Android', 'ADJ'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'ADJ'), ('worldwide', 'ADJ'), ('on', 'ADP'), ('smartphones', 'ADJ'), ('since', 'ADP'), ('2011', 'ADJ'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'ADJ'), ('.', '.'), ('Google', 'ADJ'), ('and', 'CONJ'), ('Twitter', 'ADJ'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'ADJ'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'ADJ'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitters', 'ADJ'), ('firehose', 'ADJ'), ('.', '.'), ('Before', 'ADP'), ('entering', 'VERB'), ('politics', 'NOUN'), (',', '.'), ('Donald', 'NOUN'), ('Trump', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('domineering', 'ADJ'), ('businessman', 'NOUN'), ('and', 'CONJ'), ('a', 'DET'), ('tel

In [111]:
#testing and evaluating on Lexicon modifications

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013. Google and Twitter made a deal in 2015 that gave Google access to Twitter''s firehose. Before entering politics, Donald Trump was a domineering businessman and a television personality. The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.'
words = word_tokenize(sentence_test)
tagged_seq =  ViterbiLexiconBased(words)
print(tagged_seq)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'VERB'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'NUM'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'NUM'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'NOUN'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitters', 'NOUN'), ('firehose', 'NOUN'), ('.', '.'), ('Before', 'ADP'), ('entering', 'VERB'), ('politics', 'NOUN'), (',', '.'), ('Donald', 'NOUN'), ('Trump', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('domineering', 'VERB'), ('businessman', 'NOUN'), ('and', 'CONJ'), ('a', '

In [113]:
#testing and evaluating on Probablistic modification

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013. Google and Twitter made a deal in 2015 that gave Google access to Twitter''s firehose. Before entering politics, Donald Trump was a domineering businessman and a television personality. The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.'
words = word_tokenize(sentence_test)
tagged_seq =  ViterbiProbabilistic(words)
print(tagged_seq)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'DET'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'DET'), ('since', 'ADP'), ('2011', 'DET'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'DET'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'DET'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'X'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitters', 'VERB'), ('firehose', 'X'), ('.', '.'), ('Before', 'ADP'), ('entering', 'VERB'), ('politics', 'NOUN'), (',', '.'), ('Donald', 'NOUN'), ('Trump', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('domineering', 'NOUN'), ('businessman', 'NOUN'), ('and', 'CONJ'), ('a', 'DET'), (

In [114]:
#testing and evaluating on Common Tag modification

sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013. Google and Twitter made a deal in 2015 that gave Google access to Twitter''s firehose. Before entering politics, Donald Trump was a domineering businessman and a television personality. The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.'
words = word_tokenize(sentence_test)
tagged_seq =  ViterbiCommonTag(words)
print(tagged_seq)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'NOUN'), ('since', 'ADP'), ('2011', 'NOUN'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'NOUN'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'NOUN'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'NOUN'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitters', 'NOUN'), ('firehose', 'NOUN'), ('.', '.'), ('Before', 'ADP'), ('entering', 'VERB'), ('politics', 'NOUN'), (',', '.'), ('Donald', 'NOUN'), ('Trump', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('domineering', 'NOUN'), ('businessman', 'NOUN'), ('and', 'CONJ'), ('a'

In [122]:
#testing and evaluating on Rulebased modification
sentence_test = 'Android is a mobile operating system developed by Google. Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013. Google and Twitter made a deal in 2015 that gave Google access to Twitter''s firehose. Before entering politics, Donald Trump was a domineering businessman and a television personality. The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.'
words = word_tokenize(sentence_test)
tagged_seq =  ViterbiRuleBased(words)
print(tagged_seq)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'VERB'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'NUM'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'NUM'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'NOUN'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitters', 'NOUN'), ('firehose', 'NOUN'), ('.', '.'), ('Before', 'ADP'), ('entering', 'VERB'), ('politics', 'NOUN'), (',', '.'), ('Donald', 'NOUN'), ('Trump', 'NOUN'), ('was', 'VERB'), ('a', 'DET'), ('domineering', 'VERB'), ('businessman', 'NOUN'), ('and', 'CONJ'), ('a', '

In [121]:
# going ahead with rule based modification as it gave the most satisfactory results

##### From the above examples it is evident in a few cases such as unknown words like Android, Google, Twitter, FIFA were incorrectly marked by the vanilla flavor, are now correctly marked as NOUN with rule based modification. Also dates like 21st or year 2018 was incorrectly marked by vanilla flavor, are now correctly marked as NUM, other words like tournament, contested, firehose, domineering also correctly marked after running it with modifications