<a href="https://colab.research.google.com/github/kaustubhpatil2611/ai_assignments/blob/master/POS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import treebank
#from nltk.corpus import brown

In [86]:
# Download the corpus and tagset resources used by the
# treebank.tagged_sents(tagset='universal') call further down.
nltk.download('treebank')
nltk.download('universal_tagset')
# nltk.download('brown')  # presumably only needed if the Brown corpus is used instead

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [87]:
# Materialise the treebank sentences as a list; every sentence is a sequence of
# (word, tag) pairs using the universal tagset.
nltk_data = [*treebank.tagged_sents(tagset='universal')]

In [88]:
# Flatten the per-sentence lists into one corpus-wide list of (word, tag) pairs.
tagged_words = [pair for sentence in nltk_data for pair in sentence]

In [89]:
# Distinct tags and distinct word forms that occur in the flattened corpus.
tags = set(tag for _word, tag in tagged_words)
vocabulary = set(word for word, _tag in tagged_words)

In [95]:
# Emission probability of a word for a given tag.
def emission_proba(word, tag, tagged_corpus=None):
    """Estimate the emission probability P(word | tag) by relative frequency.

    Args:
        word: Word form whose emission probability is wanted.
        tag: Tag that is assumed to emit ``word``.
        tagged_corpus: Iterable of ``(word, tag)`` pairs. When omitted, the
            notebook-level ``tagged_words`` list is used; it is looked up at
            call time so a rebuilt corpus is picked up without re-running
            this cell.

    Returns:
        ``count(word tagged as tag) / count(tag)`` as a float, or ``0.0`` when
        ``tag`` never occurs in the corpus (instead of dividing by zero).
    """
    if tagged_corpus is None:
        tagged_corpus = tagged_words

    tag_count = 0        # how often `tag` appears in the corpus
    word_tag_count = 0   # how often `word` appears carrying `tag`
    for pair in tagged_corpus:
        if pair[1] == tag:
            tag_count += 1
            if pair[0] == word:
                word_tag_count += 1

    if tag_count == 0:
        return 0.0
    return word_tag_count / tag_count

In [96]:
# Transition probability / tag bigram probability.
def transition_proba(word2, word1, tagged_corpus=None):
    """Estimate the tag transition probability P(word2 | word1).

    Despite their names, ``word1`` and ``word2`` are compared against the tag
    element of each corpus pair: ``word1`` is the preceding tag and ``word2``
    the tag that follows it.

    Args:
        word2: Tag in the second (following) position of the bigram.
        word1: Tag in the first (preceding) position of the bigram.
        tagged_corpus: Iterable of ``(word, tag)`` pairs. When omitted, the
            notebook-level ``tagged_words`` list is used (looked up at call
            time).

    Returns:
        ``count(word1 immediately followed by word2) / count(word1)`` as a
        float, or ``0.0`` when ``word1`` never occurs (instead of dividing by
        zero). Bigrams are counted over consecutive entries of the flat corpus.
    """
    if tagged_corpus is None:
        tagged_corpus = tagged_words

    tag_sequence = [pair[1] for pair in tagged_corpus]
    previous_count = tag_sequence.count(word1)
    if previous_count == 0:
        return 0.0

    # Pair every tag with its successor and count the matching bigrams.
    bigram_count = sum(
        1
        for first, second in zip(tag_sequence, tag_sequence[1:])
        if first == word1 and second == word2
    )
    return bigram_count / previous_count

In [97]:
# Transition probability matrix for tags: entry [i, j] holds
# transition_proba(tag_j, tag_i), i.e. the probability of tag_j following tag_i.
# The tag order is frozen once and sorted: iterating the `tags` set directly
# can give a different row/column layout after a kernel restart.
tag_order = sorted(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        tags_matrix[i, j] = transition_proba(t2, t1)
# Label rows (previous tag) and columns (next tag) so lookups can use tag names.
tags_df = pd.DataFrame(tags_matrix, columns=tag_order, index=tag_order)

In [100]:
def viterbi_decoding(words, tagged_corpus=None):
    """Assign a tag to every word, choosing the best tag one word at a time.

    NOTE: despite the name, this is a greedy left-to-right decoder. Each word
    receives the tag that maximises ``emission * transition`` given the tag
    already chosen for the previous word; alternative paths are not kept.

    Args:
        words: Sequence of tokens to tag.
        tagged_corpus: List of ``(word, tag)`` pairs that supplies the
            candidate tags and the emission probabilities. When omitted, the
            notebook-level ``tagged_words`` list is used. Transition
            probabilities are always read from the notebook-level ``tags_df``
            table, so every candidate tag must be a label of that table.

    Returns:
        A list of ``(word, tag)`` tuples in input order.
    """
    if tagged_corpus is None:
        tagged_corpus = tagged_words

    # Sorted so that ties are broken the same way on every run.
    candidate_tags = sorted({pair[1] for pair in tagged_corpus})

    path = []
    for word in words:
        # The first word is scored as if it followed the '.' tag.
        previous_tag = path[-1] if path else '.'
        transitions = [tags_df.loc[previous_tag, tag] for tag in candidate_tags]
        scores = [
            emission_proba(word, tag, tagged_corpus) * transition_p
            for tag, transition_p in zip(candidate_tags, transitions)
        ]
        if max(scores) == 0:
            # No tag has a non-zero score (e.g. the word never occurs in the
            # corpus): fall back to the transition probabilities alone rather
            # than picking an arbitrary tag.
            scores = transitions

        best_tag = candidate_tags[scores.index(max(scores))]
        path.append(best_tag)
    return list(zip(words, path))

In [109]:
# Demo: split a sentence on whitespace and tag the resulting tokens.
test_sent = "I can play the game cricket whole day"
pos_tags = viterbi_decoding(words=test_sent.split())

In [110]:
# Show the (word, tag) pairs produced for the demo sentence.
print(pos_tags)

[('I', 'PRON'), ('can', 'VERB'), ('play', 'VERB'), ('the', 'DET'), ('game', 'NOUN'), ('cricket', 'NOUN'), ('whole', 'ADJ'), ('day', 'NOUN')]
