In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import brown
import re

**Loading all tagged words, along with their universal POS tags, from the news category of the Brown corpus**

In [2]:
# (word, tag) pairs for the 'news' category, using the universal tagset.
tagged = brown.tagged_words(categories='news', tagset='universal')
# Each item of `tagged` is a (word, tag) pair; keep only the tag.
pos = [tag for _word, tag in tagged]

**Defining the Markov model for the sequence of POS tags (the transition probabilities of the Hidden Markov Model)**

In [3]:
def hmm(text):
    """Estimate first-order Markov transition probabilities from a sequence.

    Every adjacent pair ``(text[i], text[i + 1])`` is counted, and the counts
    for each current state are then normalised so that they sum to 1.

    Parameters
    ----------
    text : sequence
        Ordered sequence of states (for example POS tags). The states may be
        any hashable values; the sequence must support slicing.

    Returns
    -------
    dict
        Nested mapping ``{current: {next: probability}}``. A state that only
        occurs as the last element has no outgoing transition and therefore
        no top-level key. Sequences with fewer than two elements give ``{}``.
    """
    markov = {}
    # Count each (current, next) pair of neighbouring states.
    for curr, next_ in zip(text, text[1:]):
        successors = markov.setdefault(curr, {})
        successors[next_] = successors.get(next_, 0) + 1

    # Turn the raw counts of every row into relative frequencies.
    for trans in markov.values():
        total = sum(trans.values())
        for state in trans:
            # Only existing keys are reassigned, so iterating is safe.
            trans[state] = trans[state] / total
    return markov

# Tag-to-tag transition probabilities estimated from the corpus tag sequence.
markov = hmm(pos)

**Creating the transition matrix (A matrix) for all the POS tags present in the news category of the Brown corpus**

In [4]:
setpos = set(pos)
L = len(setpos)
# Materialise the tag order once so that the rows, the columns and the
# labels of the matrix are all built from the same list.
tags = list(setpos)

# A[i][j] = P(next tag is tags[j] | current tag is tags[i]).
A = np.zeros([L, L])
for i, x in enumerate(tags):
    # A tag that never occurs as a "current" state (it only appears as the
    # very last element) has no entry in markov; its row is left at zero.
    outgoing = markov.get(x, {})
    for j, y in enumerate(tags):
        A[i][j] = outgoing.get(y, 0)

A_ = pd.DataFrame(A, index=tags, columns=tags)
print("The transition matrix is following:\n")
A_

The transition matrix is following:



Unnamed: 0,ADP,NOUN,X,ADV,ADJ,VERB,CONJ,PRT,.,NUM,DET,PRON
ADP,0.016916,0.30603,0.000324,0.010765,0.077135,0.037798,0.001052,0.00858,0.00858,0.0586,0.440227,0.033994
NOUN,0.212664,0.25964,0.000326,0.02065,0.017029,0.136752,0.047531,0.016833,0.252235,0.010537,0.013603,0.012201
X,0.054348,0.119565,0.554348,0.01087,0.0,0.021739,0.01087,0.01087,0.217391,0.0,0.0,0.0
ADV,0.156166,0.055539,0.0,0.075545,0.120334,0.273514,0.015826,0.028367,0.132875,0.024186,0.081218,0.036429
ADJ,0.07292,0.709961,0.000298,0.005517,0.06099,0.015956,0.025947,0.016105,0.065464,0.018938,0.005368,0.002535
VERB,0.17397,0.127926,6.9e-05,0.073269,0.051254,0.202445,0.009653,0.067088,0.064727,0.01764,0.179943,0.032016
CONJ,0.057416,0.345234,0.000368,0.058152,0.10784,0.176297,0.0,0.024291,0.01693,0.027604,0.143541,0.042326
PRT,0.102032,0.041519,0.0,0.033127,0.017668,0.651502,0.008392,0.009717,0.041519,0.011926,0.080389,0.002208
.,0.10254,0.234426,0.001006,0.052486,0.043095,0.100444,0.064056,0.02465,0.110673,0.025405,0.163243,0.077974
NUM,0.132502,0.412742,0.0,0.036934,0.069714,0.046168,0.029086,0.006925,0.235919,0.015235,0.010619,0.004155


**Determining the Pi matrix by computing the relative frequency of each POS tag in the corpus**

In [5]:
# Materialise the tag order once instead of rebuilding the list for every
# corpus token.
tags = list(setpos)

# Pi[0][i] = share of corpus tokens whose tag is tags[i].
Pi = np.zeros([1, L])
for i, tag in enumerate(tags):
    Pi[0][i] = pos.count(tag) / len(pos)

Pi = pd.DataFrame(Pi, columns=tags, index=['pi matrix'])
print('The pi matrix is given by:')
Pi

The pi matrix is given by:


Unnamed: 0,ADP,NOUN,X,ADV,ADJ,VERB,CONJ,PRT,.,NUM,DET,PRON
pi matrix,0.122869,0.304851,0.000915,0.033305,0.066691,0.143197,0.02702,0.022515,0.118623,0.021541,0.113263,0.02521


**Determining the Pi matrix by multiplying the A matrix with itself a large number of times (each row of the resulting power approaches the stationary distribution of the tag chain)**

In [6]:
# Number of times A is multiplied by itself.
step = 10000

# Multiplying A by itself `step` times is the matrix power A**(step + 1);
# matrix_power computes it by repeated squaring instead of `step` separate
# matrix products.
A_n = np.linalg.matrix_power(A, step + 1)

A_n = pd.DataFrame(A_n)
A_n.columns = list(setpos)
# Every row carries the same label; only the first row is displayed below.
A_n.index = ['pi matrix'] * L
print('The pi matrix is given by:')
A_n.iloc[0:1]

The pi matrix is given by:


Unnamed: 0,ADP,NOUN,X,ADV,ADJ,VERB,CONJ,PRT,.,NUM,DET,PRON
pi matrix,0.122871,0.30485,0.000915,0.033306,0.06669,0.143199,0.027021,0.022516,0.118624,0.021541,0.113256,0.025211


**Determining the B matrix (the emission probability of each word given each POS tag) for a given sentence**

In [7]:
string = "Those gambling folks watch the race seeking for money"
# string = input("Enter your line:\n")
# Put a space in front of every period so that it becomes a token of its own.
string = re.sub('[.]',' .', string)
string = string.split()

pos_ = list(setpos)

# Single pass over the corpus: count how often each tag occurs and how often
# each (tag, word) pair occurs for the words of the sentence. This replaces
# one full corpus scan per matrix cell.
sentence_words = set(string)
tag_totals = dict.fromkeys(pos_, 0)
pair_counts = {}
for word, tag in tagged:
    tag_totals[tag] = tag_totals.get(tag, 0) + 1
    if word in sentence_words:
        pair_counts[(tag, word)] = pair_counts.get((tag, word), 0) + 1

# B[m][n] = count(tag m emits word n) / count(tag m).
# Words are matched exactly, so the comparison is case-sensitive.
B = np.zeros([len(pos_), len(string)])
for m, tag in enumerate(pos_):
    for n, word in enumerate(string):
        B[m][n] = pair_counts.get((tag, word), 0) / tag_totals[tag]

In [8]:
# Label the rows with the POS tags and the columns with the sentence words.
# np.asarray hands the constructor the plain values, so the labels are applied
# rather than used to look up existing ones.
B = pd.DataFrame(np.asarray(B), index=pos_, columns=string)
B

Unnamed: 0,Those,gambling,folks,watch,the,race,seeking,for,money
ADP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076163,0.0
NOUN,0.0,0.0,6.5e-05,6.5e-05,0.0,0.00062,0.0,0.0,0.000946
X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000597,0.0
ADJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VERB,0.0,0.000139,0.0,0.000278,0.0,6.9e-05,0.000417,0.0,0.0
CONJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NUM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# N-gram model

In [9]:
# Sentence whose n-grams are printed below.
txt = 'Let us make the ngram model'
# Alternative: read the sentence interactively instead of hard-coding it.
# txt = input("Your sentence:\n")
# Size of each n-gram, read from the user; int() raises ValueError when the
# entered text is not an integer.
n = int(input("Value of n:\n"))

def ngram(n, sent=txt):
    """Print every contiguous n-word window (n-gram) of a sentence.

    The sentence is split on whitespace and each n-gram is printed as a list
    of words, from left to right. Nothing is returned.

    Parameters
    ----------
    n : int
        Number of words per n-gram; must be at least 1.
    sent : str, optional
        Sentence to process. The default is the value ``txt`` had when this
        function was defined.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or greater than the number of words.
    """
    word_list = sent.split()
    if n < 1:
        # A non-positive n has no meaningful n-grams; fail loudly instead of
        # silently printing nothing.
        raise ValueError('n must be a positive integer.')
    if n > len(word_list):
        raise ValueError('n is greater than the number of words in sentence.')
    if n == len(word_list):
        print('Value of n equals to the length of Sentence')
    # Slide a window of n words across the sentence; when n equals the
    # sentence length this yields exactly one window, the whole sentence.
    for start in range(len(word_list) - n + 1):
        print(word_list[start:start + n])
                
# Print the n-grams of the default sentence for the chosen n.
ngram(n)

Value of n:
3
['Let', 'us', 'make']
['us', 'make', 'the']
['make', 'the', 'ngram']
['the', 'ngram', 'model']
