In [1]:
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

In [2]:
training_file = open("/home/iradspm/Msc/NLP/dataset/kikamba_bert_corpus")
all_lines = training_file.readlines()
training_file.close()

In [3]:
import re
def split_words():
    for words in all_lines:
        clean_data = re.sub(r'\n','', words)
        split_words=clean_data.split(' ')
    return split_words

In [4]:
tagged=list(split_words())
tagged

['Tene/JJ',
 'yĩla/RB',
 'Ĩsilaeli/NP',
 "yatongoew'e/VB",
 'nĩ/PRE',
 'Asili/NNS',
 ',/COMMA',
 'nĩkweethĩiwe/VB',
 'na/CONJ',
 'yũa/NN',
 'nthĩ/NN',
 'ĩsu/JJ',
 './F-STOP',
 'Kwoou/JJ',
 'mũndũ/NN',
 'ũmwe/JJ',
 'kuma/PRE',
 'Mbetheleemu/NP',
 'nthĩ/NN',
 'ya/PRE',
 'Yuta/NP',
 'nĩwaendie/VB',
 'e/RB',
 'na/CONJ',
 'mũka/NN',
 'na/CONJ',
 'ana/NNS',
 'make/PP$',
 'elĩ/NUM',
 'kwĩkala/VB',
 'kwa/RB',
 'kavinda/NN',
 'ũeninĩ/JJ',
 'nthĩ/NN',
 'ya/PRE',
 'Moavi/NP',
 './F-STOP',
 'Mũndũ/NN',
 'ũsu/JJ',
 'eetawa/VB',
 'Elimeleki/NP',
 ',/COMMA',
 'na/CONJ',
 'mũka/NN',
 'eetawa/VB',
 'Naũmi/NP',
 './F-STOP',
 'Ana/NNS',
 'make/PP$',
 'ũmwe/NUM',
 'eetawa/VB',
 'Maloni/NP',
 'na/CONJ',
 'ũla/DET',
 'ũngĩ/JJ',
 'eetawa/VB',
 'Kilioni/NP',
 './F-STOP',
 'Andũ/NNS',
 'asu/JJ',
 'maĩ/VB',
 'ma/PP$',
 'mũsyĩ/NN',
 'wa/PRE',
 'Aevilathi/NPS',
 'ala/JJ',
 'matwĩe/VB',
 'Mbetheleemu/NP',
 'nthĩ/NN',
 'ya/PRE',
 'Yuta/NP',
 './F-STOP',
 'Nĩmaendie/VB',
 'Moavi/NP',
 'matũa/VB',
 "kw'o/RB",
 './F-S

In [5]:
nltk_data=[nltk.tag.str2tuple(t) for t in tagged]

In [6]:
# split data into training and validation set in the ratio 80:20
train_set,test_set =train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

In [7]:
train_set[:5]

[('makamũtwaa', 'VB'),
 ('nĩweeyũmbanisye', 'VB'),
 ('we', 'PP'),
 ('kũya', 'VB'),
 ("kũkolany'a", 'VB')]

In [8]:
# create list of train and test tagged words
train_tagged_words = list(train_set)
test_tagged_words = list(test_set)
print(len(train_tagged_words))
print(len(test_tagged_words))

1959
490


In [9]:
train_tagged_words[:5]

[('makamũtwaa', 'VB'),
 ('nĩweeyũmbanisye', 'VB'),
 ('we', 'PP'),
 ('kũya', 'VB'),
 ("kũkolany'a", 'VB')]

In [10]:
test_tagged_words[:5]

[("ngataananw'a", 'VB'),
 ('.', 'F-STOP'),
 ('ala', 'JJ'),
 ('mũatĩĩe', 'VB'),
 ('Nĩwasyokie', 'VB')]

In [11]:
#use set datatype to check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)
 
# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

17
{'F-STOP', 'NNS', 'NPS', 'NN', 'PP', 'NUM', 'NP', 'JJ', 'VB', 'PRE', 'RB', 'CONJ', 'INJ', 'COMMA', 'DET', 'PP$', 'PUNC'}


In [12]:
#vocab

In [13]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

In [14]:
# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [15]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[0.07619048 0.01904762 0.         0.14285715 0.01904762 0.
  0.10476191 0.06666667 0.22857143 0.05714286 0.04761905 0.08571429
  0.         0.05714286 0.         0.03809524 0.05714286]
 [0.05555556 0.01388889 0.01388889 0.20833333 0.04166667 0.01388889
  0.04166667 0.04166667 0.16666667 0.02777778 0.09722222 0.08333334
  0.01388889 0.09722222 0.         0.02777778 0.05555556]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.33333334 0.33333334 0.
  0.         0.33333334 0.         0.         0.        ]
 [0.07450981 0.02352941 0.         0.10196079 0.00784314 0.
  0.06666667 0.07450981 0.22352941 0.03921569 0.11372549 0.08627451
  0.         0.07450981 0.00392157 0.04705882 0.0627451 ]
 [0.         0.0625     0.         0.1875     0.         0.
  0.         0.125      0.3125     0.         0.125      0.0625
  0.         0.         0.         0.         0.125     ]
 [0.         0.         0.         0.09090909 0.09090909 0.
  0.09090909 0

In [16]:
# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,F-STOP,NNS,NPS,NN,PP,NUM,NP,JJ,VB,PRE,RB,CONJ,INJ,COMMA,DET,PP$,PUNC
F-STOP,0.07619,0.019048,0.0,0.142857,0.019048,0.0,0.104762,0.066667,0.228571,0.057143,0.047619,0.085714,0.0,0.057143,0.0,0.038095,0.057143
NNS,0.055556,0.013889,0.013889,0.208333,0.041667,0.013889,0.041667,0.041667,0.166667,0.027778,0.097222,0.083333,0.013889,0.097222,0.0,0.027778,0.055556
NPS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0
NN,0.07451,0.023529,0.0,0.101961,0.007843,0.0,0.066667,0.07451,0.223529,0.039216,0.113725,0.086275,0.0,0.07451,0.003922,0.047059,0.062745
PP,0.0,0.0625,0.0,0.1875,0.0,0.0,0.0,0.125,0.3125,0.0,0.125,0.0625,0.0,0.0,0.0,0.0,0.125
NUM,0.0,0.0,0.0,0.090909,0.090909,0.0,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.0,0.090909,0.0,0.090909,0.0
NP,0.032895,0.065789,0.0,0.184211,0.013158,0.0,0.065789,0.065789,0.223684,0.052632,0.065789,0.092105,0.0,0.065789,0.0,0.026316,0.046053
JJ,0.045455,0.025974,0.0,0.181818,0.0,0.006494,0.077922,0.071429,0.233766,0.051948,0.11039,0.064935,0.006494,0.038961,0.0,0.045455,0.038961
VB,0.057208,0.043478,0.002288,0.112128,0.009153,0.006865,0.102975,0.086957,0.196796,0.04119,0.091533,0.077803,0.002288,0.075515,0.002288,0.050343,0.038902
PRE,0.041667,0.03125,0.010417,0.104167,0.0,0.0,0.09375,0.125,0.21875,0.041667,0.104167,0.041667,0.0,0.083333,0.0,0.052083,0.052083


In [17]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['F-STOP', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [18]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)) for x in range(10)]
 
# list of 10 sents on which we test the model
#test_run = [test_set[i] for i in rndom]
test_run = [test_set[i] for i in rndom]
 
# list of tagged words
test_run_base=test_run
#test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
#[pair[1] for pair in tagged_sents]
test_tagged_words = [tup[0] for tup in test_run]

In [19]:
rndom

[399, 226, 60, 4, 47, 467, 483, 481, 414, 299]

In [20]:
test_run

[('ũsu', 'JJ'),
 ('mũno', 'RB'),
 ('ũngĩ', 'NN'),
 ('Nĩwasyokie', 'VB'),
 ('Lũthi', 'NP'),
 ('nĩwaumisye', 'VB'),
 ('ila', 'JJ'),
 ('”', 'PUNC'),
 ('ĩsyĩtwa', 'NP'),
 ('thooa', 'VB')]

In [21]:
test_tagged_words

['ũsu',
 'mũno',
 'ũngĩ',
 'Nĩwasyokie',
 'Lũthi',
 'nĩwaumisye',
 'ila',
 '”',
 'ĩsyĩtwa',
 'thooa']

In [22]:
test_run_base

[('ũsu', 'JJ'),
 ('mũno', 'RB'),
 ('ũngĩ', 'NN'),
 ('Nĩwasyokie', 'VB'),
 ('Lũthi', 'NP'),
 ('nĩwaumisye', 'VB'),
 ('ila', 'JJ'),
 ('”', 'PUNC'),
 ('ĩsyĩtwa', 'NP'),
 ('thooa', 'VB')]

In [23]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  0.04178762435913086
Viterbi Algorithm Accuracy:  60.0


In [24]:
test_sent=" Ngai asya atĩĩ , “ Kĩw'ũ kĩla kĩ ĩtheo wa ĩtu kĩsyokeanĩe vandũ vamwe na nthĩ nyũmũ yumĩle , ” na ũu weethĩwa . "
pred_tags= Viterbi(test_sent.split())
print(pred_tags)

[('Ngai', 'NP'), ('asya', 'F-STOP'), ('atĩĩ', 'RB'), (',', 'COMMA'), ('“', 'PUNC'), ("Kĩw'ũ", 'F-STOP'), ('kĩla', 'RB'), ('kĩ', 'F-STOP'), ('ĩtheo', 'F-STOP'), ('wa', 'PRE'), ('ĩtu', 'F-STOP'), ('kĩsyokeanĩe', 'F-STOP'), ('vandũ', 'NN'), ('vamwe', 'RB'), ('na', 'CONJ'), ('nthĩ', 'NN'), ('nyũmũ', 'F-STOP'), ('yumĩle', 'F-STOP'), (',', 'COMMA'), ('”', 'PUNC'), ('na', 'CONJ'), ('ũu', 'RB'), ('weethĩwa', 'F-STOP'), ('.', 'F-STOP')]
