POSTagging dengan metode HMM-Viterbi. 

Contoh yang digunakan pada tutorial ini sesuai dengan contoh yang diberikan pada slide materi POSTagging. Asumsi yang digunakan adalah tabel emission dan transition probability sudah diketahui

In [1]:
import pandas as pd
from collections import defaultdict
import sklearn
from collections import Counter 
from itertools import chain

Pembacaan Data Latih

In [5]:
colnames = ['Word','Tag']
tsv_read = pd.read_csv('train.01.tsv', sep='\t',names=colnames,header=None)
tsv_read = tsv_read.astype(str)
tsv_read.head()

Unnamed: 0,Word,Tag
0,Pemerintah,NN
1,bahkan,RB
2,telah,MD
3,mencanangkan,VB
4,dana,NN


In [6]:
tsv_read['Word']

0          Pemerintah
1              bahkan
2               telah
3        mencanangkan
4                dana
             ...     
94402         balasan
94403          dengan
94404         senjata
94405        artileri
94406               .
Name: Word, Length: 94407, dtype: object

Inisialisasi tagset

In [7]:
count = 0
i = 1
j = 0
tag_count = {} # dictionary untuk menyimpan frekuensi tag
tag_count['<start>'] = 0
tags = []  # list untuk menyimpan tag per kata
tags.append('<start>') 
tagset = {} # dictionary untuk menyimpan tagset
for index, row in tsv_read.iterrows():
    word = row['Word'].lower()
    tag = row['Tag']
    key = (i,str(tag)) 
    if word != 'nan':
        if key not in tags:        
            if word == '.' and tag == 'Z':
                tags.append(str(tag))
                tags.append('<start>')
                tag_count['<start>'] += 1
                i += 2
            else:
                tags.append(str(tag))
                i += 1

        if tag in tag_count:
            tag_count[tag] += 1
        else:
            tag_count[tag] = 1

    if word == '.' and tag == 'Z':
        count += 1
    if count == 50:
        break

for t in tag_count:
    tagset[j] = t
    j += 1

In [8]:
tags.pop()
print(tags)

['<start>', 'NN', 'RB', 'MD', 'VB', 'NN', 'SC', 'VB', 'NN', 'NN', 'JJ', 'CC', 'JJ', 'NN', 'VB', 'RB', 'JJ', 'Z', 'SC', 'NN', 'PR', 'RB', 'JJ', 'IN', 'JJ', 'NN', 'IN', 'NN', 'CD', 'CC', 'Z', 'VB', 'PRP', 'Z', '<start>', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'JJ', 'VB', 'NN', 'NN', 'NN', 'NN', 'NNP', 'NNP', 'CD', 'SC', 'VB', 'CD', 'Z', 'CD', 'CD', 'Z', '<start>', 'NN', 'NN', 'NN', 'MD', 'VB', 'IN', 'NN', 'VB', 'PRP', 'NN', 'NN', 'JJ', 'IN', 'NNP', 'Z', '<start>', 'IN', 'NN', 'CD', 'NN', 'NN', 'NN', 'NN', 'VB', 'CD', 'NND', 'VB', 'VB', 'NNP', 'SYM', 'CD', 'CD', 'Z', '<start>', 'PRP', 'VB', 'SC', 'NN', 'NN', 'IN', 'VB', 'PRP', 'NN', 'NN', 'JJ', 'CD', 'CD', 'IN', 'NN', 'CD', 'Z', 'VB', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NN', 'NN', 'PR', 'VB', 'IN', 'NN', 'NNP', 'Z', 'SC', 'NN', 'VB', 'NN', 'NN', 'NN', 'NN', 'PR', 'Z', 'VB', 'PRP', 'Z', '<start>', 'SC', 'NN', 'NN', 'VB', 'SYM', 'CD', 'CD', 'SC', 'NN', 'NN', 'JJ', 'SYM', 'CD', 'CD', 'Z', '<start>', 'NN', 'NNP', 'MD', 'VB', 'SC', 'NN', 'N

In [9]:
print(tag_count)

{'<start>': 50, 'NN': 338, 'RB': 24, 'MD': 32, 'VB': 155, 'SC': 69, 'JJ': 57, 'CC': 39, 'Z': 118, 'PR': 32, 'IN': 118, 'CD': 97, 'PRP': 31, 'NNP': 178, 'NND': 4, 'SYM': 16, 'OD': 4, 'X': 1, 'NEG': 9, 'FW': 8, 'DT': 1}


In [10]:
print(tagset)

{0: '<start>', 1: 'NN', 2: 'RB', 3: 'MD', 4: 'VB', 5: 'SC', 6: 'JJ', 7: 'CC', 8: 'Z', 9: 'PR', 10: 'IN', 11: 'CD', 12: 'PRP', 13: 'NNP', 14: 'NND', 15: 'SYM', 16: 'OD', 17: 'X', 18: 'NEG', 19: 'FW', 20: 'DT'}


Inisialisasi Vocab

In [11]:
vocabs = {} # dictionary untuk menyimpan vocab
vocab_count = {} # dictionary untuk menyimpan frekuensi vocab
count = 0
i = 0
for index, row in tsv_read.iterrows():
    word = row['Word'].lower()
    tag = row['Tag']
    key = (i,str(word)) 
    if word != 'nan':
        if key not in vocabs:
            vocabs[i] = str(word)
            i += 1

        if word in vocab_count:
            vocab_count[word] += 1
        else:
            vocab_count[word] = 1

    if word == '.' and tag == 'Z':
        count += 1
    if count == 50:
        break

In [12]:
print(vocabs)

{0: 'pemerintah', 1: 'bahkan', 2: 'telah', 3: 'mencanangkan', 4: 'dana', 5: 'untuk', 6: 'memicu', 7: 'sektor', 8: 'usaha', 9: 'kecil', 10: 'dan', 11: 'menengah', 12: 'ukm', 13: 'tumbuh', 14: 'lebih', 15: 'baik', 16: ',', 17: 'karena', 18: 'sektor', 19: 'ini', 20: 'cukup', 21: 'kuat', 22: 'dalam', 23: 'krisis', 24: 'keuangan', 25: 'pada', 26: 'tahun', 27: '1997', 28: 'lalu', 29: ',', 30: 'kata', 31: '-nya', 32: '.', 33: 'gubernur', 34: 'bank', 35: 'indonesia', 36: 'burhanudin', 37: 'abdullah', 38: 'optimistis', 39: 'mencapai', 40: 'target', 41: 'proyeksi', 42: 'suku', 43: 'bunga', 44: 'indonesia', 45: 'sbi', 46: '2008', 47: 'yang', 48: 'mencapai', 49: '7,5', 50: '-', 51: '8,0', 52: 'persen', 53: '.', 54: 'badan', 55: 'kredit', 56: 'desa', 57: 'boleh', 58: 'dibilang', 59: 'sebagai', 60: 'tonggak sejarah', 61: 'berdiri', 62: '-nya', 63: 'lembaga', 64: 'keuangan', 65: 'mikro', 66: 'di', 67: 'indonesia', 68: '.', 69: 'pada', 70: 'tahun', 71: '1991', 72: 'jumlah', 73: 'utang', 74: 'luar nege

In [13]:
print(vocab_count)

{'pemerintah': 6, 'bahkan': 1, 'telah': 6, 'mencanangkan': 1, 'dana': 1, 'untuk': 13, 'memicu': 1, 'sektor': 2, 'usaha': 1, 'kecil': 2, 'dan': 20, 'menengah': 1, 'ukm': 1, 'tumbuh': 1, 'lebih': 3, 'baik': 2, ',': 65, 'karena': 7, 'ini': 8, 'cukup': 2, 'kuat': 2, 'dalam': 16, 'krisis': 1, 'keuangan': 4, 'pada': 14, 'tahun': 9, '1997': 1, 'lalu': 4, 'kata': 5, '-nya': 21, '.': 52, 'gubernur': 1, 'bank': 6, 'indonesia': 11, 'burhanudin': 1, 'abdullah': 1, 'optimistis': 1, 'mencapai': 6, 'target': 3, 'proyeksi': 1, 'suku': 2, 'bunga': 4, 'sbi': 1, '2008': 2, 'yang': 29, '7,5': 1, '-': 3, '8,0': 1, 'persen': 13, 'badan': 2, 'kredit': 1, 'desa': 1, 'boleh': 2, 'dibilang': 1, 'sebagai': 4, 'tonggak sejarah': 1, 'berdiri': 1, 'lembaga': 1, 'mikro': 1, 'di': 23, '1991': 1, 'jumlah': 2, 'utang': 2, 'luar negeri': 1, 'membengkak': 1, 'dua': 2, 'kali': 1, 'lipat': 1, 'menjadi': 3, 'us': 1, '$': 1, '45,725': 1, 'miliar': 7, 'kami': 3, 'sampaikan': 1, 'bahwa': 7, 'optimis': 1, 'mengenai': 3, 'tercap

Inisialisasi emission probability

In [14]:
emission_count = {}
count = 0
for index, row in tsv_read.iterrows():
    word = row['Word'].lower()
    tag = row['Tag']
    key = tag,word
    if word != 'nan':
        if key in emission_count:
            emission_count[key] = emission_count[key] + 1
        else:
            emission_count[key] = 1       
    if word == '.' and tag == 'Z':
        count += 1
    if count == 50:
        break

In [15]:
for vocab in vocab_count:
    for tag in tag_count:
        key = (tag,vocab)
        if key not in emission_count: 
            emission_count[key] = 0

In [16]:
emission_prob = {} 
for key in emission_count:
    tag, vocab = key
    emission_prob[key] = emission_count[key] / tag_count[tag]

In [17]:
print(emission_count)

{('NN', 'pemerintah'): 6, ('RB', 'bahkan'): 1, ('MD', 'telah'): 6, ('VB', 'mencanangkan'): 1, ('NN', 'dana'): 1, ('SC', 'untuk'): 10, ('VB', 'memicu'): 1, ('NN', 'sektor'): 2, ('NN', 'usaha'): 1, ('JJ', 'kecil'): 2, ('CC', 'dan'): 20, ('JJ', 'menengah'): 1, ('NN', 'ukm'): 1, ('VB', 'tumbuh'): 1, ('RB', 'lebih'): 3, ('JJ', 'baik'): 2, ('Z', ','): 65, ('SC', 'karena'): 7, ('PR', 'ini'): 8, ('RB', 'cukup'): 2, ('JJ', 'kuat'): 2, ('IN', 'dalam'): 14, ('JJ', 'krisis'): 1, ('NN', 'keuangan'): 2, ('IN', 'pada'): 14, ('NN', 'tahun'): 9, ('CD', '1997'): 1, ('CC', 'lalu'): 4, ('VB', 'kata'): 5, ('PRP', '-nya'): 21, ('Z', '.'): 50, ('NNP', 'gubernur'): 1, ('NNP', 'bank'): 6, ('NNP', 'indonesia'): 11, ('NNP', 'burhanudin'): 1, ('NNP', 'abdullah'): 1, ('JJ', 'optimistis'): 1, ('VB', 'mencapai'): 6, ('NN', 'target'): 3, ('NN', 'proyeksi'): 1, ('NN', 'suku'): 2, ('NN', 'bunga'): 4, ('NNP', 'sbi'): 1, ('CD', '2008'): 2, ('SC', 'yang'): 29, ('CD', '7,5'): 1, ('Z', '-'): 2, ('CD', '8,0'): 1, ('CD', 'per

In [18]:
print(emission_prob)

{('NN', 'pemerintah'): 0.01775147928994083, ('RB', 'bahkan'): 0.041666666666666664, ('MD', 'telah'): 0.1875, ('VB', 'mencanangkan'): 0.0064516129032258064, ('NN', 'dana'): 0.0029585798816568047, ('SC', 'untuk'): 0.14492753623188406, ('VB', 'memicu'): 0.0064516129032258064, ('NN', 'sektor'): 0.005917159763313609, ('NN', 'usaha'): 0.0029585798816568047, ('JJ', 'kecil'): 0.03508771929824561, ('CC', 'dan'): 0.5128205128205128, ('JJ', 'menengah'): 0.017543859649122806, ('NN', 'ukm'): 0.0029585798816568047, ('VB', 'tumbuh'): 0.0064516129032258064, ('RB', 'lebih'): 0.125, ('JJ', 'baik'): 0.03508771929824561, ('Z', ','): 0.5508474576271186, ('SC', 'karena'): 0.10144927536231885, ('PR', 'ini'): 0.25, ('RB', 'cukup'): 0.08333333333333333, ('JJ', 'kuat'): 0.03508771929824561, ('IN', 'dalam'): 0.11864406779661017, ('JJ', 'krisis'): 0.017543859649122806, ('NN', 'keuangan'): 0.005917159763313609, ('IN', 'pada'): 0.11864406779661017, ('NN', 'tahun'): 0.026627218934911243, ('CD', '1997'): 0.0103092783

Inisialisasi Transition Probability

In [19]:
transition_count = {} 
count = 0

for i in range(1, len(tags)):
    curr_tag = (tags[i-1], tags[i])
    if curr_tag in transition_count:
        transition_count[curr_tag] += 1
    else:
        transition_count[curr_tag] = 1

for i in tag_count:
    for j in tag_count:
        if (i,j) not in transition_count:
            transition_count[(i,j)] = 0

In [20]:
transition_prob = {} 
for key in transition_count:
    a, b = key 
    transition_prob[key] = transition_count[key] / tag_count[a] 

In [21]:
print(transition_count)

{('<start>', 'NN'): 14, ('NN', 'RB'): 4, ('RB', 'MD'): 4, ('MD', 'VB'): 21, ('VB', 'NN'): 52, ('NN', 'SC'): 27, ('SC', 'VB'): 24, ('NN', 'NN'): 113, ('NN', 'JJ'): 27, ('JJ', 'CC'): 1, ('CC', 'JJ'): 3, ('JJ', 'NN'): 10, ('NN', 'VB'): 21, ('VB', 'RB'): 2, ('RB', 'JJ'): 6, ('JJ', 'Z'): 4, ('Z', 'SC'): 8, ('SC', 'NN'): 20, ('NN', 'PR'): 23, ('PR', 'RB'): 4, ('JJ', 'IN'): 13, ('IN', 'JJ'): 1, ('NN', 'IN'): 29, ('IN', 'NN'): 70, ('NN', 'CD'): 12, ('CD', 'CC'): 3, ('CC', 'Z'): 3, ('Z', 'VB'): 12, ('VB', 'PRP'): 12, ('PRP', 'Z'): 8, ('Z', '<start>'): 49, ('<start>', 'NNP'): 11, ('NNP', 'NNP'): 76, ('NNP', 'JJ'): 1, ('JJ', 'VB'): 10, ('NN', 'NNP'): 35, ('NNP', 'CD'): 4, ('CD', 'SC'): 7, ('VB', 'CD'): 8, ('CD', 'Z'): 19, ('Z', 'CD'): 5, ('CD', 'CD'): 27, ('NN', 'MD'): 4, ('VB', 'IN'): 28, ('PRP', 'NN'): 7, ('IN', 'NNP'): 22, ('NNP', 'Z'): 43, ('<start>', 'IN'): 8, ('CD', 'NN'): 19, ('CD', 'NND'): 3, ('NND', 'VB'): 1, ('VB', 'VB'): 9, ('VB', 'NNP'): 6, ('NNP', 'SYM'): 1, ('SYM', 'CD'): 16, ('<sta

In [22]:
print(transition_prob)

{('<start>', 'NN'): 0.28, ('NN', 'RB'): 0.011834319526627219, ('RB', 'MD'): 0.16666666666666666, ('MD', 'VB'): 0.65625, ('VB', 'NN'): 0.33548387096774196, ('NN', 'SC'): 0.07988165680473373, ('SC', 'VB'): 0.34782608695652173, ('NN', 'NN'): 0.3343195266272189, ('NN', 'JJ'): 0.07988165680473373, ('JJ', 'CC'): 0.017543859649122806, ('CC', 'JJ'): 0.07692307692307693, ('JJ', 'NN'): 0.17543859649122806, ('NN', 'VB'): 0.0621301775147929, ('VB', 'RB'): 0.012903225806451613, ('RB', 'JJ'): 0.25, ('JJ', 'Z'): 0.07017543859649122, ('Z', 'SC'): 0.06779661016949153, ('SC', 'NN'): 0.2898550724637681, ('NN', 'PR'): 0.06804733727810651, ('PR', 'RB'): 0.125, ('JJ', 'IN'): 0.22807017543859648, ('IN', 'JJ'): 0.00847457627118644, ('NN', 'IN'): 0.08579881656804733, ('IN', 'NN'): 0.5932203389830508, ('NN', 'CD'): 0.03550295857988166, ('CD', 'CC'): 0.030927835051546393, ('CC', 'Z'): 0.07692307692307693, ('Z', 'VB'): 0.1016949152542373, ('VB', 'PRP'): 0.07741935483870968, ('PRP', 'Z'): 0.25806451612903225, ('Z'

Fungsi Viterbi

In [23]:
def viterbi(trans_prob, emission_prob, tokens):
    # create a path probability matrix viterbi[N,T]
    # N: banyaknya state
    # T: jumlah token

    T, N = len(tokens)+1, len(tagset) # token ditambah satu untuk start
    new_tokens = ['<s>'] + tokens
    print('T=',T,',N=',N)
    print('token:',new_tokens)
    viterbi_mat = [[0 for x in range(T)] for y in range(N)] 
    print('matriks viterbi setelah di-create:',viterbi_mat)
    print('baris adalah tag/state dan kolom adalah token\n')
    # create backpointers matrix
    backpointers = [[0 for x in range(T)] for y in range(N)] 

    # initial probability distribution over states (phi)
    # transition probability dengan previous state adalah <start>
    phi = {}
    for i in range (1,len(tagset)):
        phi[tagset[i]] = transition_prob[('<start>',tagset[i])]
                    
    # initialization
    # urutan index state sesuai dengan index di dictionary tags{}
    # inisialisasi dimulai dari state ke-1, state ke-0 sudah pasti <start>
    viterbi_mat[0][0] = 1.0 # untuk token <s>, tag = <start>
    for s in range(1,N):
        viterbi_mat[s][1] = phi[tagset[s]] * emission_prob[(tagset[s],new_tokens[1])]
        backpointers[s][1] = 0

    print('viterbi mat setelah inisialisasi, proses token pertama ',new_tokens[1],':')
    print(viterbi_mat)
    print('\n')

    # recursion step
    for t in range(2,T):
        print('token ke ',t,':',new_tokens[t])
        for s in range(1,N):
            # get max viterbi from previous transition
            max_prev_transition = 0.0
            max_state = 0
            print('menghitung nilai viterbi untuk state:',tagset[s])
            for i in range(1,N):                
                #selain transisi dari tag <start>, range mulai dari indeks 1
                temp_transition = viterbi_mat[i][t-1] * transition_prob[(tagset[i],tagset[s])]
                print('t-1 = ',t-1)   
                print('current calculation ',tagset[i],'i:',i,'=',viterbi_mat[i][t-1],'*',transition_prob[(tagset[i],tagset[s])],'=',temp_transition)           
                if temp_transition > max_prev_transition:
                    max_prev_transition = temp_transition
                    max_state = i
            viterbi_mat[s][t] = max_prev_transition * emission_prob[(tagset[s],new_tokens[t])]
            backpointers[s][t] = max_state
        print('viterbi mat setelah proses token: ',new_tokens[t])
        print(viterbi_mat)
        print('\n')

    

    # terminasi
    # get max probability in last column
    max_last_prob = 0.0
    best_last_tag = ''
    idx_best_last_tag = 0
    for i in range (1,N):
        if viterbi_mat[i][T-1] > max_last_prob:
            max_last_prob = viterbi_mat[i][T-1]
            best_last_tag = tagset[i]
            idx_best_last_tag = i
    print('last token = ',new_tokens[T-1],',tag = ',best_last_tag,',max_last_prob =',max_last_prob)

    best_path = []
    best_path.append(idx_best_last_tag)
    for i in range(T-1,1,-1):
        best_prev_tag = backpointers[idx_best_last_tag][i]
        print('best_prev_tag=',best_prev_tag)
        best_path.append(best_prev_tag)
    # reverse the order
    best_path = best_path[::-1]
    
    return viterbi_mat, best_path

Tes pada kalimat uji

In [24]:
colnames = ['Word','Tag']
tsv_read_test = pd.read_csv("test_sentences.tsv",sep="\t", names=colnames, header=None)
tsv_read_test = tsv_read_test.astype(str)
tsv_read_test.head()

Unnamed: 0,Word,Tag
0,Menteri,NNP
1,pertahanan,NNP
2,AS,NNP
3,dijadwalkan,VB
4,mengunjungi,VB


In [25]:
word_temp = [] 
tag_temp= [] 
testing_sentences = []
testing_tags = [] 

for index, row in tsv_read_test.iterrows():   
    word = row['Word'].lower()
    tag = row['Tag']
    if word_temp != 'nan':
      word_temp.append(word)
      tag_temp.append(tag)
      if word == '.' :
        testing_sentences.append(word_temp)
        testing_tags.append(tag_temp)
        word_temp = []
        tag_temp = []

In [26]:
print(testing_sentences)

[['menteri', 'pertahanan', 'as', 'dijadwalkan', 'mengunjungi', 'india', '.'], ['tata', 'power', 'menyuplai', 'batu bara', 'pada', 'tahun', '2000', '.'], ['pemerintah', 'hati-hati', 'dalam', 'mengelola', 'bumn', '.'], ['perusahaan', 'baru', 'tersebut', 'mencanangkan', 'target', 'perolehan', 'laba bersih', '.'], ['menteri', 'pertahanan', 'mengunjungi', 'pangkalan', 'udara', '.'], ['menurut', 'laporan', 'sekretaris', 'perusahaan', ',', 'laba bersih', 'meningkat', '.'], ['transaksi', 'penjualan', 'barang mewah', 'tahun', '2007', 'turun', '.'], ['menkeu', 'memperkirakan', 'inflasi', 'akan', 'meningkat', 'dibanding', 'tahun', 'lalu', '.'], ['kenaikan', 'tarif', 'didorong', 'oleh', 'target', 'laba bersih', 'yang', 'meningkat', '.'], ['makanan', 'dari', 'luar negeri', 'tidak', 'bisa', 'masuk', 'pasar', 'lokal', '.']]


In [36]:
print(testing_tags)

[['NNP', 'NNP', 'NNP', 'VB', 'VB', 'NNP', 'Z'], ['NNP', 'NNP', 'VB', 'NN', 'IN', 'NN', 'CD', 'Z'], ['NN', 'JJ', 'IN', 'VB', 'NN', 'Z'], ['NN', 'JJ', 'PR', 'VB', 'NN', 'NN', 'NN', 'Z'], ['NNP', 'NNP', 'VB', 'NN', 'NN', 'Z'], ['IN', 'NN', 'NN', 'NN', 'Z', 'NN', 'VB', 'Z'], ['NN', 'NN', 'NN', 'NN', 'CD', 'VB', 'Z'], ['NN', 'VB', 'NN', 'MD', 'VB', 'VB', 'NN', 'CC', 'Z'], ['NN', 'NN', 'VB', 'IN', 'NN', 'NN', 'SC', 'VB', 'Z'], ['NN', 'IN', 'NN', 'NEG', 'MD', 'VB', 'NN', 'JJ', 'Z']]


Tes print viterbi matrix yang terbentuk

In [28]:
path =[]
for sentence in testing_sentences: # dilakukan pengujian dari data test per kalimat yang sudah berbentuk token
    viterbi_mat, best_path =  viterbi(transition_prob,emission_prob,sentence)
    path.append(best_path)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
current calculation  RB i: 2 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  MD i: 3 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  VB i: 4 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  SC i: 5 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  JJ i: 6 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  CC i: 7 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  Z i: 8 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  PR i: 9 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  IN i: 10 = 5.42104647857485e-06 * 0.0 = 0.0
t-1 =  2
current calculation  CD i: 11 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  PRP i: 12 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  NNP i: 13 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  NND i: 14 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  SYM i: 15 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  OD i: 16 = 0.0 * 0.0 = 0.0
t-1 =  2
current calculation  X i: 17 = 0.0 * 0.0 = 0.0
t-1 =  2
current calc

Tes print best POSTag yang diperoleh

In [29]:
test_tags =[]
for t in testing_tags:
    test_tags += t
print(test_tags)

['NNP', 'NNP', 'NNP', 'VB', 'VB', 'NNP', 'Z', 'NNP', 'NNP', 'VB', 'NN', 'IN', 'NN', 'CD', 'Z', 'NN', 'JJ', 'IN', 'VB', 'NN', 'Z', 'NN', 'JJ', 'PR', 'VB', 'NN', 'NN', 'NN', 'Z', 'NNP', 'NNP', 'VB', 'NN', 'NN', 'Z', 'IN', 'NN', 'NN', 'NN', 'Z', 'NN', 'VB', 'Z', 'NN', 'NN', 'NN', 'NN', 'CD', 'VB', 'Z', 'NN', 'VB', 'NN', 'MD', 'VB', 'VB', 'NN', 'CC', 'Z', 'NN', 'NN', 'VB', 'IN', 'NN', 'NN', 'SC', 'VB', 'Z', 'NN', 'IN', 'NN', 'NEG', 'MD', 'VB', 'NN', 'JJ', 'Z']


Prediksi POSTag

In [33]:
pred_tags = []
for best in path:
    for tag in best:
        pred_tags.append(tagset[tag])
print(pred_tags)

['NNP', 'NNP', 'NNP', 'VB', 'VB', 'NNP', 'Z', 'NNP', 'NNP', 'VB', 'NN', '<start>', 'NN', 'CD', 'Z', 'NN', 'JJ', 'NN', 'VB', 'NN', 'Z', 'NNP', 'JJ', 'PR', 'VB', 'NN', 'NN', 'NN', 'Z', 'NNP', 'NNP', 'VB', 'NN', 'NN', 'Z', '<start>', 'NN', 'NNP', 'NNP', '<start>', 'NN', 'VB', 'Z', 'NN', 'NN', 'NN', 'NN', 'CD', 'VB', 'Z', 'NNP', 'VB', 'NN', '<start>', 'VB', 'VB', 'NN', 'CC', 'Z', 'NN', 'NN', 'VB', '<start>', 'NN', 'NN', '<start>', 'VB', 'Z', 'NN', '<start>', 'NN', 'NEG', '<start>', 'VB', 'NN', 'JJ', 'Z']


In [34]:
for i in range(len(test_tags)):
    if test_tags[i] == pred_tags[i]:
        count += 1

print('Akurasi dengan metode HMM Viterbi : ', count/len(test_tags))

Akurasi dengan metode HMM Viterbi :  0.8311688311688312
