In [32]:
import nltk
import numpy as np
import pandas as pd
import underthesea
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Utils

In [6]:
def read_file(file_name):
    return open(file_name, 'r', encoding='utf-8').read().split('\n')

def write_file(file_name, text):
    file = open(file_name, 'w', encoding='utf-8')
    for sentences in text:
        file.writelines(sentences + '\n')

In [7]:
def inspect_tag(tag) -> bool:
    pattern = re.compile(r'^[A-Z][a-z]?$')
    return bool(pattern.search(tag))

In [8]:
def inspect_word(word) -> bool:
    pattern = re.compile(r'[!"#$%&\'()*+,-./:;<=>?@\[\]^`{|}~“”\\]+')
    return not(bool(pattern.search(word)))

In [9]:
def get_raw_text(filename):
    """
    - trả về data nguyên bản (kể cả các invalid word, tag) trong file
    - bên cạnh đó, xử lí word và tag -> tách tag và word sau đó kẹp thành 1 tuple như này (word, tag)
    - loại bỏ các word không tag hoặc word có nhiều hơn 1 tag
    """ 
    text = read_file(filename)
    data = []

    for sentence in text:
        for duo in sentence.strip().split(' '):
            data.append(tuple(duo.split('/')))

    data = [duo for duo in data if len(duo) == 2]
    return data

In [10]:
def get_processed_data(filename):
    opening = ('<s>', '<s>')
    ending = ('</s>', '</s>')

    corpus = get_raw_text(filename)
    refreshed_data = []
    sentence = []

    for word, tag in corpus:
        if inspect_word(word) and inspect_tag(tag):
            sentence.append((word, tag))
        elif word in ['.', '...', ':', '!', '?', '-', '"']:
            
            refreshed_data.append([opening] + sentence + [ending])
            sentence.clear()

    return refreshed_data

In [11]:
def load_corpus(filename):
    data = get_processed_data(filename)
    corpus = []
    for sentence in data:
        corpus.extend(sentence)
    return corpus

In [15]:
path = "D:/01. sem1-22.23/NLP/dataset/pos-tagging/data.pos"
corpus = load_corpus(path)

In [17]:
corpus[:5]

[('<s>', '<s>'),
 ('Hải_tặc', 'N'),
 ('eo_biển', 'N'),
 ('Malacca', 'Np'),
 ('kỳ', 'N')]

## train test split 

In [19]:
train_set, test_set = train_test_split(corpus, test_size=0.2, random_state=27)

In [21]:
print("trainset size",len(train_set))
print("testset size",len(test_set))

trainset size 172512
testset size 43129


## dictionary parameters 

In [23]:
emission_count = Counter(train_set)
all_tags = [tag for _,tag in train_set]
all_words = [word for word,_ in train_set]
tag_count = Counter(all_tags)
transition_count = Counter(nltk.bigrams(all_tags))

In [25]:
tags = set(all_tags)
vocab = set(all_words)

print(f"there're {len(tags)} tags, including opening and closing tags")
print(f"they're {sorted(tags)}")

there're 29 tags, including opening and closing tags
they're ['</s>', '<s>', 'A', 'Ab', 'Ap', 'B', 'C', 'E', 'Eb', 'H', 'I', 'L', 'M', 'Mb', 'N', 'Nb', 'Nc', 'Np', 'Nu', 'Ny', 'P', 'R', 'S', 'T', 'V', 'Vb', 'Vy', 'X', 'Y']


# Hidden Markov Model with Viterbi algorithm

In [26]:
def P_t2_given_t1(t2, t1): # kết hợp Laplace smoothing
    alpha = 0.001
    return (transition_count[(t2, t1)]+ alpha)/ (tag_count[t1]+ alpha * len(tags))

In [27]:
def P_w_given_t(word, tag):
    return (emission_count[(word, tag)])/(tag_count[tag])

In [28]:
transition_probability = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(tags):
    for j, t2 in enumerate(tags):
        transition_probability[i, j] = P_t2_given_t1(t2, t1)

In [29]:
# transition_table
transition_table = pd.DataFrame(transition_probability, columns = tags, index=tags)

In [30]:
transition_table

Unnamed: 0,</s>,V,B,Vb,L,Nb,E,Np,S,H,...,P,Nc,Ap,Nu,Mb,<s>,I,N,Ny,Ab
</s>,0.068648,0.208156,0.0001770165,8.846403e-08,0.01787,0.001681,0.064402,0.040959,0.0007962648,8.846403e-08,...,0.043967,0.027424,8.846403e-08,0.004335,8.846403e-08,0.063429,0.0003539446,0.232395,0.004246,8.846403e-08
V,0.065601,0.200966,5.785239e-05,0.0001734993,0.018359,0.001908,0.063548,0.04039,0.0006649988,2.891174e-08,...,0.042153,0.025471,2.891174e-08,0.004539,2.891174e-08,0.066642,0.0005204402,0.245692,0.004973,2.894065e-05
B,0.110988,0.110988,5.546619e-05,5.546619e-05,0.055522,5.5e-05,0.055522,0.055522,5.546619e-05,5.546619e-05,...,0.055522,5.5e-05,5.546619e-05,5.5e-05,5.546619e-05,0.055522,5.546619e-05,0.332853,5.5e-05,5.546619e-05
Vb,0.04544,0.408598,4.539471e-05,4.539471e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.539471e-05,4.539471e-05,...,4.5e-05,4.5e-05,4.539471e-05,4.5e-05,4.539471e-05,4.5e-05,4.539471e-05,0.272414,4.5e-05,4.539471e-05
L,0.062814,0.211682,3.140675e-07,0.0003143816,0.021671,0.003141,0.0625,0.034862,0.001256584,3.140675e-07,...,0.047739,0.024498,3.140675e-07,0.002513,3.140675e-07,0.057789,3.140675e-07,0.244973,0.005025,3.140675e-07
Nb,0.056073,0.214937,3.114983e-06,3.114983e-06,0.015578,3e-06,0.046728,0.043613,3.114983e-06,3.114983e-06,...,0.049843,0.043613,3.114983e-06,0.003118,3.114983e-06,0.074763,3.114983e-06,0.255432,3e-06,3.114983e-06
E,0.063492,0.19376,0.0001825392,0.0001825392,0.019704,0.001916,0.063948,0.039956,0.0007298831,9.122399e-08,...,0.043331,0.027276,9.122399e-08,0.004835,9.122399e-08,0.064495,0.0003649872,0.246943,0.004835,9.122399e-08
Np,0.062211,0.202112,1.446753e-07,0.00014482,0.018374,0.003183,0.062934,0.037182,0.001302222,1.446753e-07,...,0.046007,0.025029,1.446753e-07,0.005642,1.446753e-07,0.067129,0.0008681966,0.242187,0.003906,1.446753e-07
S,0.050721,0.144904,7.244854e-06,7.244854e-06,0.007252,7e-06,0.028987,0.057966,7.244854e-06,7.244854e-06,...,0.043476,0.057966,7.244854e-06,0.007252,7.244854e-06,0.072456,7.244854e-06,0.340515,0.014497,7.244854e-06
H,0.000972,0.000972,0.0009718173,0.0009718173,0.000972,0.000972,0.000972,0.000972,0.0009718173,0.0009718173,...,0.000972,0.000972,0.0009718173,0.000972,0.0009718173,0.000972,0.0009718173,0.972789,0.000972,0.0009718173


## HMM + viterbi

In [31]:
def viterbi(tokens):
    pred = []
    T = list(tags)
    for idx in range(len(tokens)):
        #initialise list of probability column for a given observation
        p = []
        for tag in T:
            if idx == 0:
                transition_p = transition_table.loc['<s>', tag]
            else:
                transition_p = transition_table.loc[pred[-1], tag]
            # compute emission and state probabilities
            emission_p = P_w_given_t(tokens[idx], tag)
            state_probability = emission_p * transition_p
            p.append(state_probability)

        argmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(argmax)]
        pred.append(state_max)
    return pred

In [33]:
test_word = [duo[0] for duo in test_set]
test_tag= [duo[1] for duo in test_set]

In [34]:
predict_tags = viterbi(test_word)
accuracy_score(test_tag, predict_tags)

0.8819587748382759

predict tag

In [35]:
def predict_tag(sent):
    tokens = underthesea.word_tokenize(sent, format="text")
    tokens = tokens.split()
    return list(zip(tokens,viterbi(tokens)))

In [37]:
s1 = 'Tôi thích học xử lí ngôn ngữ tự nhiên'
predict_tag(s1)

[('Tôi', 'P'),
 ('thích', 'V'),
 ('học', 'V'),
 ('xử_lí', '</s>'),
 ('ngôn_ngữ', 'N'),
 ('tự_nhiên', 'A')]

# HMM by nltk library

In [38]:
from nltk.tag import hmm

In [46]:
data_hmm = get_processed_data(path)
train_hmm, test_hmm = train_test_split(data_hmm, test_size=0.2, random_state=27)

In [48]:
tagger = nltk.HiddenMarkovModelTagger.train(train_hmm)

In [49]:
pred_hmm_tag = [tag for _,tag in tagger.tag(test_word)]

In [50]:
accuracy_score(test_tag, pred_hmm_tag)

0.8378585174708433