<a href="https://colab.research.google.com/github/nthanhkhang/vietnamese-pos-tagging/blob/main/Vietnamese_POS_Tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyvi
import string
import numpy
import sklearn.svm as svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import io
from collections import defaultdict
import random
from pyvi import ViTokenizer, ViPosTagger

ModuleNotFoundError: No module named 'pyvi'

In [None]:
# Fetch the tagged training and test corpora; a later cell opens them as
# "vi_train.txt" and "vi_test.txt" from the working directory.
!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_train.txt
!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_test.txt

In [None]:
# POS tag -> integer id. Ids start at 1 and are not contiguous:
# 18 is unused and "X" is mapped to 19.
tagsetDict = {
    "N": 1, "Np": 2, "Nc": 3, "Nu": 4,
    "V": 5, "A": 6, "P": 7, "L": 8,
    "M": 9, "R": 10, "E": 11, "C": 12,
    "I": 13, "T": 14, "B": 15, "Y": 16,
    "S": 17, "X": 19,
}
# Reverse lookup: integer id -> POS tag.
inverseTagsetDict = {tagId: tag for tag, tagId in tagsetDict.items()}

In [None]:
# Lookup tables filled by the training pass in a later cell.
# NOTE: defaultdict() without a default_factory behaves like a plain dict
# (missing keys raise KeyError); the type is kept for compatibility.
wordBank = defaultdict()         # word -> list of every tag seen for it
bigramBank = defaultdict()       # word -> (tag, previous word, previous tag)
bigramDict = defaultdict(int)    # (tag, previous tag) -> occurrence count

# One row of 18 zeroed counters per tag.
bigramFreq = {x:[0]*18 for x in tagsetDict}
mostCommonBigrams = defaultdict()

prevParts = None

# Read both corpora with context managers so the file handles are closed
# as soon as the lines are in memory (the bare io.open(...).readlines()
# form left them open until garbage collection).
with io.open("vi_train.txt", encoding='utf-8') as trainFile:
    f_tagged_train = trainFile.readlines()
with io.open("vi_test.txt", encoding='utf-8') as testFile:
    f_tagged_test = testFile.readlines()

# Cap both corpora at their first 10000 lines.
train = f_tagged_train[:10000]
test = f_tagged_test[:10000]
print(train[:10])
print(test[:10])

In [None]:
# Training pass: walk every "word/TAG" token of the training corpus and fill
#   wordBank   - word -> list of all tags observed for it (with repeats),
#   bigramBank - word -> (tag, previous word, previous tag) of its LAST occurrence,
#   bigramDict - (tag, previous tag) -> number of occurrences.
for line in train:
    l_split = line.split()
    for i, w in enumerate(l_split):
        parts = w.split("/")
        # Skip tokens without a "/TAG" suffix or whose tag is outside the tagset.
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue

        word = parts[0]
        pos = parts[1]

        # Locate the token that serves as left context, if there is one.
        prevParts = None
        if i >= 1:
            prevParts = l_split[i-1].split("/")
            # When the immediate predecessor is punctuation, step one token
            # further back. The original re-read l_split[i-1] here, which made
            # this rule a no-op; the i >= 2 guard shows i-2 was intended.
            if i >= 2 and len(prevParts) > 1 and prevParts[1] in string.punctuation:
                prevParts = l_split[i-2].split("/")

        # Record the bigram only when the context token actually carries a tag;
        # indexing prevParts[1] on an untagged token used to raise IndexError.
        if prevParts is not None and len(prevParts) > 1:
            prevWord = prevParts[0]
            prevPos = prevParts[1]
            bigramBank[word] = (pos, prevWord, prevPos)
            bigramDict[(pos, prevPos)] += 1

        wordBank.setdefault(word, []).append(pos)
#print(wordBank)

In [None]:
# Bucket the multi-syllable entries of wordBank by syllable count.
# Syllables of one word are joined with "_" in the corpus, so the number of
# underscores + 1 is the number of syllables:
#   1 underscore  -> bi_grams  (two syllables)
#   2 underscores -> tri_grams (three syllables)
#   3 underscores -> qua_grams (four syllables)
# The original tested the running count inside the per-character loop (so a
# word was appended once per character) and used thresholds 0/2/4, which never
# selected two-syllable words at all.
# Entries are stored lowercase with spaces instead of underscores because that
# is the form longest_matching builds its candidates in (' '.join of lowercased
# syllables); sets remove the duplicates created by case folding.
biGramSet = set()
triGramSet = set()
quaGramSet = set()
for entry in wordBank:
    underscoreCount = entry.count("_")
    lookupForm = entry.lower().replace("_", " ")
    if underscoreCount == 1:
        biGramSet.add(lookupForm)
    elif underscoreCount == 2:
        triGramSet.add(lookupForm)
    elif underscoreCount == 3:
        quaGramSet.add(lookupForm)

# Keep the public names as sorted lists so their content is deterministic.
bi_grams = sorted(biGramSet)
tri_grams = sorted(triGramSet)
qua_grams = sorted(quaGramSet)
#print(bi_grams)
#print(tri_grams)
#print(qua_grams)

In [None]:
# For every tag k, record the tag that most often FOLLOWS k in the training
# data, falling back to "X" when k was never seen as a left context.
# The counts come from bigramDict, keyed (tag, previous tag), which the
# training pass fills. The original scanned bigramFreq instead, but no visible
# cell ever writes to bigramFreq, so every tag was mapped to "X" (and a
# non-zero count would have hit inverseTagsetDict[0] / [18], which do not exist).
for k in bigramFreq:
    maxFreq = 0
    maxPos = "X"
    for candidate in tagsetDict:
        # .get() avoids inserting zero entries into the defaultdict.
        freq = bigramDict.get((candidate, k), 0)
        # Strict '>' keeps the first tag (in tagsetDict order) on ties.
        if freq > maxFreq:
            maxFreq = freq
            maxPos = candidate
    mostCommonBigrams[k] = maxPos

In [None]:
def Viterbi(word, wordIdx, lineSize, line):
    """Build the feature vector of one token for the classifier.

    Despite its name this function performs no Viterbi decoding; it only
    assembles per-token features from the module-level tables ``wordBank``,
    ``tagsetDict`` and ``mostCommonBigrams``.

    Layout of the returned list:
      * ``[0]``  constant 1,
      * ``[1]``  ``wordIdx / lineSize`` (0.0 when ``lineSize`` is 0),
      * ``[2]``  1 if the word starts with an upper-case letter and is not
        the first token of the line, else 0,
      * next ``max(tagsetDict.values()) + 1`` entries: one slot per tag id.
        For a known word each slot holds the share of that tag among the
        word's training occurrences; for an unknown word a single slot is
        set to 1 for the guessed tag,
      * last entry: id of the tag guessed from the previous word, or 0 when
        no such guess was made.

    Args:
        word: surface form of the token (without "/TAG").
        wordIdx: position of the token in ``line``.
        lineSize: number of tokens in the line.
        line: indexable sequence of the line's tokens; each may be a bare
            word or a "word/TAG" string. Only read when ``word`` is unknown
            and ``wordIdx > 0``.

    Returns:
        list of numbers as described above.
    """
    sentPercent = float(wordIdx) / float(lineSize) if lineSize else 0.0
    # bool(word) guards the empty string, where word[0] would raise.
    startsUpper = bool(word) and word[0].isupper() and wordIdx != 0
    feat = [1, sentPercent, 1 if startsUpper else 0]

    # Size the tag slots by the largest id, not by the number of tags: ids
    # run up to 19 while there are only 18 tags, so the old length of
    # len(tagsetDict) raised IndexError for tag "X".
    posIdx_array = [0] * (max(tagsetDict.values()) + 1)

    if word in wordBank:
        knownTags = wordBank[word]
        for pos in knownTags:
            posIdx_array[tagsetDict[pos]] += 1.0 / len(knownTags)
        return feat + posIdx_array + [0]

    # Unknown word: try to guess its tag from the previous word.
    if wordIdx > 0:
        # Tokens may still carry their "/TAG" suffix; wordBank is keyed by
        # the bare word, so strip it before the lookup.
        prevWord = line[wordIdx - 1].split("/")[0]
        if prevWord in wordBank:
            prevTags = wordBank[prevWord]
            # wordBank stores a list of tags; reduce it to the most frequent
            # one (ties resolved alphabetically) so it can be used as a key.
            prevPos = max(sorted(set(prevTags)), key=prevTags.count)
            if prevPos == "E":
                # The original hard-coded 3 here, which is the id of "Nc"
                # in the notebook's tagset but was then used as a tag name.
                maxPos = "Nc"
            else:
                maxPos = mostCommonBigrams.get(prevPos, "X")
            posIdx_array[tagsetDict[maxPos]] = 1
            return feat + posIdx_array + [tagsetDict[maxPos]]

    # No usable context: fall back to the first tag of the tagset.
    defaultPos = next(iter(tagsetDict))
    posIdx_array[tagsetDict[defaultPos]] = 1
    return feat + posIdx_array + [0]

In [None]:
# Build the training set: one feature row (X_train) and one gold tag (y)
# for every token of the training corpus that carries a tag from tagsetDict.
y = []
X_train = []
for line in train:
    l_split = line.split()
    len_line = len(l_split)   # constant per line, so computed once
    for i, w in enumerate(l_split):
        parts = w.split("/")
        # Skip untagged tokens and tags outside the tagset.
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        # Use the tag annotated on THIS token as the label. The original used
        # wordBank[word][0], i.e. the first tag ever seen for the word, which
        # mislabels every occurrence of an ambiguous word that has another tag.
        y.append(parts[1])
        X_train.append(Viterbi(word, i, len_line, l_split))

In [None]:
# Show the sizes and the first ten rows/labels of the training set.
for preview in (len(X_train), len(y), X_train[0:10], y[0:10]):
    print(preview)

# One linear SVM per tag (one-vs-rest), fitted on the training set.
svmPerTag = OneVsRestClassifier(LinearSVC(random_state=0))
train_fit = svmPerTag.fit(X_train, y)

In [None]:
# Build the evaluation set: feature rows (X_test) and gold tags
# (correct_results) for the tagged, non-punctuation tokens of the test corpus.
X_test = []
correct_results = []
for line in test:
    tokens = line.split()
    tokenCount = len(tokens)
    for position, token in enumerate(tokens):
        fields = token.split("/")
        # Keep only tokens that carry a tag from the tagset.
        if len(fields) > 1 and fields[1] in tagsetDict:
            surface, goldTag = fields[0], fields[1]
            # Punctuation (and the ":." artefact) is left out of the evaluation.
            if not (surface in string.punctuation or surface == ":."):
                X_test.append(Viterbi(surface, position, tokenCount, tokens))
                correct_results.append(goldTag)

In [None]:
# Predict a tag for every test row and show the first dozen next to the gold tags.
predicted_results = train_fit.predict(X_test)
print("Predicted Results:", predicted_results[:12])
print("Correct Results  :", correct_results[:12])

# Accuracy = share of tokens whose predicted tag equals the gold tag.
numCorrect = sum(
    1 for cor, pred in zip(correct_results, predicted_results) if cor == pred
)
accuracy = float(numCorrect) / len(correct_results)
print("Viterbi Algorithm Accuracy:", accuracy*100)

In [None]:
import unicodedata as ud
import re
def syllablize(sentence):
    """Split a sentence into syllable-level tokens.

    The sentence is first normalised to Unicode NFC, then scanned for, in
    order of priority:
      1. numbers with internal separators (``1.500``, ``3,14``, ``1_000``),
      2. runs of word characters,
      3. single non-word, non-space characters (punctuation).

    The numeric pattern is tried first: in the original alternation ``\\w+``
    came first and always consumed the leading digits, so the numeric branch
    could never match and "1.500" was split into "1", ".", "500".

    Args:
        sentence: text to split.

    Returns:
        list of token strings in their order of appearance.
    """
    # Raw strings: '\w' / '\d' in normal literals are invalid escape sequences.
    digits = r'\d+(?:[\.,_]\d+)+'
    word = r'\w+'
    non_word = r'[^\w\s]'
    pattern = '|'.join([digits, word, non_word])

    sentence = ud.normalize('NFC', sentence)
    return [match.group(0) for match in re.finditer(pattern, sentence, re.UNICODE)]

In [None]:
def longest_matching(sentence, bi_grams, tri_grams):
    """Greedy longest-match word segmentation.

    The sentence is split into syllables with ``syllablize``; scanning left
    to right, three consecutive syllables are merged when they form a known
    tri-gram, otherwise two when they form a known bi-gram, otherwise the
    syllable is emitted on its own. Merged syllables are joined with "_" and
    keep their original casing.

    Args:
        sentence: raw text to segment.
        bi_grams: iterable of known two-syllable words.
        tri_grams: iterable of known three-syllable words.
            Entries of both may be written with spaces or underscores and in
            any case; they are normalised to lowercase, space-separated form
            before matching. (Previously only entries already in that exact
            form could ever match.)

    Returns:
        list of word strings.
    """
    def _lookupForm(entry):
        # Candidates are built as lowercase syllables joined by one space.
        return entry.lower().replace("_", " ")

    # Sets give constant-time membership tests instead of list scans.
    known_pairs = {_lookupForm(entry) for entry in bi_grams}
    known_triples = {_lookupForm(entry) for entry in tri_grams}

    syllables = syllablize(sentence)
    syl_len = len(syllables)

    word_list = []
    curr_id = 0
    while curr_id < syl_len:
        # Try the longest span first.
        for span, vocabulary in ((3, known_triples), (2, known_pairs)):
            chunk = syllables[curr_id:curr_id + span]
            # A short chunk means the sentence ends before the span does.
            if len(chunk) == span and ' '.join(s.lower() for s in chunk) in vocabulary:
                word_list.append('_'.join(chunk))
                curr_id += span
                break
        else:
            # No n-gram starts here: emit the single syllable.
            word_list.append(syllables[curr_id])
            curr_id += 1
    return word_list

In [None]:
def toString(wl):
    """Segment a piece of text and tag every resulting word.

    The text is segmented with ``longest_matching`` (using the module-level
    ``bi_grams`` / ``tri_grams``), features are built with ``Viterbi`` and the
    tags are predicted with the fitted ``train_fit`` classifier.

    Compared with the original implementation this version
      * keeps the words in sentence order (it used to iterate ``set(wl)``),
      * gives each word its own complete predicted tag (it used to stringify
        the whole prediction array and take character ``[2]``, so every word
        received the same one-character tag),
      * passes the real position, length and token list to ``Viterbi``
        instead of the placeholder ``(1, 1, 1)``.

    Single punctuation characters are not sent to the classifier; they are
    tagged with themselves, extending the original ``"."`` -> ``"./."`` rule.

    Args:
        wl: text to tag (a sentence or a single token).

    Returns:
        str: ``"word/TAG "`` for every word, concatenated (note the trailing
        space after each pair); ``""`` for text without any token.
    """
    words = longest_matching(wl, bi_grams, tri_grams)
    tags = [None] * len(words)

    rows = []        # feature rows for the classifier
    rowOwners = []   # index in `words` that each row belongs to
    for idx, token in enumerate(words):
        if len(token) == 1 and token in string.punctuation:
            tags[idx] = token
        else:
            rows.append(Viterbi(token, idx, len(words), words))
            rowOwners.append(idx)

    # Predict all words in one call; skip it when there is nothing to predict.
    if rows:
        for idx, tag in zip(rowOwners, train_fit.predict(rows)):
            tags[idx] = str(tag)

    return ''.join(word + '/' + tag + ' ' for word, tag in zip(words, tags))

In [None]:
if __name__ == "__main__":
    # Demo: tokenize one sentence with pyvi and print every token with its tag.
    #wr = input('Text: ')
    wr = "Dù khá đắt nhưng tôi vẫn đồng ý."
    print(wr)
    wl = ViTokenizer.tokenize(wr).split()
    for token in wl:
        try:
            print(toString(token), end='')
        except Exception:
            # Best-effort fallback: a token that cannot be tagged is shown as "token/X ".
            print(token, "X", sep='/', end=' ')