<a href="https://colab.research.google.com/github/nthanhkhang/vietnamese-pos-tagging/blob/main/Vietnamese_POS_Tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [368]:
import string
import numpy
import sklearn.svm as svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import io
from collections import defaultdict
import random

In [369]:
#!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_train.txt
#!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_test.txt

In [370]:
tagsetDict = {"N" : 1,
              "Np" : 2,
              "Nc" : 3,
              "Nu" : 4,
              "V" : 5,
              "A" : 6,
              "P" : 7,
              "L" : 8,
              "M" : 9,
              "R" : 10,
              "E" : 11,
              "C" : 12,
              "I" : 13,
              "T" : 14,
              "B" : 15,
              "Y" : 16,
              "S" : 17,
              "X" : 19 }
inverseTagsetDict = {tagsetDict[k]:k for k in tagsetDict}

In [371]:
wordBank = defaultdict()
bigramBank = defaultdict()
bigramDict = defaultdict(int)

bigramFreq = {x:[0]*18 for x in tagsetDict}
mostCommonBigrams = defaultdict()

prevParts = None

f_tagged_train = io.open("vi_train.txt", encoding='utf-8').readlines()
f_tagged_test = io.open("vi_test.txt", encoding='utf-8').readlines()

train = f_tagged_train[:6000]
test = f_tagged_test[:10000]
print(train)
print(test)

['Trên/E đường/N đi/V ,/, có/V một/M lần/N xe/N cô/N suýt/R rơi/V xuống/R vực/N ở/E đèo/N Ngoạn_Mục/Np ./.\n', '\n', 'Trong/E một/M trận/N đánh/V ác_liệt/A bên/N thành/N cổ/N Quảng_Trị/Np ,/, một/M loạt/N đạn/N pháo/N của/E kẻ_thù/N đã/R rơi/V trúng/A chỗ/N chiến_sĩ/N thông_tin/N Nguyễn_Văn_Thạc/Np .../... Hôm/N đó/P ngày/N 30/M -/- 7/M -/- 1972/M ./.\n', '\n', 'Sau/E khi/N H./Ny “/“ AK/Ny ”/” bị/V xộ/V khám/V ,/, Hoàng/Np đã/R hoàn_lương/V ,/, bây_giờ/P đang/R phụ/V việc/N cho/E gia_đình/N Quân/Np ./.\n', '\n', 'Hà/Np ,/, 21/M tuổi/N -/- làm/V nghề/N hớt_tóc/V ,/, khai/V :/: “/“ Ban_đầu/N nghe/V mấy/L ảnh/N nói/V thuốc/N lắc/V không/R nghiện/V nên/C em/N uống/V thử/V ./.\n', '\n', 'Theo/E anh/Nc Thông/Np ,/, người/N già/A ở/E Đức/Np thường/R sống/V một_mình/Ny và/C sống/V với/E những/L kỷ_vật/N của/E riêng/A mình/P mang/V về/R từ/E những/L chuyến/N du_lịch/V hay/C những/L tặng_vật/N của/E cả/P một/M đời/N ./.\n', '\n', 'Người/N đến/V muộn/A một_chút/L ,/, phà/N vừa/R rời/V bến/N thì/C

In [372]:
# Get training data
for line in train:
    l_split = line.split()
    for i,w in enumerate(l_split):
        parts = w.split("/")
        if i >= 1:
            prevParts = l_split[i-1].split("/")
        if len(parts) == 1 or \
            parts[1] not in tagsetDict:
            continue

        # if prev parts is a punctuation get the one before that
        if i >= 2 and prevParts[1] in string.punctuation:
            prevParts = l_split[i-1].split("/")

        word = parts[0]
        pos = parts[1]

        if i >= 1:
            prevWord = prevParts[0]
            prevPos = prevParts[1]
            bigramBank[ word ] = (pos, prevWord, prevPos)
            bigramDict[ (pos, prevPos) ] += 1

        # Makes word map to list of pos tags
        if word not in wordBank:
            wordBank[word] = [pos]
        else:
            wordBank[word] += [pos]

In [373]:
bi_grams=[]
tri_grams=[]
for sentence in set(wordBank):
  temp=0
  for s in list(sentence):
    if s == "_":
      temp+=1
    if temp == 1:
      bi_grams.append(sentence)
    elif temp == 2:
      tri_grams.append(sentence)
print(bi_grams)
print(tri_grams)

['hung_dữ', 'hung_dữ', 'hung_dữ', 'bao_la', 'bao_la', 'bao_la', 'dọn_dẹp', 'dọn_dẹp', 'dọn_dẹp', 'dọn_dẹp', 'người_mẫu', 'người_mẫu', 'người_mẫu', 'người_mẫu', 'Bạc_tình', 'Bạc_tình', 'Bạc_tình', 'Bạc_tình', 'Bạc_tình', 'tất_nhiên', 'tất_nhiên', 'tất_nhiên', 'tất_nhiên', 'tất_nhiên', 'tất_nhiên', 'Ma_Thoa', 'Ma_Thoa', 'Ma_Thoa', 'Ma_Thoa', 'Ma_Thoa', 'ở_trần', 'ở_trần', 'ở_trần', 'ở_trần', 'ở_trần', 'cắn_răng', 'cắn_răng', 'cắn_răng', 'cắn_răng', 'cắn_răng', 'Sophie_Quinn', 'Sophie_Quinn', 'Sophie_Quinn', 'Sophie_Quinn', 'Sophie_Quinn', 'Sophie_Quinn', 'vật_liệu', 'vật_liệu', 'vật_liệu', 'vật_liệu', 'vật_liệu', 'ruột_thịt', 'ruột_thịt', 'ruột_thịt', 'ruột_thịt', 'ruột_thịt', 'ra_đời', 'ra_đời', 'ra_đời', 'ra_đời', 'bất_khả_thi', 'bất_khả_thi', 'bất_khả_thi', 'bất_khả_thi', 'đau_khổ', 'đau_khổ', 'đau_khổ', 'đau_khổ', 'sống_sót', 'sống_sót', 'sống_sót', 'sống_sót', 'cường_quốc', 'cường_quốc', 'cường_quốc', 'cường_quốc', 'cường_quốc', 'Trò_chuyện', 'Trò_chuyện', 'Trò_chuyện', 'Trò_chuyện'

In [374]:
for k in bigramFreq:
    maxFreq = 0
    maxPos = "X"
    for i,x in enumerate(bigramFreq[k]):
        if x > maxFreq:
            maxFreq = x
            maxPos = inverseTagsetDict[i]
    mostCommonBigrams[k] = maxPos


print ("finished getting training data")

finished getting training data


In [375]:
# Features
def feature(word,wordIdx, lineSize, line):
    feat = [1]
    sentPercent = float(wordIdx)/float(lineSize)
    feat.append(sentPercent)

    if word[0].isupper() and wordIdx != 0:
        feat.append(1)
    else:
        feat.append(0)

    # label pos tags for each word
    posIdx_array = ([0] * len(tagsetDict))
    posSet = []
    if word in wordBank:
        posSet = wordBank[word]
    else:
        # see word we don't know
        if wordIdx == 0:
            posSet = list(tagsetDict.keys())[0] # Naive: always choose Np for word we havent seen
            posIdx_array[tagsetDict[posSet]] = 1
            return feat + posIdx_array + [0]
        else:
            prevWord = line[wordIdx-1]  #TODO this is questionable
            if prevWord in wordBank:
                prevPos = wordBank[prevWord]
                maxPos = mostCommonBigrams[prevPos]
                if prevPos == "E":
                    maxPos = 3
                posIdx_array[tagsetDict[maxPos]] = 1
                feat += posIdx_array + [tagsetDict[maxPos]]
                return feat
            else:
                posSet = list(tagsetDict.keys())[0]  # Naive: always choose Np for word we havent seen
                posIdx_array[tagsetDict[posSet]] = 1
                feat += posIdx_array + [0]
                return feat

    for pos in posSet:
        posIdx = tagsetDict[pos]
        posIdx_array[posIdx] += 1.0/len(wordBank[word])
    feat += (posIdx_array) + [0]

    # Don't put features here!

    return feat

In [376]:
# Create y (list of pos tags) and x (feature) data
y = []
X_train = []
for line in train:
    l_split = line.split()
    for i,w in enumerate(l_split):
        parts = w.split("/")
        word = parts[0]
        len_line = len(l_split)

        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        y.append(wordBank[word][0]) # [0] for first word tagged pos
        X_train.append(feature(word,i,len_line, l_split))
print ("finished creating y and xtrain")


finished creating y and xtrain


In [377]:
print ( len(X_train) )
print ( len(y) )

print(X_train[0:10])
print(y[0:10])
train_fit = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train,y)

print ("finished fitting")

57622
57622
[[1, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0000000000000002, 0, 0, 0, 0, 0, 0, 0], [1, 0.058823529411764705, 0, 0, 1.0000000000000009, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.11764705882352941, 0, 0, 0, 0, 0, 0, 0.8885630498533718, 0, 0, 0, 0, 0.10850439882697942, 0, 0, 0, 0.002932551319648094, 0, 0, 0, 0], [1, 0.23529411764705882, 0, 0, 0, 0, 0, 0, 0.9948453608247391, 0, 0, 0, 0, 0.003436426116838488, 0, 0, 0, 0.001718213058419244, 0, 0, 0, 0], [1, 0.29411764705882354, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.9999999999999948, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.35294117647058826, 0, 0, 0.9702970297029709, 0, 0, 0, 0.0297029702970297, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.4117647058823529, 0, 0, 1.000000000000001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.47058823529411764, 0, 0, 0.7162162162162168, 0, 0.2837837837837835, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0.5294117647058824, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 

In [378]:
X_test = []
correct_results = []
# Get testing data
for line in test:
    l_split = line.split()
    for i,w in enumerate(l_split):
        len_line = len(l_split)
        parts = w.split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        pos = parts[1]
        if word in string.punctuation or word == ":.":
            continue
        X_test.append(feature(word,i,len_line,l_split))
        correct_results.append(pos)
print("finished getting testing data")

finished getting testing data


In [379]:
predicted_results = train_fit.predict(X_test)
print("Predicted Results:",predicted_results[:10])
print("Correct Results  :",correct_results[:10])

numCorrect = 0
for cor,pred in zip(correct_results, predicted_results):
    if cor == pred:
        numCorrect += 1

accuracy = 1.0*numCorrect/len(correct_results)
print("Accuracy:",accuracy*100)

Predicted Results: ['E' 'N' 'V' 'A' 'N' 'N' 'N' 'N' 'N' 'C']
Correct Results  : ['E', 'N', 'V', 'A', 'N', 'N', 'N', 'N', 'N', 'C']
Accuracy: 85.60241580509775


In [380]:
import unicodedata as ud
import re
def syllablize(sentence):
    word = '\w+'
    non_word = '[^\w\s]'
    digits = '\d+([\.,_]\d+)+'
    
    patterns = []
    patterns.extend([word, non_word, digits])
    patterns = f"({'|'.join(patterns)})"
    
    sentence = ud.normalize('NFC', sentence)
    tokens = re.findall(patterns, sentence, re.UNICODE)
    return [token[0] for token in tokens]

In [381]:
def longest_matching(sentence, bi_grams, tri_grams):
    syllables = syllablize(sentence)
    syl_len = len(syllables)
    
    curr_id = 0
    word_list = []
    done = False
    
    while (curr_id < syl_len) and (not done):
        curr_word = syllables[curr_id]
        if curr_id >= syl_len - 1:
            word_list.append(curr_word)
            done = True
        else:
            next_word = syllables[curr_id + 1]
            pair_word = ' '.join([curr_word.lower(), next_word.lower()])
            if curr_id >= (syl_len - 2):
                if pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
            else:
                next_next_word = syllables[curr_id + 2]
                triple_word = ' '.join([pair_word, next_next_word.lower()])
                if triple_word in tri_grams:
                    word_list.append('_'.join([curr_word, next_word, next_next_word]))
                    curr_id += 3
                elif pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
    return word_list

In [382]:
wl=longest_matching('Tôi tên bất khả thi', bi_grams, tri_grams)
print(wl)

['Tôi', 'tên', 'bất', 'khả', 'thi']


In [383]:
X=[]
Y=[]
word = "Nhà"
X.append(feature(word,1,1,1))
Y.append(word)
Y.append(str(train_fit.predict(X)))
for word, tag in zip(word,Y):
    if word == '--n--': 
      print()
    else: 
      print(f'{word}/{tag}', end=' ')

N/Nhà h/['N'] 