# Importing libraries

In [1]:
!pip install pyvi

import io
import string
import numpy
import random
import sklearn.svm as svm

from sklearn.svm import LinearSVC
from collections import defaultdict
from pyvi import ViTokenizer, ViPosTagger
from sklearn.multiclass import OneVsRestClassifier

Collecting pyvi
[?25l  Downloading https://files.pythonhosted.org/packages/10/e1/0e5bc6b5e3327b9385d6e0f1b0a7c0404f28b74eb6db59a778515b30fd9c/pyvi-0.1-py2.py3-none-any.whl (8.5MB)
[K     |████████████████████████████████| 8.5MB 5.4MB/s 
[?25hCollecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 48.2MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.7 pyvi-0.1 sklearn-crfsuite-0.3.6


# Data

In [2]:
!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_train.txt
!wget https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_test.txt

--2021-03-16 11:19:19--  https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1664450 (1.6M) [text/plain]
Saving to: ‘vi_train.txt’


2021-03-16 11:19:19 (16.9 MB/s) - ‘vi_train.txt’ saved [1664450/1664450]

--2021-03-16 11:19:19--  https://raw.githubusercontent.com/nthanhkhang/Natural-Language-Processing/main/Data/vi_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 200159 (195K) [text/plain]
Saving to: ‘vi_test.txt’


2021-03-16 11

# Tagset

In [3]:
tagsetDict = {"N" : 1,
              "Np" : 2,
              "Nc" : 3,
              "Nu" : 4,
              "V" : 5,
              "A" : 6,
              "P" : 7,
              "L" : 8,
              "M" : 9,
              "R" : 10,
              "E" : 11,
              "C" : 12,
              "I" : 13,
              "T" : 14,
              "B" : 15,
              "Y" : 16,
              "S" : 17,
              "X" : 18 }
inverseTagsetDict = {tagsetDict[k]:k for k in tagsetDict}

# Check some of the tagged words.


In [4]:
wordBank = defaultdict()
bigramBank = defaultdict()
bigramDict = defaultdict(int)

bigramFreq = {x:[0]*18 for x in tagsetDict}
mostCommonBigrams = defaultdict()

prevParts = None

f_tagged_train = io.open("vi_train.txt", encoding='utf-8').readlines()
f_tagged_test = io.open("vi_test.txt", encoding='utf-8').readlines()

train = f_tagged_train[:12000]
test = f_tagged_test[:10000]
print(train[:10])
print(test[:10])

['\n', 'Trên/E đường/N đi/V ,/, có/V một/M lần/N xe/N cô/N suýt/R rơi/V xuống/R vực/N ở/E đèo/N Ngoạn_Mục/Np ./.\n', '\n', 'Trong/E một/M trận/N đánh/V ác_liệt/A bên/N thành/N cổ/N Quảng_Trị/Np ,/, một/M loạt/N đạn/N pháo/N của/E kẻ_thù/N đã/R rơi/V trúng/A chỗ/N chiến_sĩ/N thông_tin/N Nguyễn_Văn_Thạc/Np .../... Hôm/N đó/P ngày/N 30/M -/- 7/M -/- 1972/M ./.\n', '\n', 'Sau/E khi/N H./Ny “/“ AK/Ny ”/” bị/V xộ/V khám/V ,/, Hoàng/Np đã/R hoàn_lương/V ,/, bây_giờ/P đang/R phụ/V việc/N cho/E gia_đình/N Quân/Np ./.\n', '\n', 'Hà/Np ,/, 21/M tuổi/N -/- làm/V nghề/N hớt_tóc/V ,/, khai/V :/: “/“ Ban_đầu/N nghe/V mấy/L ảnh/N nói/V thuốc/N lắc/V không/R nghiện/V nên/C em/N uống/V thử/V ./.\n', '\n', 'Theo/E anh/Nc Thông/Np ,/, người/N già/A ở/E Đức/Np thường/R sống/V một_mình/Ny và/C sống/V với/E những/L kỷ_vật/N của/E riêng/A mình/P mang/V về/R từ/E những/L chuyến/N du_lịch/V hay/C những/L tặng_vật/N của/E cả/P một/M đời/N ./.\n']
['Trên/E đường/N xuất_hiện/V nhiều/A cặp/N cha/N -/- con/N ,/, mẹ/

In [5]:
for line in train:
    l_split = line.split()
    #print(l_split)
    for i,w in enumerate(l_split):
        parts = w.split("/")
        if i >= 1:
            prevParts = l_split[i-1].split("/")
        if len(parts) == 1 or \
            parts[1] not in tagsetDict:
            continue

        if i >= 2 and prevParts[1] in string.punctuation:
            prevParts = l_split[i-1].split("/")

        word = parts[0]
        pos = parts[1]

        if i >= 1:
            prevWord = prevParts[0]
            prevPos = prevParts[1]
            bigramBank[ word ] = (pos, prevWord, prevPos)
            bigramDict[ (pos, prevPos) ] += 1

        if word not in wordBank:
            wordBank[word] = [pos]
        else:
            wordBank[word] += [pos]


In [6]:
bi_grams=[]
tri_grams=[]
qua_grams=[]
for sentence in set(wordBank):
  temp=0
  for s in list(sentence):
    if s == "_":
      temp+=1
    if temp == 0:
      bi_grams.append(sentence)
    elif temp == 2:
      tri_grams.append(sentence)
    elif temp == 4:
      qua_grams.append(sentence)
#print(bi_grams)
#print(tri_grams)
#print(qua_grams)

In [7]:
#finished getting training data
for k in bigramFreq:
    maxFreq = 0
    maxPos = "Np"
    for i,x in enumerate(bigramFreq[k]):
        if x > maxFreq:
            maxFreq = x
            maxPos = inverseTagsetDict[i]
    mostCommonBigrams[k] = maxPos

In [8]:
def Viterbi_rule_based(word,wordIdx, lineSize, line):
    feat = [1]
    sentPercent = float(wordIdx)/float(lineSize)
    feat.append(sentPercent)

    if word[0].isupper() and wordIdx != 0:
        feat.append(1)
    else:
        feat.append(0)

    posIdx_array = ([0] * len(tagsetDict))
    posSet = []
    if word in wordBank:
        posSet = wordBank[word]
    else:
        if wordIdx == 0:
            posSet = list(tagsetDict.keys())[0] 
            posIdx_array[tagsetDict[posSet]] = 1
            return feat + posIdx_array + [0]
        else:
            prevWord = line[wordIdx-1]
            if prevWord in wordBank:
                prevPos = wordBank[prevWord]
                maxPos = mostCommonBigrams[prevPos]
                if prevPos == "E":
                    maxPos = 3
                posIdx_array[tagsetDict[maxPos]] = 1
                feat += posIdx_array + [tagsetDict[maxPos]]
                return feat
            else:
                posSet = list(tagsetDict.keys())[0]  
                posIdx_array[tagsetDict[posSet]] = 1
                feat += posIdx_array + [0]
                return feat

    for pos in posSet:
        posIdx = tagsetDict[pos]
        posIdx_array[posIdx] += 1.0/len(wordBank[word])
    feat += (posIdx_array) + [0]
    #print(feat)
    return feat


In [9]:
#finished creating y and xtrain
y = []
X_train = []
for line in train:
    l_split = line.split()
    for i,w in enumerate(l_split):
        parts = w.split("/")
        word = parts[0]
        len_line = len(l_split)

        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        y.append(wordBank[word][0])
        X_train.append(Viterbi_rule_based(word,i,len_line, l_split))

# SVM

In [10]:
print (len(X_train))
print (len(y))
for i in range(10):
  print(i,":",X_train[i])
print(y[:10])
train_fit = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train,y)

114577
114577
0 : [1, 0.0, 0, 0, 0, 0, 0, 0, 0, 0.05555555555555555, 0, 0, 0, 0, 0.9444444444444448, 0, 0, 0, 0, 0, 0, 0]
1 : [1, 0.058823529411764705, 0, 0, 0.9999999999999959, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2 : [1, 0.11764705882352941, 0, 0, 0, 0, 0, 0, 0.8944281524926679, 0, 0, 0, 0, 0.09970674486803513, 0, 0, 0, 0.005865102639296188, 0, 0, 0, 0]
3 : [1, 0.23529411764705882, 0, 0, 0, 0, 0, 0, 0.9916247906197839, 0, 0, 0, 0, 0.005862646566164153, 0, 0, 0, 0.002512562814070352, 0, 0, 0, 0]
4 : [1, 0.29411764705882354, 0, 0, 0.0007524454477050414, 0, 0, 0, 0, 0, 0, 0, 0.9992475545522823, 0, 0, 0, 0, 0, 0, 0, 0, 0]
5 : [1, 0.35294117647058826, 0, 0, 0.9775784753363252, 0, 0, 0, 0.02242152466367713, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
6 : [1, 0.4117647058823529, 0, 0, 0.9913793103448257, 0, 0, 0, 0.008620689655172414, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
7 : [1, 0.47058823529411764, 0, 0, 0.7094339622641509, 0, 0.29056603773584905, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [11]:
#finished getting testing data
X_test = []
correct_results = []
for line in test:
    l_split = line.split()
    for i,w in enumerate(l_split):
        len_line = len(l_split)
        parts = w.split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        pos = parts[1]
        if word in string.punctuation or word == ":.":
            continue
        X_test.append(Viterbi_rule_based(word,i,len_line,l_split))
        correct_results.append(pos)
#print(X_test)

# Using the Longest Matching algorithm

In [12]:
import unicodedata as ud
import re
def syllablize(sentence):
    word = '\w+'
    non_word = '[^\w\s]'
    digits = '\d+([\.,_]\d+)+'
    
    patterns = []
    patterns.extend([word, non_word, digits])
    patterns = f"({'|'.join(patterns)})"
    
    sentence = ud.normalize('NFC', sentence)
    tokens = re.findall(patterns, sentence, re.UNICODE)
    return [token[0] for token in tokens]

In [13]:
def longest_matching(sentence, bi_grams, tri_grams):
    syllables = syllablize(sentence)
    syl_len = len(syllables)
    
    curr_id = 0
    word_list = []
    done = False
    
    while (curr_id < syl_len) and (not done):
        curr_word = syllables[curr_id]
        if curr_id >= syl_len - 1:
            word_list.append(curr_word)
            done = True
        else:
            next_word = syllables[curr_id + 1]
            pair_word = ' '.join([curr_word.lower(), next_word.lower()])
            if curr_id >= (syl_len - 2):
                if pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
            else:
                next_next_word = syllables[curr_id + 2]
                triple_word = ' '.join([pair_word, next_next_word.lower()])
                if triple_word in tri_grams:
                    word_list.append('_'.join([curr_word, next_word, next_next_word]))
                    curr_id += 3
                elif pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
    return word_list

# Test accuracy on subset of test data


In [14]:
predicted_results = train_fit.predict(X_test)
print("Predicted Results:",predicted_results[:12])
print("Correct Results  :",correct_results[:12])

numCorrect = 0
for cor,pred in zip(correct_results, predicted_results):
    if cor == pred:
        numCorrect += 1

accuracy = 1.0*numCorrect/len(correct_results)
print("Viterbi Algorithm Accuracy:",accuracy*100)

Predicted Results: ['E' 'N' 'V' 'A' 'N' 'N' 'N' 'N' 'N' 'C' 'V' 'P']
Correct Results  : ['E', 'N', 'V', 'A', 'N', 'N', 'N', 'N', 'N', 'C', 'C', 'T']
Viterbi Algorithm Accuracy: 87.34261439246596


In [15]:
from sklearn.metrics import classification_report
print('Results of the Markov hidden model combined with the Viterbi algorithm:\n')
print(classification_report(predicted_results, correct_results))

Results of the Markov hidden model combined with the Viterbi algorithm:

              precision    recall  f1-score   support

           A       0.77      0.90      0.83      1241
           C       0.88      0.93      0.91       782
           E       0.93      0.82      0.87      1641
           I       0.33      0.75      0.46         4
           L       0.97      0.93      0.95       432
           M       0.90      1.00      0.94       744
           N       0.95      0.82      0.88      6504
          Nc       0.53      0.78      0.63       399
          Np       0.77      0.98      0.86       698
          Nu       0.82      0.91      0.87       102
           P       0.98      0.95      0.96       992
           R       0.89      0.84      0.86      1745
           S       0.53      0.62      0.57        16
           T       0.28      0.69      0.40        59
           V       0.84      0.92      0.88      4178
           Y       0.00      0.00      0.00         1

    acc

# Check how a sentence is tagged by the POS tagger

In [16]:
wr ="Dù rất đắt nhưng tôi vẫn đồng ý"
A=[]
X=[]
wl = ViTokenizer.tokenize(wr)
wl = wl.split()
print(wl)
#print(Viterbi_rule_based(word,i,len_line, l_split))
for word in wl:
  X.append(Viterbi_rule_based(word,1,len(wl),wl))
X=str(train_fit.predict(X))
print(X)

['Dù', 'rất', 'đắt', 'nhưng', 'tôi', 'vẫn', 'đồng_ý']
['C' 'R' 'A' 'C' 'P' 'R' 'V']
