In [43]:
# Spec: Core i7 8550U 4 cores 8 threads
# Jupyter notebook on windows 11
# python 3.9.7

# DATASET
# https://github.com/UniversalDependencies/UD_Vietnamese-VTB 1400 sentences

# REQUIREMENTS
# conll==4.4.1
# ViSpacy==0.0.1 https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz
# Underthesea==1.3.4a0
# Vncorenlp==1.0.3 
# Java 8


In [44]:
from conllu import parse
import time

In [45]:

# Format for data [[[word, pos, entity], [word, pos, entity],...]]
with open ('data/vi_vtb-ud-train.conllu', 'r', encoding='utf-8') as f:
	text = f.read()

sentences = parse(text)

text = ''
sents = []
groundtruth = []
for tokenlist in sentences:
    tagged = []
    sent = tokenlist.metadata['text']
    for item in tokenlist:
        tagged.append([item['form'],item['xpos'],' '])
    text += sent + ' '
    groundtruth.append(tagged)
    sents.append(sent)

print(f"Num sentences: {len(sents)}")
print(f"Sentence: {sents[0]}\nTag: {groundtruth[0]}")

Num sentences: 1400
Sentence: mảnh đất của đạn bom không còn người nghèo.
Tag: [['mảnh', 'Nc', ' '], ['đất', 'N', ' '], ['của', 'E', ' '], ['đạn', 'N', ' '], ['bom', 'N', ' '], ['không', 'R', ' '], ['còn', 'V', ' '], ['người', 'N', ' '], ['nghèo', 'A', ' '], ['.', '.', ' ']]


In [46]:
class tokenizer:
    def __init__(self):
        pass
    def annotate(self, text):
        pass
    def tokenize(self, text):
        pass
    def close(self):
        pass

In [58]:
class Vi_Spacy(tokenizer):
	def __init__(self):
		super().__init__()

		import spacy
		self.nlp = spacy.load('vi_core_news_lg')

	def annotate(self,text):
		output = []
		doc = self.nlp(text)

		for token in doc:
		    output.append([token.text.replace('_',' '), token.tag_, ''])
		    #print(token.text, token.lemma_, token.tag_, token.pos_, token.dep_,
		    #        token.shape_, token.is_alpha, token.is_stop)

		return output


	def info(self):
		return('ViSpacy')
	
a = Vi_Spacy()
a.annotate("Cộng đồng xử lý ngôn ngữ tự nhiên")

[['Cộng đồng', 'N', ''],
 ['xử lý', 'V', ''],
 ['ngôn ngữ', 'N', ''],
 ['tự nhiên', 'A', '']]

In [48]:
class Underthesea(tokenizer):
	def __init__(self):
		super().__init__()
		from underthesea import word_tokenize
		from underthesea import ner
		self.word_tokenize = word_tokenize
		self.ner = ner

	def annotate(self,text):
		output = []
		ners = self.ner(text)
		for item in ners:
			output.append([item[0],item[1],item[3]])
		
		return output

	def tokenize(self, text):
		return self.word_tokenize(text)

	def info(self):
		return('Underthesea')

# a = Underthesea()
# a.annotate('Cộng đồng xử lý ngôn ngữ tự nhiên')

In [49]:
class VncoreNLP(tokenizer):
	# To perform word segmentation, POS tagging, NER and then dependency parsing
	# annotator = VnCoreNLP("VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g')
	def __init__(self):
		super().__init__()
		from vncorenlp import VnCoreNLP
		self.annotator = VnCoreNLP("VnCoreNLP-1.1.1.jar", annotators="wseg,pos,ner", max_heap_size='-Xmx2g')
		
	def annotate(self,text):
		output = []
		annotated_text = self.annotator.annotate(text)
		for sent in annotated_text['sentences']:
			for item in sent:
				output.append([item['form'].replace('_',' '), item['posTag'], item['nerLabel']])
		return output
	
	def tokenize(self, text):
		self.annotator.tokenize(text)
			
	def info(self):
		return('VnCoreNLP')

	def close(self):
		self.annotator.close()

# a = VncoreNLP()
# a.annotate("Cộng đồng xử lý ngôn ngữ tự nhiên")

In [50]:
for t in (Vi_Spacy,Underthesea,VncoreNLP):
    t = t()
    count = 0
    wordcount = 0
    poscount = 0
    sercount = 0


    time_annotate = 0
    time_tokenize = 0
    index = 0

    for sent in sents:

        start = time.time()
        predict = t.annotate(sent) # time for segmentation, postag, and ner
        time_annotate += time.time() - start

        start = time.time()
        predict_tokenize = t.tokenize(sent)
        time_tokenize += time.time() - start

        count += len(groundtruth[index])

        if len(predict) == len(groundtruth[index]): # only add to count if num predicted words equals that of groundtruth
            for item,gt in zip(predict,groundtruth[index]):  # item = [word, pos, entity]
                if item[0] == gt[0]:
                    wordcount += 1
                if item[1] == gt[1]:
                    poscount += 1
                if item[2] == gt[2]:
                    sercount += 1
        index += 1

    # Corrected segmented word and entity / total word count
    wordsegacc = wordcount/count
    posacc = poscount/count
    seracc = sercount/count

    print()
    print(t.info())
    print(f"Tagging time: {time_annotate:.5f}s Tokenize time: {time_tokenize:.5f}s Word segmentation acc: {wordsegacc:.5f} Pos tag acc: {posacc:.5f} ")

    t.close()


ViSpacy
Tagging time: 0.97577s Tokenize time: 0.00764s Word segmentation acc: 0.69662 Pos tag acc: 0.00000 

Underthesea
Tagging time: 25.44487s Tokenize time: 2.81359s Word segmentation acc: 0.79635 Pos tag acc: 0.59990 

VnCoreNLP
Tagging time: 46.65205s Tokenize time: 35.96454s Word segmentation acc: 0.77491 Pos tag acc: 0.62475 


In [51]:
from CocCocTokenizer import PyTokenizer

# load_nontone_data is True by default
T = PyTokenizer(load_nontone_data=True)

# tokenize_option:
# 	0: TOKENIZE_NORMAL (default)
#	1: TOKENIZE_HOST
#	2: TOKENIZE_URL
print(T.word_tokenize("xin chào, tôi là người Việt Nam", tokenize_option=0))

# output: ['xin', 'chào', ',', 'tôi', 'là', 'người', 'Việt_Nam']

ModuleNotFoundError: No module named 'CocCocTokenizer'