From fb79970435634b3443ada88243333e0e1dccd433 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nguy=E1=BB=85n=20H=E1=BB=93ng=20Qu=C3=A2n?=
Date: Wed, 6 Nov 2019 10:04:17 +0700
Subject: [PATCH] Fix for more words

---
 README.rst                  | 11 +++++++----
 poetry.lock                 | 22 +++++++++++++++++++++-
 pyproject.toml              |  3 ++-
 tests/test_vistickedword.py |  9 ++++++++-
 vistickedword.py            | 28 ++++++++++++++++++++++------
 5 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/README.rst b/README.rst
index 5637819..7456178 100644
--- a/README.rst
+++ b/README.rst
@@ -5,15 +5,17 @@ ViStickedWord
 
 A library to split a string of many Vietnamese words sticked together to single words. It, for example, split "khuckhuyu" to "khuc" and "khuyu". This library is not supposed to split Vietnamese by semantics, so it won't differentiate single or compound words. It will not, for example, split "bacsitrongbenhvien" to "bac si" + "trong" + "benh vien". If you want such a feature, please use underthesea_.
 
-Due to my personal need, this library currently doesn't process fully marked words, like "họamikhônghótnữa". However, it is trivial for library user to strip those marks before passing to ``ViStickedWord``.
+Due to my personal need, this library currently doesn't process words with full diacritics, like "họamikhônghótnữa". However, it is trivial for the library user to strip those marks before passing input to ``ViStickedWord`` (using Unidecode_).
 
 To make convenient for programming, some terminologies are not used accurately like it should be in linguistic. Please don't use my code as a source for learning Vietnamese grammar.
 
+----------
+
 Thư viện để tách một chùm từ tiếng Việt viết dính liền thành các từ đơn riêng lẻ, ví dụ tách "khuckhuyu" thành "khuc", "khuyu". Thư viện này không có ý định tách từ dựa theo ngữ nghĩa, nên nó sẽ không phân biệt từ đơn, từ ghép của tiếng Việt. Ví dụ, nó sẽ ko tách cụm "bacsitrongbenhvien" thành "bac si" + "trong" + "benh vien". Nếu bạn cần tính năng đó, nên sử dụng underthesea_.
 
-Do nhu cầu cá nhân nên hiện tại thư viện không xử lý từ có đầy đủ dấu, ví dụ "họamikhônghótnữa". Tuy nhiên, người dùng thư viện có thể loại bỏ dấu trước khi truyền vào ``ViStickedWord``. Việc đó không khó lắm.
+Do nhu cầu cá nhân nên hiện tại thư viện không xử lý từ có đầy đủ dấu, ví dụ "họamikhônghótnữa". Tuy nhiên, người dùng thư viện có thể loại bỏ dấu trước khi truyền vào ``ViStickedWord``. Việc đó không khó (dùng Unidecode_).
 
 Để thuận tiện cho việc lập trình, một số thuật ngữ không được dùng chính xác như cách dùng bên ngôn ngữ học. Vui lòng đừng xem code của tôi là nguồn tài liệu học ngữ pháp tiếng Việt.
 
@@ -32,9 +34,10 @@ Usage
 
     from vistickedword import split_words
 
-    split_words('ngoannghoeo')
+    split_words('ngoanngoeo')
 
-    # Returns ('ngoan', 'nghoeo')
+    # Returns ('ngoan', 'ngoeo')
 
 .. _underthesea: https://github.com/undertheseanlp/underthesea
+.. _Unidecode: https://pypi.org/project/Unidecode/
diff --git a/poetry.lock b/poetry.lock
index abad56a..495f09f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -29,6 +29,22 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 version = "0.4.1"
 
+[[package]]
+category = "dev"
+description = "Dev tools for python"
+name = "devtools"
+optional = false
+python-versions = ">=3.5"
+version = "0.5.1"
+
+[package.dependencies]
+[package.dependencies.Pygments]
+optional = true
+version = ">=2.2.0"
+
+[package.extras]
+pygments = ["Pygments (>=2.2.0)"]
+
 [[package]]
 category = "dev"
 description = "Read metadata from Python packages"
@@ -171,7 +187,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
 testing = ["pathlib2", "contextlib2", "unittest2"]
 
 [metadata]
-content-hash = "a465835ed71014156ae72de348466af7f0a60da25aaa4c2e10d725144f48330c"
+content-hash = "25371e766ee37ed44eeddf3811c994ed5d8b72bc9a892b8595d63cc954cd1059"
 python-versions = "^3.6"
 
 [metadata.files]
@@ -187,6 +203,10 @@ colorama = [
     {file = "colorama-0.4.1-py2.py3-none-any.whl", hash = "sha256:f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"},
     {file = "colorama-0.4.1.tar.gz", hash = "sha256:05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d"},
 ]
+devtools = [
+    {file = "devtools-0.5.1-py35.py36-none-any.whl", hash = "sha256:7a1f7db6ade0a71840ca4014d75dd72390aed2ef04e39e2b2445af7b3a3f4679"},
+    {file = "devtools-0.5.1.tar.gz", hash = "sha256:51ca8d2e15b8a862875a4837db2bafbc6cda409c069e960aec3f4bbd91fe9c08"},
+]
 importlib-metadata = [
     {file = "importlib_metadata-0.23-py2.py3-none-any.whl", hash = "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af"},
     {file = "importlib_metadata-0.23.tar.gz", hash = "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26"},
diff --git a/pyproject.toml b/pyproject.toml
index 7bacf63..58f0cd7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vistickedword"
-version = "0.9.1"
+version = "0.9.2"
 description = "Library to split sticked Vietnamese words"
 authors = ["Nguyễn Hồng Quân "]
 license = "MIT"
@@ -24,6 +24,7 @@ pytest = "^3.0"
 pytest-pycodestyle = "^1.4.0"
 pytest-flakes = "^4.0.0"
 pygments = "^2.4.2"
+devtools = {version = "^0.5.1", extras = ["pygments"]}
 
 [build-system]
 requires = ["poetry>=0.12"]
diff --git a/tests/test_vistickedword.py b/tests/test_vistickedword.py
index 45f6ab0..2d6424e 100644
--- a/tests/test_vistickedword.py
+++ b/tests/test_vistickedword.py
@@ -11,7 +11,7 @@
     ('hueoanh', ('hue', 'oanh')),
     ('queanh', ('que', 'anh')),
     ('ueoai', ('ue', 'oai')),
-    ('ngoannghoeo', ('ngoan', 'nghoeo')),
+    ('ngoanngoeo', ('ngoan', 'ngoeo')),
     ('khuckhuyu', ('khuc', 'khuyu')),
     ('BinhYen', ('Binh', 'Yen')),
     ('yanhsao', ('y', 'anh', 'sao')),
@@ -36,6 +36,13 @@
     ('khoanhkhac', ('khoanh', 'khac')),
     ('vinhcuu', ('vinh', 'cuu')),
     ('muonthuo', ('muon', 'thuo')),
+    ('nguoitinhthuoxua', ('nguoi', 'tinh', 'thuo', 'xua')),
+    ('daorua', ('dao', 'rua')),
+    ('nhonguoimuonnamcu', ('nho', 'nguoi', 'muon', 'nam', 'cu')),
+    ('matbiec', ('mat', 'biec')),
+    # FIXME
+    # ('nhuangmaybay', ('nhu', 'ang', 'may', 'bay')),
+    ('giuvungniemtin', ('giu', 'vung', 'niem', 'tin')),
 )
diff --git a/vistickedword.py b/vistickedword.py
index 59799b7..ad23172 100644
--- a/vistickedword.py
+++ b/vistickedword.py
@@ -8,7 +8,7 @@
 from typing import Match, Tuple, Sequence, List, Optional
 
 
-__version__ = '0.9.1'
+__version__ = '0.9.2'
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -25,7 +25,7 @@
     'oao': frozenset(['ng']),
     'oeo': frozenset(['ng']),
     'uay': frozenset(['ng', 'q']),
-    'uoi': frozenset(('b', 'c', 'd', 'ch', 'm', 'n', 's', 't', 'x')),
+    'uoi': frozenset(('b', 'c', 'd', 'ch', 'm', 'n', 'ng', 's', 't', 'x')),
     'uya': frozenset(['kh']),
     'uye': frozenset((None, 'ch', 'ng', 'h', 'kh', 'l', 't', 'q', 't', 'x')),
     'uyu': frozenset(['kh']),
@@ -35,6 +35,7 @@
     'eu': COMMON_INITIAL_CONSONANT,
     'ia': frozenset(['g']),
     'ie': COMMON_INITIAL_CONSONANT | {'gh', 'ngh'} - {'c', 'ng'},
+    'iu': frozenset(('ch', 'b', 'd', 'g', 'l', 'm', 'n', 'r', 't', 'th', 'x')),
     'oa': frozenset((None, 'b', 'ch', 'h', 'kh', 'l', 'n', 'ng', 't')),
     'oe': frozenset((None, 'kh', 'l', 'nh', 'ng', 'h', 't')),
     'ua': COMMON_INITIAL_CONSONANT | {None, 'q'},
@@ -83,6 +84,12 @@
             'final': 'm',  # Can only be: oam
             'initial': ALL_INITIAL_CONSONANT,
         },
+    ),
+    'ua': (
+        {
+            'final': 'ng',
+            'initial': ALL_INITIAL_CONSONANT - {'kh'}
+        },
     )
 }
 
@@ -199,18 +206,20 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
         test_final_consonants = ()
 
     rest_seq = original_word_sequence[pos_end_vowel:]
+    final_consonant = None
     # If rest_seq is empty, no need to scan for final consonant
     if rest_seq:
         for con in test_final_consonants:
             if con is None:
+                logger.debug('This vowel "%s" can go without a final consonant', vowel)
                 word_pos.end = word_pos.end_vowel
                 break
             if rest_seq.lower().startswith(con.lower()):
                 # Determined final consonant of this word
                 final_consonant = con
                 word_pos.end = pos_end_vowel + len(final_consonant)
-                logger.debug('"%s" seems to be final consonant', final_consonant)
                 if not leading_source:
+                    logger.debug('No leading characters left to find an initial consonant')
                     break
                 if test_initial_consonants:
                     try:
@@ -222,11 +231,12 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
                         if not success:
                             continue
                     except IllegalCombination:
+                        logger.debug('Illegal combination. Trying next possible final consonant.')
                         continue
                     word_pos.start = pos_start_vowel - len(initial_consonant)
                 break
     # Not found final consonant
-    if test_final_consonants and None not in test_final_consonants:  # This vowel can go without final consonant
+    if not final_consonant and test_final_consonants and None not in test_final_consonants:
         logger.error('This vowel "%s" needs a final consonant, but could not found in "%s".', vowel, rest_seq)
 
     # Even when final consonant is not needed, still need to find initial
@@ -238,6 +248,12 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
         except ConfusingState:
             word_pos.start = word_pos.start_vowel - len(leading_source)
             negotiate_expand_consonant(word_pos, word_positions, original_word_sequence)
+    elif None not in test_initial_consonants:
+        # This vowel needs an initial consonant
+        logger.debug('Vowel "%s" needs an initial consonant. Negotiating with the preceding word.', vowel)
+        negotiate_expand_consonant(word_pos, word_positions, original_word_sequence)
+    else:
+        logger.debug('Skipping search for an initial consonant.')
     # Save position of this word
     word_positions.append(word_pos)
     return word_pos
@@ -266,8 +282,8 @@ def negotiate_expand_consonant(word_pos: WordPosition, word_positions: List[Word
         logger.error('Negotiation failed.')
     except IndexError:
         logger.error('Previous word does not exist.')
-    except KeyError:
-        logger.error('Previous word doesnot need final consonant')
+    except KeyError as e:
+        logger.error('Vowel %s cannot go with an initial consonant', e)
     return False
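
A quick way to exercise the behaviour this patch targets is a sketch like the one below. It assumes the patched package is installed as ``vistickedword`` 0.9.2 and that the third-party Unidecode_ package (linked from the README) is available for the diacritics-stripping step the README suggests; the expected tuples are copied verbatim from the new entries in ``tests/test_vistickedword.py``::

    from unidecode import unidecode  # pip install Unidecode

    from vistickedword import split_words

    # Splits newly covered by this patch; expected values come from the test data.
    assert split_words('ngoanngoeo') == ('ngoan', 'ngoeo')
    assert split_words('matbiec') == ('mat', 'biec')
    assert split_words('giuvungniemtin') == ('giu', 'vung', 'niem', 'tin')

    # The README recommends stripping diacritics with Unidecode before splitting.
    # The test suite does not assert a split for this word, so just inspect it.
    print(split_words(unidecode('họamikhônghótnữa')))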