Fix for more words

hongquan · Nov 6, 2019 · fb79970 · fb79970
1 parent 0bdb47a
commit fb79970
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 13 deletions.
diff --git a/README.rst b/README.rst
@@ -5,15 +5,17 @@ ViStickedWord
 A library to split a string of many Vietnamese words stuck together into single words. For example, it splits "khuckhuyu" into "khuc" and "khuyu".
 This library is not supposed to split Vietnamese by semantics, so it won't differentiate between single and compound words. It will not, for example, split "bacsitrongbenhvien" into "bac si" + "trong" + "benh vien".
 If you want such a feature, please use underthesea_.
-Due to my personal need, this library currently doesn't process fully marked words, like "họamikhônghótnữa". However, it is trivial for library user to strip those marks before passing to ``ViStickedWord``.
+Due to my personal needs, this library currently doesn't process fully marked words, like "họamikhônghótnữa". However, it is trivial for library users to strip those marks before passing the string to ``ViStickedWord`` (using Unidecode_).
 
 To make programming convenient, some terms are not used as accurately as they would be in linguistics. Please don't use my code as a source for learning Vietnamese grammar.
 
+----------
+
 Thư viện để tách một chùm từ tiếng Việt viết dính liền thành các từ đơn riêng lẻ, ví dụ tách "khuckhuyu" thành "khuc", "khuyu".
 Thư viện này không có ý định tách từ dựa theo ngữ nghĩa, nên nó sẽ không phân biệt từ đơn, từ ghép của tiếng Việt. Ví dụ, nó sẽ ko tách cụm "bacsitrongbenhvien" thành "bac si" + "trong" + "benh vien".
 Nếu bạn cần tính năng đó, nên sử dụng underthesea_.
 
-Do nhu cầu cá nhân nên hiện tại thư viện không xử lý từ có đầy đủ dấu, ví dụ "họamikhônghótnữa". Tuy nhiên, người dùng thư viện có thể loại bỏ dấu trước khi truyền vào ``ViStickedWord``. Việc đó không khó lắm.
+Do nhu cầu cá nhân nên hiện tại thư viện không xử lý từ có đầy đủ dấu, ví dụ "họamikhônghótnữa". Tuy nhiên, người dùng thư viện có thể loại bỏ dấu trước khi truyền vào ``ViStickedWord``. Việc đó không khó (dùng Unidecode_).
 
 Để thuận tiện cho việc lập trình, một số thuật ngữ không được dùng chính xác như cách dùng bên ngôn ngữ học. Vui lòng đừng xem code của tôi là nguồn tài liệu học ngữ pháp tiếng Việt.
 
@@ -32,9 +34,10 @@ Usage
 
     from vistickedword import split_words
 
-    split_words('ngoannghoeo')
+    split_words('ngoanngoeo')
 
-    # Returns ('ngoan', 'nghoeo')
+    # Returns ('ngoan', 'ngoeo')
 
 
 .. _underthesea: https://github.com/undertheseanlp/underthesea
+.. _Unidecode: https://pypi.org/project/Unidecode/
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vistickedword"
-version = "0.9.1"
+version = "0.9.2"
 description = "Library to split sticked Vietnamese words"
 authors = ["Nguyễn Hồng Quân <ng.hong.quan@gmail.com>"]
 license = "MIT"
@@ -24,6 +24,7 @@ pytest = "^3.0"
 pytest-pycodestyle = "^1.4.0"
 pytest-flakes = "^4.0.0"
 pygments = "^2.4.2"
+devtools = {version = "^0.5.1", extras = ["pygments"]}
 
 [build-system]
 requires = ["poetry>=0.12"]

diff --git a/tests/test_vistickedword.py b/tests/test_vistickedword.py
@@ -11,7 +11,7 @@
     ('hueoanh', ('hue', 'oanh')),
     ('queanh', ('que', 'anh')),
     ('ueoai', ('ue', 'oai')),
-    ('ngoannghoeo', ('ngoan', 'nghoeo')),
+    ('ngoanngoeo', ('ngoan', 'ngoeo')),
     ('khuckhuyu', ('khuc', 'khuyu')),
     ('BinhYen', ('Binh', 'Yen')),
     ('yanhsao', ('y', 'anh', 'sao')),
@@ -36,6 +36,13 @@
     ('khoanhkhac', ('khoanh', 'khac')),
     ('vinhcuu', ('vinh', 'cuu')),
     ('muonthuo', ('muon', 'thuo')),
+    ('nguoitinhthuoxua', ('nguoi', 'tinh', 'thuo', 'xua')),
+    ('daorua', ('dao', 'rua')),
+    ('nhonguoimuonnamcu', ('nho', 'nguoi', 'muon', 'nam', 'cu')),
+    ('matbiec', ('mat', 'biec')),
+    # FIXME
+    # ('nhuangmaybay', ('nhu', 'ang', 'may', 'bay')),
+    ('giuvungniemtin', ('giu', 'vung', 'niem', 'tin')),
 )
 
 

diff --git a/vistickedword.py b/vistickedword.py
@@ -8,7 +8,7 @@
 from typing import Match, Tuple, Sequence, List, Optional
 
 
-__version__ = '0.9.1'
+__version__ = '0.9.2'
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
@@ -25,7 +25,7 @@
     'oao': frozenset(['ng']),
     'oeo': frozenset(['ng']),
     'uay': frozenset(['ng', 'q']),
-    'uoi': frozenset(('b', 'c', 'd', 'ch', 'm', 'n', 's', 't', 'x')),
+    'uoi': frozenset(('b', 'c', 'd', 'ch', 'm', 'n', 'ng', 's', 't', 'x')),
     'uya': frozenset(['kh']),
     'uye': frozenset((None, 'ch', 'ng', 'h', 'kh', 'l', 't', 'q', 't', 'x')),
     'uyu': frozenset(['kh']),
@@ -35,6 +35,7 @@
     'eu': COMMON_INITIAL_CONSONANT,
     'ia': frozenset(['g']),
     'ie': COMMON_INITIAL_CONSONANT | {'gh', 'ngh'} - {'c', 'ng'},
+    'iu': frozenset(('ch', 'b', 'd', 'g', 'l', 'm', 'n', 'r', 't', 'th', 'x')),
     'oa': frozenset((None, 'b', 'ch', 'h', 'kh', 'l', 'n', 'ng', 't')),
     'oe': frozenset((None, 'kh', 'l', 'nh', 'ng', 'h', 't')),
     'ua': COMMON_INITIAL_CONSONANT | {None, 'q'},
@@ -83,6 +84,12 @@
             'final': 'm',  # Can only be: oam
             'initial': ALL_INITIAL_CONSONANT,
         },
+    ),
+    'ua': (
+        {
+            'final': 'ng',
+            'initial': ALL_INITIAL_CONSONANT - {'kh'}
+        },
     )
 }
 
@@ -199,18 +206,20 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
         test_final_consonants = ()
 
     rest_seq = original_word_sequence[pos_end_vowel:]
+    final_consonant = None
     # If rest_seq is empty, no need to scan for final consonant
     if rest_seq:
         for con in test_final_consonants:
             if con is None:
+                logger.debug('This vowel "%s" can go without final consonant', vowel)
                 word_pos.end = word_pos.end_vowel
                 break
             if rest_seq.lower().startswith(con.lower()):
                 # Determined final consonant of this word
                 final_consonant = con
                 word_pos.end = pos_end_vowel + len(final_consonant)
-                logger.debug('"%s" seems to be final consonant', final_consonant)
                 if not leading_source:
+                    logger.debug("No pool to find initial consonant")
                     break
                 if test_initial_consonants:
                     try:
@@ -222,11 +231,12 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
                         if not success:
                             continue
                     except IllegalCombination:
+                        logger.debug("Illegal combination. Test next possible final consonant.")
                         continue
                     word_pos.start = pos_start_vowel - len(initial_consonant)
                 break
     # Not found final consonant
-    if test_final_consonants and None not in test_final_consonants:  # This vowel can go without final consonant
+    if not final_consonant and test_final_consonants and None not in test_final_consonants:
         logger.error('This vowel "%s" needs a final consonant, but could not found in "%s".',
                      vowel, rest_seq)
     # Even when final consonant is not needed, still need to find initial
@@ -238,6 +248,12 @@ def scan_for_word(i: int, vowel_match: Match, vowel_occurences: Sequence[Match],
         except ConfusingState:
             word_pos.start = word_pos.start_vowel - len(leading_source)
             negotiate_expand_consonant(word_pos, word_positions, original_word_sequence)
+    elif None not in test_initial_consonants:
+        # This vowel needs initial consonant
+        logger.debug('Vowel "%s" needs an initial consonant. Negotiate with precedence word.', vowel)
+        negotiate_expand_consonant(word_pos, word_positions, original_word_sequence)
+    else:
+        logger.debug("Skip finding initial consonant.")
     # Save position of this word
     word_positions.append(word_pos)
     return word_pos
@@ -266,8 +282,8 @@ def negotiate_expand_consonant(word_pos: WordPosition, word_positions: List[Word
             logger.error('Negotiation failed.')
     except IndexError:
         logger.error('Previous word does not exist.')
-    except KeyError:
-        logger.error('Previous word doesnot need final consonant')
+    except KeyError as e:
+        logger.error('Vowel %s cannot go with (initial) consonant', e)
     return False