Skip to content

Commit

Permalink
Fix/issue 1 (#2)
Browse files (browse the repository at this point in the history)
* Sort dictionary

* Remove some words

* Add prefix 'ke'

* Fix suffixes visitor

* Add test words
  • Loading branch information
kangfend committed Aug 29, 2022
1 parent d63e78a commit 3ac6af6
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 15 deletions.
8 changes: 0 additions & 8 deletions bahasa/data/kamus.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -2634,7 +2634,6 @@ beku
bekuk
bekuku
bekukung
bel
bela
belabas
belacak
Expand Down Expand Up @@ -8404,7 +8403,6 @@ geplak
gepok
geprak
gepuk
gera
gerabah
gerabak
gerabang
Expand Down Expand Up @@ -11770,7 +11768,6 @@ kamsia
kamu
kamuflase
kamus
kan
kana
kanaah
kanaat
Expand Down Expand Up @@ -18615,10 +18612,6 @@ nasib
nasihat
nasion
nasional
nasionalis
nasionalisasi
nasionalisme
nasionalistis
nasionisme
naskah
nasofaring
Expand Down Expand Up @@ -22493,7 +22486,6 @@ randi
randu
randuk
randung
rang
rangah
rangai
rangak
Expand Down
5 changes: 3 additions & 2 deletions bahasa/stemmer/context.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -51,7 +51,8 @@ def precedence(self, word):
match_affix(word, 'me', 'i'),
match_affix(word, 'di', 'i'),
match_affix(word, 'pe', 'i'),
match_affix(word, 'ter', 'i')
match_affix(word, 'ter', 'i'),
match_affix(word, suffix='is'),
])

def start_stemming_process(self):
Expand Down Expand Up @@ -101,7 +102,7 @@ def remove_prefixes(self):
return

# Split compound words
matches = re.match(r'^(ter|di|me|pe|ber)(.*)(i|kan|an|lah)$', self.original_word)
matches = re.match(r'^(ter|di|me|pe|ke|ber)(.*)(i|kan|an|lah)$', self.original_word)
if matches:
counter = 1
for _ in self.current_word:
Expand Down
6 changes: 4 additions & 2 deletions bahasa/stemmer/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,8 +16,10 @@ def load_dictionary(dictionary='default'):
dictionary = join(base_dir, 'data', 'kamus.txt')

with open(dictionary) as dictionary_file:
for data in dictionary_file:
word_sets.add(data.strip())
words = dictionary_file.read().splitlines()
words.sort(key=len, reverse=True)
word_sets = set(words)

return word_sets


Expand Down
6 changes: 3 additions & 3 deletions bahasa/stemmer/visitor/suffixes.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,7 +12,7 @@ class RemoveInflectionalParticle(object):
def visit(self, context):
result = self.remove(context.current_word)
if result != context.current_word:
removed_part = remove_suffix(context.current_word, result)
removed_part = context.current_word[len(result):]
removal = Removal(self, context.current_word, result,
removed_part, 'P')
context.add_removal(removal)
Expand All @@ -34,7 +34,7 @@ class RemoveDerivationalSuffix(object):
def visit(self, context):
result = self.remove(context.current_word)
if result != context.current_word:
removed_part = remove_suffix(context.current_word, result)
removed_part = context.current_word[len(result):]
removal = Removal(self, context.current_word, result,
removed_part, 'DS')
context.add_removal(removal)
Expand All @@ -58,7 +58,7 @@ class RemoveInflectionalPossessivePronoun(object):
def visit(self, context):
result = self.remove(context.current_word)
if result != context.current_word:
removed_part = remove_suffix(context.current_word, result)
removed_part = context.current_word[len(result):]
removal = Removal(self, context.current_word, result,
removed_part, 'PP')
context.add_removal(removal)
Expand Down
8 changes: 8 additions & 0 deletions tests/stemmer/test_stemmer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,14 @@ def test_stem_word(self):
self.assertEqual(self.stemmer.stem_word('penghancurleburan'), 'hancur lebur')
self.assertEqual(self.stemmer.stem_word('dilipatgandakan'), 'lipat ganda')
self.assertEqual(self.stemmer.stem_word('pertanggungjawaban'), 'tanggung jawab')
self.assertEqual(self.stemmer.stem_word('ketidakpercayaan'), 'tidak percaya')
self.assertEqual(self.stemmer.stem_word('dikuranginya'), 'kurang')
self.assertEqual(self.stemmer.stem_word('menyinari'), 'sinar')
self.assertEqual(self.stemmer.stem_word('dibelinya'), 'beli')
self.assertEqual(self.stemmer.stem_word('gerakan'), 'gerak')
self.assertEqual(self.stemmer.stem_word('menangis'), 'tangis')
self.assertEqual(self.stemmer.stem_word('perangi'), 'perang')
self.assertEqual(self.stemmer.stem_word('nasionalis'), 'nasional')

def test_stem_sentences(self):
self.assertEqual(
Expand Down

0 comments on commit 3ac6af6

Please sign in to comment.