Fix/issue 1 #2

Merged 5 commits on Aug 29, 2022
8 changes: 0 additions & 8 deletions bahasa/data/kamus.txt
@@ -2634,7 +2634,6 @@ beku
 bekuk
 bekuku
 bekukung
-bel
 bela
 belabas
 belacak
@@ -8404,7 +8403,6 @@ geplak
 gepok
 geprak
 gepuk
-gera
 gerabah
 gerabak
 gerabang
@@ -11770,7 +11768,6 @@ kamsia
 kamu
 kamuflase
 kamus
-kan
 kana
 kanaah
 kanaat
@@ -18615,10 +18612,6 @@ nasib
 nasihat
 nasion
 nasional
-nasionalis
-nasionalisasi
-nasionalisme
-nasionalistis
 nasionisme
 naskah
 nasofaring
@@ -22493,7 +22486,6 @@ randi
 randu
 randuk
 randung
-rang
 rangah
 rangai
 rangak
5 changes: 3 additions & 2 deletions bahasa/stemmer/context.py
@@ -51,7 +51,8 @@ def precedence(self, word):
             match_affix(word, 'me', 'i'),
             match_affix(word, 'di', 'i'),
             match_affix(word, 'pe', 'i'),
-            match_affix(word, 'ter', 'i')
+            match_affix(word, 'ter', 'i'),
+            match_affix(word, suffix='is'),
         ])
 
     def start_stemming_process(self):
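
Note on the new precedence entry: calling match_affix with only suffix='is' lets bare '-is' words (e.g. 'nasionalis', 'menangis') count toward the suffix-first precedence check. The snippet below is only a hypothetical stand-in for what such a check amounts to; the real match_affix helper in bahasa/stemmer may be implemented differently, and its signature is inferred from the calls shown in this diff.

# Hypothetical sketch of an affix check; signature inferred from the
# positional and keyword calls visible in this diff.
def match_affix(word, prefix=None, suffix=None):
    if prefix is not None and not word.startswith(prefix):
        return False
    if suffix is not None and not word.endswith(suffix):
        return False
    return True

match_affix('menangis', suffix='is')     # True: the new suffix-only check fires
match_affix('nasionalis', suffix='is')   # True
match_affix('terbeli', 'ter', 'i')       # True: prefix and suffix both present
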
@@ -101,7 +102,7 @@ def remove_prefixes(self):
             return
 
         # Split compound words
-        matches = re.match(r'^(ter|di|me|pe|ber)(.*)(i|kan|an|lah)$', self.original_word)
+        matches = re.match(r'^(ter|di|me|pe|ke|ber)(.*)(i|kan|an|lah)$', self.original_word)
         if matches:
             counter = 1
             for _ in self.current_word:
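
Adding 'ke' to the prefix alternation lets the compound-word splitter handle ke-...-an confixes such as 'ketidakpercayaan'. A quick standalone check of the new pattern, using only the standard re module outside the stemmer:

import re

pattern = r'^(ter|di|me|pe|ke|ber)(.*)(i|kan|an|lah)$'

# With 'ke' in the alternation the word now matches and splits into three parts.
print(re.match(pattern, 'ketidakpercayaan').groups())  # ('ke', 'tidakpercaya', 'an')

# The old pattern, without 'ke', returned None for the same word.
print(re.match(r'^(ter|di|me|pe|ber)(.*)(i|kan|an|lah)$', 'ketidakpercayaan'))  # None
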
6 changes: 4 additions & 2 deletions bahasa/stemmer/utils.py
@@ -16,8 +16,10 @@ def load_dictionary(dictionary='default'):
         dictionary = join(base_dir, 'data', 'kamus.txt')
 
     with open(dictionary) as dictionary_file:
-        for data in dictionary_file:
-            word_sets.add(data.strip())
+        words = dictionary_file.read().splitlines()
+        words.sort(key=len, reverse=True)
+        word_sets = set(words)
 
     return word_sets
 
+
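
For context, a hedged sketch of how the rewritten loader is consumed (assuming the package is importable as bahasa; the example words are taken from the kamus.txt hunks in this diff):

from bahasa.stemmer.utils import load_dictionary

words = load_dictionary()      # defaults to the bundled data/kamus.txt
print('beku' in words)         # True: membership test against the returned set
print('nasionalis' in words)   # False once this PR removes it from kamus.txt

Since the function returns a set, the length-descending sort is not observable to callers doing membership tests; it would only matter if the sorted list itself were consumed.
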
6 changes: 3 additions & 3 deletions bahasa/stemmer/visitor/suffixes.py
@@ -12,7 +12,7 @@ class RemoveInflectionalParticle(object):
     def visit(self, context):
         result = self.remove(context.current_word)
         if result != context.current_word:
-            removed_part = remove_suffix(context.current_word, result)
+            removed_part = context.current_word[len(result):]
             removal = Removal(self, context.current_word, result,
                               removed_part, 'P')
             context.add_removal(removal)
@@ -34,7 +34,7 @@ class RemoveDerivationalSuffix(object):
     def visit(self, context):
         result = self.remove(context.current_word)
         if result != context.current_word:
-            removed_part = remove_suffix(context.current_word, result)
+            removed_part = context.current_word[len(result):]
             removal = Removal(self, context.current_word, result,
                               removed_part, 'DS')
             context.add_removal(removal)
@@ -58,7 +58,7 @@ class RemoveInflectionalPossessivePronoun(object):
     def visit(self, context):
         result = self.remove(context.current_word)
         if result != context.current_word:
-            removed_part = remove_suffix(context.current_word, result)
+            removed_part = context.current_word[len(result):]
             removal = Removal(self, context.current_word, result,
                               removed_part, 'PP')
             context.add_removal(removal)
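
All three suffix visitors now derive the removed part by slicing the current word past the stem instead of calling remove_suffix. A small illustration of the slice with placeholder values (the words below are illustrative, not traced through the stemmer):

current_word = 'dibelinya'   # word before this visitor runs (illustrative)
result = 'dibeli'            # word after the '-nya' pronoun is stripped (illustrative)

removed_part = current_word[len(result):]   # everything after the stem
print(removed_part)                         # 'nya'
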
8 changes: 8 additions & 0 deletions tests/stemmer/test_stemmer.py
@@ -14,6 +14,14 @@ def test_stem_word(self):
         self.assertEqual(self.stemmer.stem_word('penghancurleburan'), 'hancur lebur')
         self.assertEqual(self.stemmer.stem_word('dilipatgandakan'), 'lipat ganda')
         self.assertEqual(self.stemmer.stem_word('pertanggungjawaban'), 'tanggung jawab')
+        self.assertEqual(self.stemmer.stem_word('ketidakpercayaan'), 'tidak percaya')
+        self.assertEqual(self.stemmer.stem_word('dikuranginya'), 'kurang')
+        self.assertEqual(self.stemmer.stem_word('menyinari'), 'sinar')
+        self.assertEqual(self.stemmer.stem_word('dibelinya'), 'beli')
+        self.assertEqual(self.stemmer.stem_word('gerakan'), 'gerak')
+        self.assertEqual(self.stemmer.stem_word('menangis'), 'tangis')
+        self.assertEqual(self.stemmer.stem_word('perangi'), 'perang')
+        self.assertEqual(self.stemmer.stem_word('nasionalis'), 'nasional')
 
     def test_stem_sentences(self):
         self.assertEqual(