Enable partial encl splitting for -ue

latin-language-toolkit · Sep 8, 2014 · 744ed76 · 744ed76
1 parent 5b6d90e
commit 744ed76
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 9 deletions.
diff --git a/lib/llt/tokenizer.rb b/lib/llt/tokenizer.rb
@@ -153,6 +153,7 @@ def find_abbreviations_and_join_strings
     WORDS_ENDING_WITH_QUE = /^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i # neque taken out!
     WORDS_ENDING_WITH_NE  = /^(omne|sine|bene|paene|iuvene|siccine)$/i # generalize these words and start to look for them in the db, especially for adverbs
     WORDS_ENDING_WITH_VE  = /^()$/i # formerly had neve and sive, which we split now
+    WORDS_ENDING_WITH_UE  = /^()$/i # not yet populated
 
     # laetusque  to -que laetus
     # in eoque   to -que in eo
@@ -166,7 +167,7 @@ def find_abbreviations_and_join_strings
     # special because it has ve and ne - both would get split. Such words
     # might be so rare that we postpone proper handling for now
 
-    ENCLITICS = %w{ que ne ve c }
+    ENCLITICS = %w{ que ne ve ue c }
     def split_enklitika_and_change_their_position
       split_with_force
       split_frequent_enclitics # like latin c, ve or greek te, de
@@ -185,7 +186,8 @@ def split_with_force
 
     def split_enklitikon(encl, restrictors)
       # needs a word character in front - ne itself should be contained
-      regexp = /(?<=\w)#{encl}$/
+      # q in front is not allowed, otherwise we would match every que
+      regexp = /(?<=[^Qq])#{encl}$/
 
       indices = []
       @worker.each_with_index do |token, i|
@@ -286,18 +288,24 @@ def ne_corrections
     end
 
     def ve_corrections
+      # contains ue correction
+      # ATTENTION: Since we don't have a stem normalizer, ue-splitting
+      # doesn't work correctly in every case, e.g. uentus-ue won't be split,
+      # because the stem 'uent' cannot be found in our db!
       corrections = []
       @worker.each_with_index do |w, i|
-        if w == enclitic('ve')
+        if w.match(/^-([vu])e$/)
+          v_or_u = $1
+
           orig_el = original_word(i)
 
           entries = []
-          entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
-          entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
-          entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 33, 5])
-          entries += lookup(orig_el + 'v',  :persona,   :stem, 3)
-          entries += lookup(orig_el + 've', :verb,      :pr,   2)
-          entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists
+          entries += lookup(orig_el + v_or_u,       :adjective, :stem, 1)
+          entries += lookup(orig_el + v_or_u,       :adjective, :stem, 3)
+          entries += lookup(orig_el + v_or_u,       :noun,      :stem, [2, 33, 5])
+          entries += lookup(orig_el + v_or_u,       :persona,   :stem, 3)
+          entries += lookup(orig_el + v_or_u + 'e', :verb,      :pr,   2)
+          entries += lookup(orig_el + v_or_u,       :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists
 
           if entries.any?
             corrections << i - corrections.size

diff --git a/spec/lib/llt/tokenizer_spec.rb b/spec/lib/llt/tokenizer_spec.rb
@@ -261,6 +261,26 @@ def enklitika_test(example)
         end
       end
 
+      context "when confronted with -ue as alt. spelling for -ve" do
+        examples = {
+          'siue' => '-ue si',
+          'neue' => '-ue ne',
+          'quisque'  => 'quisque',
+          'praecipue' => 'praecipue'
+        }
+
+        examples.each do |example, expected|
+          it "transforms #{example} to #{expected}" do
+            enklitika_test(example).should be_transformed_to expected
+          end
+        end
+
+        it "transforms uentusue to -ue uentus" do
+          pending('missing alternative spelling handler')
+          enklitika_test('uentusue').should be_transformed_to 'ue -uentus'
+        end
+      end
+
       context "when confronted with -u" do
         examples = {
           'seu' => '-u se',