From b3ee5224157cf6eaa38a7657e0fa2444a3c598f9 Mon Sep 17 00:00:00 2001 From: LFDM <1986gh@gmail.com> Date: Mon, 18 Aug 2014 11:44:20 +0200 Subject: [PATCH] Refactor and split nisi, neu, seu, mede, oude etc --- lib/llt/tokenizer.rb | 32 ++++++++++++++-------------- spec/lib/llt/tokenizer_spec.rb | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/lib/llt/tokenizer.rb b/lib/llt/tokenizer.rb index fc2ba20..7e44e45 100644 --- a/lib/llt/tokenizer.rb +++ b/lib/llt/tokenizer.rb @@ -169,7 +169,7 @@ def find_abbreviations_and_join_strings ENCLITICS = %w{ que ne ve c } def split_enklitika_and_change_their_position split_with_force - split_nec_and_oute + split_frequent_enclitics # like latin c, ve or greek te, de make_frequent_corrections end @@ -202,22 +202,24 @@ def enclitic(val) "#{@enclitics_marker}#{val}" end - def split_nec_and_oute - nec_indices = [] - oute_indices = [] - @worker.each_with_index do |token, i| - case token - when /^nec$/i - token.slice!(-1) - nec_indices << (i + nec_indices.size + @shift_range) - when /^(οὐ|μή|εἰ)τε$/i - token.slice!(-2, 2) - oute_indices << (i + oute_indices.size + @shift_range) + ENCLITICS_MAP = { + /^(nec)$/i => 'c', + /^(ne|se)u$/i => 'u', + /^(nisi)$/i => 'si', + /^(οὐ|μή|εἰ)τε$/i => 'τε', + /^(οὐ|μή)δε$/i => 'δε', + } + def split_frequent_enclitics + ENCLITICS_MAP.each do |regex, encl| + container = [] + @worker.each_with_index do |token, i| + if token.match(regex) + token.slice!(-encl.length, encl.length) + container << (i + container.size + @shift_range) + end end + container.each { |i| @worker.insert(i, enclitic(encl)) } end - - nec_indices.each { |i| @worker.insert(i, enclitic('c')) } - oute_indices.each { |i| @worker.insert(i, enclitic('τε')) } end def make_frequent_corrections diff --git a/spec/lib/llt/tokenizer_spec.rb b/spec/lib/llt/tokenizer_spec.rb index c6850e1..69af489 100644 --- a/spec/lib/llt/tokenizer_spec.rb +++ b/spec/lib/llt/tokenizer_spec.rb @@ -261,6 +261,31 @@ def enklitika_test(example) end end + context "when confronted with -u" do + examples = { + 'seu' => '-u se', + 'neu' => '-u ne' + } + + examples.each do |example, expected| + it "transforms #{example} to #{expected}" do + enklitika_test(example).should be_transformed_to expected + end + end + end + + context "when confronted with -si" do + examples = { + 'nisi' => '-si ni' + } + + examples.each do |example, expected| + it "transforms #{example} to #{expected}" do + enklitika_test(example).should be_transformed_to expected + end + end + end + context "when confronted with -τε" do examples = { 'οὐτε' => '-τε οὐ', @@ -274,6 +299,19 @@ def enklitika_test(example) end end end + + context "when confronted with -δε" do + examples = { + 'οὐδε' => '-δε οὐ', + 'μήδε' => '-δε μή' + } + + examples.each do |example, expected| + it "transforms #{example} to #{expected}" do + enklitika_test(example).should be_transformed_to expected + end + end + end end describe "#merge_what_needs_merging" do