Browse files

initial commit

  • Loading branch information...
1 parent b6f3c4c commit 4b8c1401eb5ac4a2affebf4bf80d03470862f1e3 @joeellis committed Mar 9, 2011
Showing with 319 additions and 4 deletions.
  1. +28 −0 Gemfile.lock
  2. +2 −2 Rakefile
  3. +286 −0 lib/romkan.rb
  4. +3 −2 spec/romkan_spec.rb
View
28 Gemfile.lock
@@ -0,0 +1,28 @@
+GEM
+ remote: http://rubygems.org/
+ specs:
+ diff-lcs (1.1.2)
+ git (1.2.5)
+ jeweler (1.5.2)
+ bundler (~> 1.0.0)
+ git (>= 1.2.5)
+ rake
+ rake (0.8.7)
+ rcov (0.9.9)
+ rspec (2.3.0)
+ rspec-core (~> 2.3.0)
+ rspec-expectations (~> 2.3.0)
+ rspec-mocks (~> 2.3.0)
+ rspec-core (2.3.1)
+ rspec-expectations (2.3.0)
+ diff-lcs (~> 1.1.2)
+ rspec-mocks (2.3.0)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ bundler (~> 1.0.0)
+ jeweler (~> 1.5.2)
+ rcov
+ rspec (~> 2.3.0)
View
4 Rakefile
@@ -15,8 +15,8 @@ Jeweler::Tasks.new do |gem|
gem.name = "romkan"
gem.homepage = "http://github.com/joeellis/romkan"
gem.license = "MIT"
- gem.summary = %Q{TODO: one-line summary of your gem}
- gem.description = %Q{TODO: longer description of your gem}
+ gem.summary = %Q{A gem for converting between hiragana, katakana, and romaji}
+ gem.description = %Q{A gem for converting between hiragana, katakana, and romaji}
gem.email = "joe@squarefour.net"
gem.authors = ["Joe Ellis"]
# Include your dependencies below. Runtime dependencies are required when using your gem,
View
286 lib/romkan.rb
@@ -0,0 +1,286 @@
+# coding: utf-8
+
+#
+# kana2rom.rb
+# A Ruby module for converting between hiragana, katakana and romaji.
+#
+# ---------------------------------------------------------------------------------
+# K.Kodama 2002.06
+# This script is distributed freely in the sense of GNU General Public License.
+# http://www.gnu.org/licenses/gpl.html
+#
+# ---------------------------------------------------------------------------------
+# Paul Chapman (paul [a../t] longweekendmobile 2010-04-01)
+# Repaired script to work with modern Ruby versions (1.8.6+), added comments,
+# made it support gaijin friendly transliterations!
+# ---------------------------------------------------------------------------------
+# USAGE
+#
+# include Kana2rom (or extend a class with it), then call:
+#
+# kana2rom(str) かな --> ロ-マ字 変換 / hira/katakana ->> romaji conv
+# rom2kata(str) ロ-マ字 --> 片仮名 変換 / romaji --> katakana conv
+# rom2hira(str) ロ-マ字 --> 平仮名 変換 / romaji --> hiragana conv
+# hira2kata(str) 平仮名 --> 片仮名 変換 / hiragana --> katakana conv
+# kata2hira(str) 片仮名 --> 平仮名 変換 / katakana ->> hiragana conv
+# kana2kana(str) converts hiragana <-> katakana in both directions, returns only the distinct strings
+#
+# ---------------------------------------------------------------------------------
+
+module Kana2rom
+
+ Kana2romH={
+ "ア"=>"a", "イ"=>"i", "ウ"=>"u", "エ"=>"e","オ"=>"o",
+ "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e","お"=>"o",
+ "カ"=>"ka", "キ"=>"ki", "ク"=>"ku", "ケ"=>"ke", "コ"=>"ko",
+ "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
+ "ガ"=>"ga", "ギ"=>"gi", "グ"=>"gu", "ゲ"=>"ge", "ゴ"=>"go",
+ "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
+ "サ"=>"sa", "シ"=>"si", "ス"=>"su", "セ"=>"se", "ソ"=>"so",
+ "さ"=>"sa", "し"=>"shi","す"=>"su", "せ"=>"se", "そ"=>"so",
+ "ザ"=>"za", "ジ"=>"ji", "ズ"=>"zu", "ゼ"=>"ze", "ゾ"=>"zo",
+ "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
+ "タ"=>"ta", "チ"=>"chi","ツ"=>"tsu","テ"=>"te", "ト"=>"to",
+ "た"=>"ta", "ち"=>"chi","つ"=>"tsu","て"=>"te", "と"=>"to",
+ "ダ"=>"da", "ヂ"=>"dji","ヅ"=>"dzu","デ"=>"de", "ド"=>"do",
+ "だ"=>"da", "ぢ"=>"dji","づ"=>"dzu","で"=>"de", "ど"=>"do",
+ "ナ"=>"na", "ニ"=>"ni", "ヌ"=>"nu", "ネ"=>"ne", "ノ"=>"no",
+ "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
+ "ハ"=>"ha", "ヒ"=>"hi", "フ"=>"fu", "ヘ"=>"he", "ホ"=>"ho",
+ "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho",
+ "バ"=>"ba", "ビ"=>"bi", "ブ"=>"bu", "ベ"=>"be", "ボ"=>"bo",
+ "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
+ "パ"=>"pa", "ピ"=>"pi", "プ"=>"pu", "ペ"=>"pe", "ポ"=>"po",
+ "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
+ "マ"=>"ma", "ミ"=>"mi", "ム"=>"mu", "メ"=>"me", "モ"=>"mo",
+ "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
+ "ヤ"=>"ya", "ユ"=>"yu", "ヨ"=>"yo",
+ "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
+ "ラ"=>"ra", "リ"=>"ri", "ル"=>"ru","レ"=>"re","ロ"=>"ro",
+ "ら"=>"ra", "り"=>"ri", "る"=>"ru","れ"=>"re","ろ"=>"ro",
+ "ワ"=>"wa", "ヰ"=>"wi", "ヱ"=>"we", "ヲ"=>"wo", "ン"=>"nn",
+ "わ"=>"wa", "ゐ"=>"wi", "ゑ"=>"we", "を"=>"wo", "ん"=>"nn",
+ "ァ"=>"xa", "ィ"=>"xi", "ゥ"=>"xu", "ェ"=>"xe", "ォ"=>"xo",
+ "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
+ "ッ"=>"xtsu","ャ"=>"xya", "ュ"=>"xyu", "ョ"=>"xyo",
+ "っ"=>"xtsu","ゃ"=>"xya", "ゅ"=>"xyu", "ょ"=>"xyo",
+ "ヴ"=>"vu", "ヵ"=>"xka","ヶ"=>"ga","ヮ"=>"xwa",
+ "ゎ"=>"xwa",
+ ""=>"-", ""=>"-", ""=>'"', ""=>"'", ""=>",", ""=>".",
+ ""=>":", " " => " ", "" => "@", "" => "(", "" => ")",
+ " " => " "
+ }
+
+ # Two-kana sequences and their romaji spellings.
+ # Not referenced by any of the conversion methods in this module.
+ Kana2romH2={
+ "てぃ" => "ti", "でぃ" => "di"
+ }
+ # 1 character romaji patterns
+ Rom2KataH1={
+ "a"=>"ア", "i"=>"イ", "u"=>"ウ", "e"=>"エ", "o"=>"オ", "-"=>"ー"
+ }
+
+ # 2 character romaji patterns
+ Rom2KataH2={
+ "xa"=>"ァ", "xi"=>"ィ", "xu"=>"ゥ", "xe"=>"ェ", "xo"=>"ォ",
+ "ka"=>"カ", "ki"=>"キ", "ku"=>"ク", "ke"=>"ケ", "ko"=>"コ",
+ "ca"=>"カ", "cu"=>"ク", "co"=>"コ",
+ "ga"=>"ガ", "gi"=>"ギ", "gu"=>"グ", "ge"=>"ゲ", "go"=>"ゴ",
+ "sa"=>"サ", "si"=>"シ", "su"=>"ス", "se"=>"セ", "so"=>"ソ",
+ "za"=>"ザ", "zi"=>"ジ", "zu"=>"ズ", "ze"=>"ゼ", "zo"=>"ゾ",
+ "ja"=>"ジャ","ji"=>"ジ", "ju"=>"ジュ","je"=>"ジェ","jo"=>"ジョ",
+ "ta"=>"タ", "ti"=>"チ", "tsu"=>"ツ", "te"=>"テ", "to"=>"ト",
+ "da"=>"ダ", "di"=>"ヂ", "du"=>"ヅ", "de"=>"デ", "do"=>"ド",
+ "na"=>"ナ", "ni"=>"ニ", "nu"=>"ヌ", "ne"=>"ネ", "no"=>"ノ",
+ "ha"=>"ハ", "hi"=>"ヒ", "hu"=>"フ", "he"=>"ヘ", "ho"=>"ホ",
+ "ba"=>"バ", "bi"=>"ビ", "bu"=>"ブ", "be"=>"ベ", "bo"=>"ボ",
+ "pa"=>"パ", "pi"=>"ピ", "pu"=>"プ", "pe"=>"ペ", "po"=>"ポ",
+ "va"=>"ヴァ","vi"=>"ヴィ","vu"=>"ヴ", "ve"=>"ヴェ","vo"=>"ヴォ",
+ "fa"=>"ファ","fi"=>"フィ","fu"=>"フ", "fe"=>"フェ","fo"=>"フォ",
+ "ma"=>"マ", "mi"=>"ミ", "mu"=>"ム", "me"=>"メ", "mo"=>"モ",
+ "ya"=>"ヤ", "yi"=>"イ", "yu"=>"ユ", "ye"=>"イェ", "yo"=>"ヨ",
+ "ra"=>"ラ", "ri"=>"リ", "ru"=>"ル", "re"=>"レ", "ro"=>"ロ",
+ "la"=>"ラ", "li"=>"リ", "lu"=>"ル", "le"=>"レ", "lo"=>"ロ",
+ "wa"=>"ワ", "wi"=>"ヰ", "wu"=>"ウ", "we"=>"ヱ", "wo"=>"ヲ",
+ "nn"=>"ン"
+ }
+
+ # 3 character romaji patterns
+ Rom2KataH3={
+ "tsu"=>"ツ",
+ "xka"=>"ヵ", "xke"=>"ヶ",
+ "xwa"=>"ヮ", "xtsu"=>"ッ", "xya"=>"ャ", "xyu"=>"ュ", "xyo"=>"ョ",
+ "kya"=>"キャ", "kyi"=>"キィ", "kyu"=>"キュ", "kye"=>"キェ", "kyo"=>"キョ",
+ "gya"=>"ギャ", "gyi"=>"ギィ", "gyu"=>"ギュ", "gye"=>"ギェ", "gyo"=>"ギョ",
+ "sya"=>"シャ", "syi"=>"シィ", "syu"=>"シュ", "sye"=>"シェ", "syo"=>"ショ",
+ "sha"=>"シャ", "shi"=>"シ", "shu"=>"シュ", "she"=>"シェ", "sho"=>"ショ",
+ "zya"=>"ジャ", "zyi"=>"ジィ", "zyu"=>"ジュ", "zye"=>"ジェ", "zyo"=>"ジョ",
+ "jya"=>"ジャ", "jyi"=>"ジィ", "jyu"=>"ジュ", "jye"=>"ジェ", "jyo"=>"ジョ",
+ "tya"=>"チャ", "tyi"=>"チィ", "tyu"=>"チュ", "tye"=>"チェ", "tyo"=>"チョ",
+ "cya"=>"チャ", "cyi"=>"チィ", "cyu"=>"チュ", "cye"=>"チェ", "cyo"=>"チョ",
+ "cha"=>"チャ", "chi"=>"チ", "chu"=>"チュ", "che"=>"チェ", "cho"=>"チョ",
+ "tha"=>"テャ", "thi"=>"ティ", "thu"=>"テュ", "the"=>"テェ", "tho"=>"テョ",
+ "dya"=>"ヂャ", "dyi"=>"ヂィ", "dyu"=>"ヂュ", "dye"=>"ヂェ", "dyo"=>"ヂョ",
+ "dha"=>"デャ", "dhi"=>"ディ", "dhu"=>"デュ", "dhe"=>"デェ", "dho"=>"デョ",
+ "nya"=>"ニャ", "nyi"=>"ニィ", "nyu"=>"ニュ", "nye"=>"ニェ", "nyo"=>"ニョ",
+ "hya"=>"ヒャ", "hyi"=>"ヒィ", "hyu"=>"ヒュ", "hye"=>"ヒェ", "hyo"=>"ヒョ",
+ "bya"=>"ビャ", "byi"=>"ビィ", "byu"=>"ビュ", "bye"=>"ビェ", "byo"=>"ビョ",
+ "pya"=>"ピャ", "pyi"=>"ピィ", "pyu"=>"ピュ", "pye"=>"ピェ", "pyo"=>"ピョ",
+ "mya"=>"ミャ", "myi"=>"ミィ", "myu"=>"ミュ", "mye"=>"ミェ", "myo"=>"ミョ",
+ "rya"=>"リャ", "ryi"=>"リィ", "ryu"=>"リュ", "rye"=>"リェ", "ryo"=>"リョ",
+ "lya"=>"リャ", "lyi"=>"リィ", "lyu"=>"リュ", "lye"=>"リェ", "lyo"=>"リョ"
+ }
+
+ Kata2hiraH={
+ "ア"=>"あ", "イ"=>"い", "ウ"=>"う", "エ"=>"え", "オ"=>"お",
+ "カ"=>"か", "キ"=>"き", "ク"=>"く", "ケ"=>"け", "コ"=>"こ",
+ "ガ"=>"が", "ギ"=>"ぎ", "グ"=>"ぐ", "ゲ"=>"げ", "ゴ"=>"ご",
+ "サ"=>"さ", "シ"=>"し", "ス"=>"す", "セ"=>"せ", "ソ"=>"そ",
+ "ザ"=>"ざ", "ジ"=>"じ", "ズ"=>"ず", "ゼ"=>"ぜ", "ゾ"=>"ぞ",
+ "タ"=>"た", "チ"=>"ち", "ツ"=>"つ", "テ"=>"て", "ト"=>"と",
+ "ダ"=>"だ", "ヂ"=>"ぢ", "ヅ"=>"づ", "デ"=>"で", "ド"=>"ど",
+ "ナ"=>"な", "ニ"=>"に", "ヌ"=>"ぬ", "ネ"=>"ね", "ノ"=>"の",
+ "ハ"=>"は", "ヒ"=>"ひ", "フ"=>"ふ", "ヘ"=>"へ", "ホ"=>"ほ",
+ "バ"=>"ば", "ビ"=>"び", "ブ"=>"ぶ", "ベ"=>"べ", "ボ"=>"ぼ",
+ "パ"=>"ぱ", "ピ"=>"ぴ", "プ"=>"ぷ", "ペ"=>"ぺ", "ポ"=>"ぽ",
+ "マ"=>"ま", "ミ"=>"み", "ム"=>"む", "メ"=>"め", "モ"=>"も",
+ "ヤ"=>"や", "ユ"=>"ゆ", "ヨ"=>"よ",
+ "ラ"=>"ら", "リ"=>"り", "ル"=>"る", "レ"=>"れ", "ロ"=>"ろ",
+ "ワ"=>"わ", "ヰ"=>"ゐ", "ヱ"=>"ゑ", "ヲ"=>"を", "ン"=>"ん",
+ "ァ"=>"ぁ", "ィ"=>"ぃ", "ゥ"=>"ぅ", "ェ"=>"ぇ", "ォ"=>"ぉ",
+ "ッ"=>"っ", "ャ"=>"ゃ", "ュ"=>"ゅ", "ョ"=>"ょ",
+ "ヴ"=>"う゛", "ヵ"=>"か", "ヶ"=>"が", "ヮ"=>"ゎ"
+ }
+
+ Hira2kataH={}; Kata2hiraH.each_pair{|k,v| Hira2kataH[v]=k}; Hira2kataH["か"]="カ"; Hira2kataH["が"]="ガ"
+
+ def kana2rom(str)
+ s="";str.each_char{|c|if(Kana2romH.key?(c))then s+=Kana2romH[c];else s+=c;end}
+ s=s.gsub(/(k)([aiueo])(")/,'g\2').gsub(/(s)([aiueo])(")/,'z\2').gsub(/(t)([aiueo])(")/,'d\2')
+ s=s.gsub(/(h)([aiueo])(")/,'b\2').gsub(/([fh])([aiueo])(')/,'p\2').gsub(/u"/,'vu') # [半]濁点゛゜
+ #---------------------------------------------------------
+ s=s.gsub(/\s(xtsu)?\s/,'xtsu') # Remove spaces before/after hanging 'っ'
+ #---------------------------------------------------------
+ sw=s;
+ while nil!=sw.gsub!(/(xtsu)([ckgszjtdhfbpmyrwnv])/,'\2\2') do; s=sw; end # ッカ-->xtsuka-->kka
+ #---------------------------------------------------------
+ # Compound Phoneme Pattern Rollbacks
+ # NB: Uses regex backrefs like "\1y\3" where \1 = 1st capture grp, y='y' and \3 = 3rd capture grp
+ #---------------------------------------------------------
+ s=s.gsub(/( +x)(.*)/,'x\2') # Avoid hanging chisaii moji due to leading spaces
+ s=s.gsub(/(ch)(ixy)([aueo])/,'\1\3') # チョ-->chixyo-->cho
+ s=s.gsub(/([kgszjtdnhfbpmr])(ixy)([auo])/,'\1y\3') # キャ-->kixya-->kya
+ s=s.gsub(/([kgszjtdnhfbpmr])(ix)([ie])/,'\1y\3') # キィ-->kixi-->kyi
+ #---------------------------------------------------------
+ s=s.gsub(/(sh)(y)([aueo])/,'\1\3') # シュ-->shyu-->shu
+ s=s.gsub(/(j)(y)([aueo])/,'\1\3') # ジュ-->jyu-->ju
+ #---------------------------------------------------------
+ s=s.gsub(/([td])(exy)([aueo])/,'\1h\3') # テャ-->texya-->tha
+ s=s.gsub(/([td])(ex)([ie])/,'\1\3') # ティ-->texi-->ti
+ s=s.gsub(/([td])(oxu)/,'\1oo') # ドゥ-->toxu-->too
+ s=s.gsub(/(tsu)(x)([aiueo])/,'ts\3') # ツァ-->tsuxa-->tsa
+ s=s.gsub(/([d])(oxy)/,'\1o\'y') # ドュ-->doxyu-->doyu
+ #---------------------------------------------------------
+ s=s.gsub(/(vux)([aieo])/ ,'v\2') # ヴァヴィヴェヴォ, ヴァ-->vuxa-->va
+ s=s.gsub(/(vuxy)([aueo])/ ,'vy\2') # ヴュ-->vuxyu-->vyu
+ s=s.gsub(/(ixe)/ ,'iye') # イェ-->ixe-->iye
+ s=s.gsub(/(hoxe)/ ,'howe') # ホェ-->hoxe-->howe
+ s=s.gsub(/(fux)([aieo])/ ,'f\2') # ファフィフェフォ, ファ-->fuxa-->fa
+ s=s.gsub(/(fuxy)([aueo])/,'fy\2') # フュ-->fuxyu-->fyu
+ s=s.gsub(/(ux)([ieo])/, 'w\2') # ウァウィウェ, ウァ-->uxa-->wa
+ #---------------------------------------------------------
+ s=s.strip.gsub(/(xtsu)$/,'h!') # Recombine hanging 'っ' followed by EOL
+ s=s.gsub(/([aiueo]?)(\-)/, '\1\1') # Replace boubiki chars and double preceding vowel
+ #---------------------------------------------------------
+ # Cleanup specifically for source strings that contain spaces!
+ s=s.gsub(/( +)([^a-z|A-z])/, '\2') # Remove spaces before any non-alphabetical char
+ s=s.gsub(/(nn)/,'n') # ン-->nn-->n
+ s=s.gsub(/( n)[^a-z|A-Z]?$/,'n') # Fix "n" appearing as separate word
+ s=s.gsub(/\s{2,}/, ' ') # Remove duplicate spaces!
+ #---------------------------------------------------------
+ return s
+ end
+
+ def rom2kata(str)
+ ## THIS LINE DOES NOT WORK IN RECENT RUBY VERSIONS!!! r=""; w=[]; chars=str.split(//e)
+ result=""; word_buffer=[]; chars=str.each_char.collect{|c| c}
+ loop{
+ case word_buffer.size
+ ##### When 0 characters in the buffer
+ when 0 then
+ if chars.size>0 then word_buffer.push(chars.shift) else return result; end
+ ##### Patterns with 1 roman character
+ when 1 then
+ if word_buffer[0]=~/[aiueo-]/ then result+=Rom2KataH1[word_buffer[0]]; word_buffer=[] # a-->ア
+ elsif word_buffer[0]=~/[xkcgszjtdnhbpvfmyrlw]/ then
+ if chars.size>0 then word_buffer.push(chars.shift)
+ else return result+(word_buffer[0].gsub(/n/,""));
+ end
+ else result+=word_buffer.shift;
+ end
+ ##### Patterns with 2 roman characters
+ when 2 then
+ if Rom2KataH2.key?(word_buffer.join) then result+=Rom2KataH2[word_buffer.join]; word_buffer=[];
+ elsif word_buffer.join=~/([kgszjtcdnhbpmrl]y)|([stcd]h)|ts|(x[wytk])/ then # goto 3
+ if chars.size>0 then word_buffer.push(chars.shift) # Consume next letter from source array
+ else return result+(word_buffer.join.gsub(/n/,""));
+ end
+ elsif word_buffer[0]=="n" then result+=""; word_buffer.shift # nk-->ンk
+ elsif word_buffer[0]==word_buffer[1] then result+=""; word_buffer.shift # kk-->ッk
+ else result+=word_buffer.shift;
+ end
+ ##### Patterns with 3 roman characters
+ when 3 then
+ if Rom2KataH3.key?(word_buffer.join) then result+=Rom2KataH3[word_buffer.join]; word_buffer=[];
+ elsif word_buffer[0]=="n" then result+=""; word_buffer.shift;
+ else result+=word_buffer.shift;
+ end
+ end
+ }
+ end
+
+ def rom2hira(str)
+ return kata2hira(rom2kata(str))
+ end
+
+ def kata2hira(str)
+ s=""; str.each_char{|c| s+=( Kata2hiraH.key?(c) ? Kata2hiraH[c] : c )}
+ return s
+ end
+
+ def hira2kata(str)
+ s=""; str.each_char{|c|if(Hira2kataH.key?(c))then s+=Hira2kataH[c];else s+=c; end}
+ return s
+ end
+
+ # Added by Paul 2009-05-12 22:31
+ def kana2kana(str1)
+ result = []
+ str2 = Kana2rom::hira2kata(str1)
+ str3 = Kana2rom::kata2hira(str1)
+ result << str1
+ result << str2 if str2.length > 0 and str1 !=str2
+ result << str3 if str3.length > 0 and str2 !=str3 and str3 != str1
+ return result
+ end
+
+ # module_function :kana2rom, :rom2kata, :kata2hira, :hira2kata, :rom2hira, :kana2kana
+end
+
+# Make the Kana2rom conversions callable on the String class itself,
+# e.g. String.rom2hira("tsukue").
+String.extend(Kana2rom)
+
+=begin
+### Uncomment this section to test at command line
+require 'jcode'
+if $0 == __FILE__ then
+ # sample
+ str="きにょうび/きんようび"
+ printf("ローマ字: %s\n", Kana2rom::kana2rom(str))
+ printf("平仮名 : %s\n", Kana2rom::kata2hira(str))
+ printf("片仮名 : %s\n", Kana2rom::hira2kata(str))
+ str="ro-maji"
+ printf("片仮名 : %s\n", Kana2rom::rom2kata(str))
+ printf("平仮名 : %s\n", Kana2rom::rom2hira(str))
+end
+=end
View
5 spec/romkan_spec.rb
@@ -1,7 +1,8 @@
+#coding: utf-8
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
describe "Romkan" do
- it "fails" do
- fail "hey buddy, you should probably rename this file and start specing for real"
+ it "should convert romaji to hiragana properly" do
+ String.rom2hira("tsukue").should == "つくえ"
end
end

0 comments on commit 4b8c140

Please sign in to comment.