/
simple.rb
72 lines (60 loc) · 1.82 KB
/
simple.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# -*- coding: utf-8 -*-
module PinYin
module Backend
class Simple
# Creates a backend instance.
#
# override_files - list of additional data file paths; they are loaded after
#                  the bundled table when the code table is first built.
#                  nil (or false) is treated as "no overrides".
def initialize(override_files=[])
  @override_files = override_files
  @override_files ||= []
end
# Romanizes +str+ one code point at a time.
#
# str                  - text to convert; nil or empty yields [].
# tone                 - forwarded unchanged to #format to pick the reading style.
# include_punctuations - when true, characters that have no reading and are not
#                        ASCII alphanumerics/whitespace are kept (translated
#                        through Punctuation when it has an entry for them);
#                        when false they are dropped.
#
# Returns a flat array of tokens: one per character that has a reading, and one
# per whitespace-separated run of ASCII letters/digits.
def romanize(str, tone=nil, include_punctuations=false)
  res = []
  return res unless str && !str.empty?

  str.unpack('U*').each do |t|
    # The reading table is keyed by upper-case hex code points. Kernel#sprintf
    # is used on purpose: #format is redefined as a private helper in this class.
    code = sprintf('%X', t)
    readings = codes[code]
    if readings
      res << Value.new(format(readings, tone), false)
      next
    end

    val = [t].pack('U*')
    # Keep plain ASCII letters, digits and whitespace as they are; other special
    # characters (full-width symbols etc.) are handled by the punctuation branch.
    if val =~ /^[0-9a-zA-Z\s]*$/
      if res.last && res.last.english?
        # Extend the current ASCII run; whitespace inside it is what the final
        # split uses to separate words.
        res.last << Value.new(val, true)
      elsif val !~ /\s/
        # Never start a run with whitespace (space, tab, newline, ...): a
        # leading separator would make split(/\s+/) emit an empty token.
        res << Value.new(val, true)
      end
    elsif include_punctuations
      # Punctuation values are hex strings; pack('H*') turns them into bytes.
      val = [Punctuation[code]].pack('H*') if Punctuation.include?(code)
      # Attach to the previous token when there is one, otherwise start the list.
      (res.last || res) << Value.new(val, false)
    end
  end

  res.flat_map { |phrase| phrase.split(/\s+/) }
end
private
# Lazily builds and memoizes the code-point => readings table.
#
# The bundled Mandarin.dat is loaded first, then every override file in order,
# so entries from later files replace earlier ones.
#
# Returns the Hash stored in @codes.
# Re-raises any error from loading; in that case the cache is cleared so a
# later call retries instead of silently using a half-built table.
def codes
  return @codes if @codes

  src = File.expand_path('../../data/Mandarin.dat', __FILE__)
  @codes = {}
  # Build a fresh list rather than unshift-ing onto @override_files, which is
  # the caller's own array and must not be modified.
  [src, *@override_files].each do |file|
    load_codes_from(file)
  end
  @codes
rescue StandardError
  @codes = nil
  raise
end
# Merges one data file into @codes.
#
# Each line is expected to hold a code and a comma-separated list of readings,
# separated by whitespace. Entries already present in @codes are replaced.
# Blank lines and lines without a readings column are skipped.
#
# file - path of the data file to read.
def load_codes_from(file)
  File.foreach(file) do |line|
    code, readings = line.split(' ')
    # A blank or incomplete line leaves readings nil; skip it rather than
    # failing with NoMethodError on nil.
    next unless code && readings

    @codes[code] = readings.split(',')
  end
end
# Renders the first entry of +readings+ in the style selected by +tone+.
#
# readings - array of readings for one character; only the first is used.
# tone     - :unicode returns the reading exactly as stored;
#            :ascii or true passes it through PinYin::Util.to_ascii;
#            anything else calls PinYin::Util.to_ascii with false as the
#            second argument (presumably turning tone output off -- confirm
#            in PinYin::Util).
#
# Note: this private method shadows Kernel#format inside the class.
def format(readings, tone)
  primary = readings[0]
  if tone == :unicode
    primary
  elsif [:ascii, true].include?(tone)
    PinYin::Util.to_ascii(primary)
  else
    PinYin::Util.to_ascii(primary, false)
  end
end
end
end
end