-
Notifications
You must be signed in to change notification settings - Fork 68
/
DataFileUtil.java
141 lines (124 loc) · 4.19 KB
/
DataFileUtil.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
package com.github.houbb.opencc4j.util;
import com.github.houbb.opencc4j.constant.AppConstant;
import com.github.houbb.opencc4j.exception.Opencc4jRuntimeException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
/**
 * 2018/2/11
 *
 * Dictionary data file utility.
 * <ol>
 *   <li>For performance, the dictionary files are loaded into memory exactly once,
 *       when this class is initialized.</li>
 *   <li>No word segmentation is performed here, which keeps lookups cheap.</li>
 * </ol>
 *
 * <p>The dictionaries are only read after class initialization; this class never
 * mutates them afterwards.</p>
 *
 * @author houbinbin
 * @version 1.0
 * @since 1.7
 */
public final class DataFileUtil {

    /** Utility class; not instantiable. */
    private DataFileUtil() {
    }

    /**
     * Simplified =&gt; Traditional single-character map.
     */
    private static final Map<String, String> S2T_CHAR_MAP;

    /**
     * Simplified =&gt; Traditional phrase map.
     */
    private static final Map<String, String> S2T_PHRASE_MAP;

    /**
     * Traditional =&gt; Simplified single-character map.
     */
    private static final Map<String, String> T2S_CHAR_MAP;

    /**
     * Traditional =&gt; Simplified phrase map.
     */
    private static final Map<String, String> T2S_PHRASE_MAP;

    static {
        // The JVM already guarantees that a static initializer runs once, under the
        // class initialization lock, so no explicit synchronization is required here.
        S2T_CHAR_MAP = buildDataMap(AppConstant.SimpleToTraditional.CHAR_PATH);
        S2T_PHRASE_MAP = buildDataMap(AppConstant.SimpleToTraditional.PHRASE_PATH);
        T2S_CHAR_MAP = buildDataMap(AppConstant.TraditionalToSimple.CHAR_PATH);
        T2S_PHRASE_MAP = buildDataMap(AppConstant.TraditionalToSimple.PHRASE_PATH);
    }

    /**
     * Builds a lookup map from a dictionary resource on the classpath.
     *
     * <p>Each non-empty line is split with {@code StringUtil.splitByAnyBlank}; the first
     * token becomes the key and the second token the value. Lines that yield fewer than
     * two tokens are skipped instead of aborting class initialization.</p>
     *
     * @param path classpath location of the dictionary file
     * @return the mapping read from the file
     * @throws Opencc4jRuntimeException if the resource is missing or cannot be read
     */
    private static Map<String, String> buildDataMap(final String path) {
        try {
            // Resolve the charset before opening the stream so a bad charset name cannot leak it.
            final Charset charset = Charset.forName(AppConstant.DEFAULT_CHARSET);
            final InputStream stream = DataFileUtil.class.getResourceAsStream(path);
            if (stream == null) {
                // getResourceAsStream signals a missing resource with null; report it through
                // the same failure path as a read error rather than as a NullPointerException.
                throw new IOException("Dict resource not found on classpath: " + path);
            }

            // try-with-resources closes the reader (and the wrapped stream) on every exit path.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charset))) {
                final Map<String, String> map = new HashMap<>();
                String entry;
                // readLine() returning null is the only reliable end-of-stream signal;
                // ready() merely reports whether a read would block.
                while ((entry = reader.readLine()) != null) {
                    if (StringUtil.isEmpty(entry)) {
                        continue;
                    }
                    final String[] tokens = StringUtil.splitByAnyBlank(entry);
                    if (tokens.length < 2) {
                        // Malformed line (key without a value): nothing usable to store.
                        continue;
                    }
                    map.put(tokens[0], tokens[1]);
                }
                return map;
            }
        } catch (IOException e) {
            throw new Opencc4jRuntimeException("Dict 数据加载失败!", e);
        }
    }

    /**
     * Converts Simplified Chinese text to Traditional Chinese.
     *
     * @param original the text to convert
     * @return the Traditional form; {@code original} itself when it is {@code null} or empty
     */
    public static String getS2TResult(final String original) {
        return getPhraseResult(original, S2T_PHRASE_MAP, S2T_CHAR_MAP);
    }

    /**
     * Converts Traditional Chinese text to Simplified Chinese.
     *
     * @param original the text to convert
     * @return the Simplified form; {@code original} itself when it is {@code null} or empty
     */
    public static String getT2SResult(final String original) {
        return getPhraseResult(original, T2S_PHRASE_MAP, T2S_CHAR_MAP);
    }

    /**
     * Converts a piece of text, preferring a whole-phrase match.
     *
     * <p>If {@code phraseMap} holds a usable entry for the entire input, that entry is
     * returned. Otherwise the input is converted one Unicode code point at a time through
     * {@code charMap}, so characters outside the Basic Multilingual Plane (encoded as a
     * surrogate pair) are looked up as a single character rather than as two halves.</p>
     *
     * @param original  the text to convert
     * @param phraseMap whole-phrase dictionary
     * @param charMap   single-character dictionary
     * @return the converted text; {@code original} itself when it is {@code null} or empty
     */
    private static String getPhraseResult(final String original,
                                          final Map<String, String> phraseMap,
                                          final Map<String, String> charMap) {
        if (original == null || original.isEmpty()) {
            return original;
        }

        final String phrase = phraseMap.get(original);
        if (StringUtil.isNotEmpty(phrase)
                && !AppConstant.EMPTY_RESULT.equals(phrase)) {
            return phrase;
        }

        final int length = original.length();
        final StringBuilder converted = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            // Advance by one or two chars depending on whether this is a surrogate pair.
            final int width = Character.charCount(original.codePointAt(index));
            converted.append(getCharResult(original.substring(index, index + width), charMap));
            index += width;
        }
        return converted.toString();
    }

    /**
     * Converts a single character.
     *
     * @param original the character to convert, as a string
     * @param charMap  single-character dictionary
     * @return the mapped character, or {@code original} when the dictionary has no usable
     *         entry (missing, empty, or equal to {@code AppConstant.EMPTY_RESULT})
     */
    private static String getCharResult(final String original, final Map<String, String> charMap) {
        final String mapped = charMap.get(original);
        if (StringUtil.isNotEmpty(mapped)
                && !AppConstant.EMPTY_RESULT.equals(mapped)) {
            return mapped;
        }
        return original;
    }
}