New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve classification accuracy and coverage by merging profiles #69

Merged
merged 22 commits into from Jun 7, 2017
Commits
Jump to file or symbol
Failed to load files and symbols.
+220 −32
Diff settings

Always

Just for now

Viewing a subset of changes. View all
@@ -214,6 +214,7 @@ public String getProfile() {
}
public List<Language> detectAll(String text) throws LanguageDetectionException {
text = NGram.normalizeVietnamese(text);
if (!isStarted) {
load(settings);
init();
@@ -3,6 +3,8 @@
import java.lang.Character.UnicodeBlock;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class NGram {
@@ -72,9 +74,19 @@ public static char normalize(char ch) {
if (LATIN1_EXCLUDED.indexOf(ch) >= 0) {
ch = ' ';
}
} else if (block == UnicodeBlock.LATIN_EXTENDED_B) {
// Normalization for Romanian
if (ch == '\u0219') {
// Small S with comma below => with cedilla
ch = '\u015f';
} else if (ch == '\u021b') {
// Small T with comma below => with cedilla
ch = '\u0163';
}
} else if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
ch = ' ';
} else if (block == UnicodeBlock.ARABIC) {
// Farsi yeh => Arabic yeh
if (ch == '\u06cc') {
ch = '\u064a';
}
@@ -98,6 +110,37 @@ public static char normalize(char ch) {
return ch;
}
/**
 * Precomposed Vietnamese letters, one row per combining mark in {@link #VI_DIACRITICS}
 * (same order) and one column per base letter in {@link #VI_CHARS} (same order).
 */
private static final String[] VI_NORMALIZED_CHARS = {
    // U+0300 combining grave accent
    "\u00C0\u00C8\u00CC\u00D2\u00D9\u1EF2\u00E0\u00E8\u00EC\u00F2\u00F9\u1EF3\u1EA6\u1EC0\u1ED2\u1EA7\u1EC1\u1ED3\u1EB0\u1EB1\u1EDC\u1EDD\u1EEA\u1EEB",
    // U+0301 combining acute accent
    "\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u1EA4\u1EBE\u1ED0\u1EA5\u1EBF\u1ED1\u1EAE\u1EAF\u1EDA\u1EDB\u1EE8\u1EE9",
    // U+0303 combining tilde
    "\u00C3\u1EBC\u0128\u00D5\u0168\u1EF8\u00E3\u1EBD\u0129\u00F5\u0169\u1EF9\u1EAA\u1EC4\u1ED6\u1EAB\u1EC5\u1ED7\u1EB4\u1EB5\u1EE0\u1EE1\u1EEE\u1EEF",
    // U+0309 combining hook above
    "\u1EA2\u1EBA\u1EC8\u1ECE\u1EE6\u1EF6\u1EA3\u1EBB\u1EC9\u1ECF\u1EE7\u1EF7\u1EA8\u1EC2\u1ED4\u1EA9\u1EC3\u1ED5\u1EB2\u1EB3\u1EDE\u1EDF\u1EEC\u1EED",
    // U+0323 combining dot below
    "\u1EA0\u1EB8\u1ECA\u1ECC\u1EE4\u1EF4\u1EA1\u1EB9\u1ECB\u1ECD\u1EE5\u1EF5\u1EAC\u1EC6\u1ED8\u1EAD\u1EC7\u1ED9\u1EB6\u1EB7\u1EE2\u1EE3\u1EF0\u1EF1"
};
/** Base letters that may carry a mark; a letter's index here is its column in {@link #VI_NORMALIZED_CHARS}. */
private static final String VI_CHARS = "AEIOUYaeiouy\u00c2\u00ca\u00d4\u00e2\u00ea\u00f4\u0102\u0103\u01a0\u01a1\u01af\u01b0";
/** Combining marks handled; a mark's index here is its row in {@link #VI_NORMALIZED_CHARS}. */
private static final String VI_DIACRITICS = "\u0300\u0301\u0303\u0309\u0323";
/** Matches one base letter (group 1) immediately followed by one combining mark (group 2). */
private static final Pattern VI_CHARS_WITH_DIACRITIC_PATTERN = Pattern.compile("([" + VI_CHARS + "])([" + VI_DIACRITICS + "])");
/**
 * Replaces every "Vietnamese base letter + combining mark" pair (mark in U+03xx) with the
 * single precomposed character looked up in {@link #VI_NORMALIZED_CHARS}.
 *
 * @param text the text to normalize
 * @return the text with each matched pair collapsed to one character; when nothing
 *         matches, the {@code text} argument itself is returned
 */
public static String normalizeVietnamese(String text) {
    Matcher pairs = VI_CHARS_WITH_DIACRITIC_PATTERN.matcher(text);
    // Nothing to rewrite: hand back the caller's string untouched.
    if (!pairs.find()) {
        return text;
    }
    StringBuffer normalized = new StringBuffer();
    do {
        int column = VI_CHARS.indexOf(pairs.group(1));
        String row = VI_NORMALIZED_CHARS[VI_DIACRITICS.indexOf(pairs.group(2))];
        // The replacement is always a single letter, so it never contains '$' or a backslash.
        pairs.appendReplacement(normalized, row.substring(column, column + 1));
    } while (pairs.find());
    pairs.appendTail(normalized);
    return normalized.toString();
}
static final String[] CJK_CLASS = {
"\u4F7C\u6934",
"\u88CF\u95B2",
@@ -119,4 +119,148 @@ public final void testNGram() {
assertEquals(ngram.get(3), null);
}
/**
 * Test method for {@link NGram#normalize(char)} with Romanian characters.
 * The cedilla forms must come back unchanged, and the comma-below forms
 * must be mapped onto the corresponding cedilla forms.
 */
@Test
public final void testNormalizeForRomanian() {
    // JUnit's contract is assertEquals(expected, actual); keeping that order
    // makes failure messages label the two values correctly.
    assertEquals('\u015f', NGram.normalize('\u015f'));
    assertEquals('\u0163', NGram.normalize('\u0163'));
    assertEquals('\u015f', NGram.normalize('\u0219'));
    assertEquals('\u0163', NGram.normalize('\u021b'));
}
/**
 * Test method for {@link NGram#normalizeVietnamese(String)}: text without a
 * "letter + combining mark" pair is returned as is, and every supported
 * base letter followed by each supported combining mark is collapsed to the
 * expected precomposed character.
 */
@Test
public final void testNormalizeVietnamese() {
    // JUnit's contract is assertEquals(expected, actual); keeping that order
    // makes failure messages label the two values correctly.

    // Inputs with nothing to normalize
    assertEquals("", NGram.normalizeVietnamese(""));
    assertEquals("ABC", NGram.normalizeVietnamese("ABC"));
    assertEquals("012", NGram.normalizeVietnamese("012"));
    assertEquals("\u00c0", NGram.normalizeVietnamese("\u00c0"));

    // Base letter + U+0300 (combining grave accent)
    assertEquals("\u00C0", NGram.normalizeVietnamese("\u0041\u0300"));
    assertEquals("\u00C8", NGram.normalizeVietnamese("\u0045\u0300"));
    assertEquals("\u00CC", NGram.normalizeVietnamese("\u0049\u0300"));
    assertEquals("\u00D2", NGram.normalizeVietnamese("\u004F\u0300"));
    assertEquals("\u00D9", NGram.normalizeVietnamese("\u0055\u0300"));
    assertEquals("\u1EF2", NGram.normalizeVietnamese("\u0059\u0300"));
    assertEquals("\u00E0", NGram.normalizeVietnamese("\u0061\u0300"));
    assertEquals("\u00E8", NGram.normalizeVietnamese("\u0065\u0300"));
    assertEquals("\u00EC", NGram.normalizeVietnamese("\u0069\u0300"));
    assertEquals("\u00F2", NGram.normalizeVietnamese("\u006F\u0300"));
    assertEquals("\u00F9", NGram.normalizeVietnamese("\u0075\u0300"));
    assertEquals("\u1EF3", NGram.normalizeVietnamese("\u0079\u0300"));
    assertEquals("\u1EA6", NGram.normalizeVietnamese("\u00C2\u0300"));
    assertEquals("\u1EC0", NGram.normalizeVietnamese("\u00CA\u0300"));
    assertEquals("\u1ED2", NGram.normalizeVietnamese("\u00D4\u0300"));
    assertEquals("\u1EA7", NGram.normalizeVietnamese("\u00E2\u0300"));
    assertEquals("\u1EC1", NGram.normalizeVietnamese("\u00EA\u0300"));
    assertEquals("\u1ED3", NGram.normalizeVietnamese("\u00F4\u0300"));
    assertEquals("\u1EB0", NGram.normalizeVietnamese("\u0102\u0300"));
    assertEquals("\u1EB1", NGram.normalizeVietnamese("\u0103\u0300"));
    assertEquals("\u1EDC", NGram.normalizeVietnamese("\u01A0\u0300"));
    assertEquals("\u1EDD", NGram.normalizeVietnamese("\u01A1\u0300"));
    assertEquals("\u1EEA", NGram.normalizeVietnamese("\u01AF\u0300"));
    assertEquals("\u1EEB", NGram.normalizeVietnamese("\u01B0\u0300"));

    // Base letter + U+0301 (combining acute accent)
    assertEquals("\u00C1", NGram.normalizeVietnamese("\u0041\u0301"));
    assertEquals("\u00C9", NGram.normalizeVietnamese("\u0045\u0301"));
    assertEquals("\u00CD", NGram.normalizeVietnamese("\u0049\u0301"));
    assertEquals("\u00D3", NGram.normalizeVietnamese("\u004F\u0301"));
    assertEquals("\u00DA", NGram.normalizeVietnamese("\u0055\u0301"));
    assertEquals("\u00DD", NGram.normalizeVietnamese("\u0059\u0301"));
    assertEquals("\u00E1", NGram.normalizeVietnamese("\u0061\u0301"));
    assertEquals("\u00E9", NGram.normalizeVietnamese("\u0065\u0301"));
    assertEquals("\u00ED", NGram.normalizeVietnamese("\u0069\u0301"));
    assertEquals("\u00F3", NGram.normalizeVietnamese("\u006F\u0301"));
    assertEquals("\u00FA", NGram.normalizeVietnamese("\u0075\u0301"));
    assertEquals("\u00FD", NGram.normalizeVietnamese("\u0079\u0301"));
    assertEquals("\u1EA4", NGram.normalizeVietnamese("\u00C2\u0301"));
    assertEquals("\u1EBE", NGram.normalizeVietnamese("\u00CA\u0301"));
    assertEquals("\u1ED0", NGram.normalizeVietnamese("\u00D4\u0301"));
    assertEquals("\u1EA5", NGram.normalizeVietnamese("\u00E2\u0301"));
    assertEquals("\u1EBF", NGram.normalizeVietnamese("\u00EA\u0301"));
    assertEquals("\u1ED1", NGram.normalizeVietnamese("\u00F4\u0301"));
    assertEquals("\u1EAE", NGram.normalizeVietnamese("\u0102\u0301"));
    assertEquals("\u1EAF", NGram.normalizeVietnamese("\u0103\u0301"));
    assertEquals("\u1EDA", NGram.normalizeVietnamese("\u01A0\u0301"));
    assertEquals("\u1EDB", NGram.normalizeVietnamese("\u01A1\u0301"));
    assertEquals("\u1EE8", NGram.normalizeVietnamese("\u01AF\u0301"));
    assertEquals("\u1EE9", NGram.normalizeVietnamese("\u01B0\u0301"));

    // Base letter + U+0303 (combining tilde)
    assertEquals("\u00C3", NGram.normalizeVietnamese("\u0041\u0303"));
    assertEquals("\u1EBC", NGram.normalizeVietnamese("\u0045\u0303"));
    assertEquals("\u0128", NGram.normalizeVietnamese("\u0049\u0303"));
    assertEquals("\u00D5", NGram.normalizeVietnamese("\u004F\u0303"));
    assertEquals("\u0168", NGram.normalizeVietnamese("\u0055\u0303"));
    assertEquals("\u1EF8", NGram.normalizeVietnamese("\u0059\u0303"));
    assertEquals("\u00E3", NGram.normalizeVietnamese("\u0061\u0303"));
    assertEquals("\u1EBD", NGram.normalizeVietnamese("\u0065\u0303"));
    assertEquals("\u0129", NGram.normalizeVietnamese("\u0069\u0303"));
    assertEquals("\u00F5", NGram.normalizeVietnamese("\u006F\u0303"));
    assertEquals("\u0169", NGram.normalizeVietnamese("\u0075\u0303"));
    assertEquals("\u1EF9", NGram.normalizeVietnamese("\u0079\u0303"));
    assertEquals("\u1EAA", NGram.normalizeVietnamese("\u00C2\u0303"));
    assertEquals("\u1EC4", NGram.normalizeVietnamese("\u00CA\u0303"));
    assertEquals("\u1ED6", NGram.normalizeVietnamese("\u00D4\u0303"));
    assertEquals("\u1EAB", NGram.normalizeVietnamese("\u00E2\u0303"));
    assertEquals("\u1EC5", NGram.normalizeVietnamese("\u00EA\u0303"));
    assertEquals("\u1ED7", NGram.normalizeVietnamese("\u00F4\u0303"));
    assertEquals("\u1EB4", NGram.normalizeVietnamese("\u0102\u0303"));
    assertEquals("\u1EB5", NGram.normalizeVietnamese("\u0103\u0303"));
    assertEquals("\u1EE0", NGram.normalizeVietnamese("\u01A0\u0303"));
    assertEquals("\u1EE1", NGram.normalizeVietnamese("\u01A1\u0303"));
    assertEquals("\u1EEE", NGram.normalizeVietnamese("\u01AF\u0303"));
    assertEquals("\u1EEF", NGram.normalizeVietnamese("\u01B0\u0303"));

    // Base letter + U+0309 (combining hook above)
    assertEquals("\u1EA2", NGram.normalizeVietnamese("\u0041\u0309"));
    assertEquals("\u1EBA", NGram.normalizeVietnamese("\u0045\u0309"));
    assertEquals("\u1EC8", NGram.normalizeVietnamese("\u0049\u0309"));
    assertEquals("\u1ECE", NGram.normalizeVietnamese("\u004F\u0309"));
    assertEquals("\u1EE6", NGram.normalizeVietnamese("\u0055\u0309"));
    assertEquals("\u1EF6", NGram.normalizeVietnamese("\u0059\u0309"));
    assertEquals("\u1EA3", NGram.normalizeVietnamese("\u0061\u0309"));
    assertEquals("\u1EBB", NGram.normalizeVietnamese("\u0065\u0309"));
    assertEquals("\u1EC9", NGram.normalizeVietnamese("\u0069\u0309"));
    assertEquals("\u1ECF", NGram.normalizeVietnamese("\u006F\u0309"));
    assertEquals("\u1EE7", NGram.normalizeVietnamese("\u0075\u0309"));
    assertEquals("\u1EF7", NGram.normalizeVietnamese("\u0079\u0309"));
    assertEquals("\u1EA8", NGram.normalizeVietnamese("\u00C2\u0309"));
    assertEquals("\u1EC2", NGram.normalizeVietnamese("\u00CA\u0309"));
    assertEquals("\u1ED4", NGram.normalizeVietnamese("\u00D4\u0309"));
    assertEquals("\u1EA9", NGram.normalizeVietnamese("\u00E2\u0309"));
    assertEquals("\u1EC3", NGram.normalizeVietnamese("\u00EA\u0309"));
    assertEquals("\u1ED5", NGram.normalizeVietnamese("\u00F4\u0309"));
    assertEquals("\u1EB2", NGram.normalizeVietnamese("\u0102\u0309"));
    assertEquals("\u1EB3", NGram.normalizeVietnamese("\u0103\u0309"));
    assertEquals("\u1EDE", NGram.normalizeVietnamese("\u01A0\u0309"));
    assertEquals("\u1EDF", NGram.normalizeVietnamese("\u01A1\u0309"));
    assertEquals("\u1EEC", NGram.normalizeVietnamese("\u01AF\u0309"));
    assertEquals("\u1EED", NGram.normalizeVietnamese("\u01B0\u0309"));

    // Base letter + U+0323 (combining dot below)
    assertEquals("\u1EA0", NGram.normalizeVietnamese("\u0041\u0323"));
    assertEquals("\u1EB8", NGram.normalizeVietnamese("\u0045\u0323"));
    assertEquals("\u1ECA", NGram.normalizeVietnamese("\u0049\u0323"));
    assertEquals("\u1ECC", NGram.normalizeVietnamese("\u004F\u0323"));
    assertEquals("\u1EE4", NGram.normalizeVietnamese("\u0055\u0323"));
    assertEquals("\u1EF4", NGram.normalizeVietnamese("\u0059\u0323"));
    assertEquals("\u1EA1", NGram.normalizeVietnamese("\u0061\u0323"));
    assertEquals("\u1EB9", NGram.normalizeVietnamese("\u0065\u0323"));
    assertEquals("\u1ECB", NGram.normalizeVietnamese("\u0069\u0323"));
    assertEquals("\u1ECD", NGram.normalizeVietnamese("\u006F\u0323"));
    assertEquals("\u1EE5", NGram.normalizeVietnamese("\u0075\u0323"));
    assertEquals("\u1EF5", NGram.normalizeVietnamese("\u0079\u0323"));
    assertEquals("\u1EAC", NGram.normalizeVietnamese("\u00C2\u0323"));
    assertEquals("\u1EC6", NGram.normalizeVietnamese("\u00CA\u0323"));
    assertEquals("\u1ED8", NGram.normalizeVietnamese("\u00D4\u0323"));
    assertEquals("\u1EAD", NGram.normalizeVietnamese("\u00E2\u0323"));
    assertEquals("\u1EC7", NGram.normalizeVietnamese("\u00EA\u0323"));
    assertEquals("\u1ED9", NGram.normalizeVietnamese("\u00F4\u0323"));
    assertEquals("\u1EB6", NGram.normalizeVietnamese("\u0102\u0323"));
    assertEquals("\u1EB7", NGram.normalizeVietnamese("\u0103\u0323"));
    assertEquals("\u1EE2", NGram.normalizeVietnamese("\u01A0\u0323"));
    assertEquals("\u1EE3", NGram.normalizeVietnamese("\u01A1\u0323"));
    assertEquals("\u1EF0", NGram.normalizeVietnamese("\u01AF\u0323"));
    assertEquals("\u1EF1", NGram.normalizeVietnamese("\u01B0\u0323"));
}
}
Oops, something went wrong.
ProTip! Use n and p to navigate between commits in a pull request.