New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve classification accuracy and coverage by merging profiles #69

Merged
merged 22 commits into from Jun 7, 2017
Commits
Jump to file or symbol
Failed to load files and symbols.
+58 −16
Diff settings

Always

Just for now

Viewing a subset of changes. View all

Add tests for the short text language profile

  • Loading branch information...
yanirs committed Nov 28, 2016
commit 20e572db8d632f7daddc425f2a73aa837038a7b9
@@ -3,6 +3,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.settings.Settings;
import org.junit.Assert;
import org.junit.Test;
import org.xbib.elasticsearch.common.langdetect.LangdetectService;
@@ -51,9 +52,9 @@ public void testKorean() throws IOException {
/**
* Test classification accuracies on translations of the Universal Declaration of Human Rights (UDHR).
*
* The translations were obtained from http://unicode.org/udhr/. Some minimal processing was done to create the
* udhr.tsv resource file: matched the dataset's language code with the one returned by the library, and removed
* each file's English intro and redundant whitespace.
* The translations were obtained from <a href="http://unicode.org/udhr/">the UDHR Unicode dumps</a>. Some minimal
* processing was done to create the udhr.tsv resource file: matched the dataset's language code with the one
* returned by the library, and removed each file's English intro and redundant whitespace.
*/
@Test
public void testUdhrAccuracies() throws IOException {
@@ -67,29 +68,62 @@ public void testUdhrAccuracies() throws IOException {
{ 100, 100, 0.94, 0.99 },
{ 300, 100, 1.00, 1.00 },
{ 0, 1, 1.00, 1.00 }
}
},
false
);
}
/**
 * Runs the UDHR substring accuracy trials with the short text language profile enabled.
 *
 * Each trial row holds, in order: the substring length, the sample size, the per-language accuracy threshold,
 * and the mean accuracy threshold.
 */
@Test
public void testUdhrAccuraciesShortProfile() throws IOException {
    final boolean useShortProfile = true;
    final double[][] trials = {
        { 5, 100, 0.16, 0.64 },
        { 10, 100, 0.50, 0.82 },
        { 20, 100, 0.68, 0.93 },
        { 50, 100, 0.86, 0.98 },
        { 100, 100, 0.94, 0.99 },
        { 300, 100, 0.99, 0.99 },
        { 0, 1, 1.00, 1.00 }
    };
    testSubstringAccuracies("udhr.tsv", trials, useShortProfile);
}
/**
* Test classification accuracies on WordPress interface translations.
*
* The translations were obtained from https://translate.wordpress.org/projects/wp/4.6.x. Some minimal processing
* was done to create the wp-translations.tsv resource file: matched the dataset's language code with the one
* returned by the library, unescaped HTML entities, and dropped variable placeholders, HTML tags, and redundant
* whitespace. To speed up testing, the resource file contains only the 50 longest translated phrases for each
* language, excluding URL translations and word lists.
* The translations are for <a href="https://translate.wordpress.org/projects/wp/4.6.x">WordPress 4.6.x</a>. Some
* minimal processing was done to create the wp-translations.tsv resource file: matched the dataset's language code
* with the one returned by the library, unescaped HTML entities, and dropped variable placeholders, HTML tags, and
* redundant whitespace. To speed up testing, the resource file contains only the 50 longest translated phrases for
* each language, excluding URL translations and word lists.
*/
@Test
public void testWordPressTranslationsAccuracies() throws IOException {
testSubstringAccuracies(
"wp-translations.tsv",
// Each trial row: substring length, sample size, per-language accuracy threshold, mean accuracy threshold.
new double[][] {
{ 5, 10, 0.25, 0.60 },
{ 10, 10, 0.44, 0.76 },
{ 20, 10, 0.65, 0.88 },
{ 0, 1, 0.80, 0.98 }
}
{ 5, 10, 0.25, 0.60 },
{ 10, 10, 0.44, 0.76 },
{ 20, 10, 0.65, 0.88 },
{ 0, 1, 0.80, 0.98 }
},
// useShortProfile = false: run these trials against the default profile, not the short text one.
false
);
}
/**
 * Runs the WordPress translation substring accuracy trials with the short text language profile enabled.
 *
 * Each trial row holds, in order: the substring length, the sample size, the per-language accuracy threshold,
 * and the mean accuracy threshold.
 */
@Test
public void testWordPressTranslationsAccuraciesShortProfile() throws IOException {
    final boolean useShortProfile = true;
    final double[][] trials = {
        { 5, 10, 0.23, 0.61 },
        { 10, 10, 0.47, 0.77 },
        { 20, 10, 0.69, 0.90 },
        { 0, 1, 0.94, 0.99 }
    };
    testSubstringAccuracies("wp-translations.tsv", trials, useShortProfile);
}
@@ -106,9 +140,17 @@ public void testWordPressTranslationsAccuracies() throws IOException {
* substring length and sample size, which are passed to
* {@link #generateSubstringSample(String, int, int)}, and a per-language accuracy threshold
* and mean accuracy threshold, which are used to determine whether the trial passes or fails
* @param useShortProfile if true, the short text language profile will be used instead of the default profile
*/
private void testSubstringAccuracies(String datasetPath, double[][] allTrialParams) throws IOException {
LangdetectService service = new LangdetectService();
private void testSubstringAccuracies(String datasetPath,
double[][] allTrialParams,
boolean useShortProfile) throws IOException {
LangdetectService service = new LangdetectService(
Settings.builder()
.putArray("languages", LangdetectService.DEFAULT_LANGUAGES)
.put("profile", useShortProfile ? "short-text" : "")
.build()
);
Map<String, List<String>> languageToFullTexts = readMultiLanguageDataset(datasetPath);
// Sort the languages to make the log output prettier.
List<String> languages = new ArrayList<>(languageToFullTexts.keySet());
ProTip! Use n and p to navigate between commits in a pull request.