Improve classification accuracy and coverage by merging profiles #69

merged 22 commits into from Jun 7, 2017
+322 −273
Make classification accuracy tests more granular

yanirs committed Dec 2, 2016
commit 6cd972e24d1421b4328115e93a63177129dfbeb4
@@ -93,6 +93,7 @@ tasks.withType(JavaCompile) {
test {
systemProperties['path.home'] = System.getProperty("user.dir")
systemProperties['path.accuracies.out'] = System.getProperty("path.accuracies.out")
testLogging {
showStandardStreams = false
exceptionFormat = 'full'
@@ -0,0 +1,275 @@
package org.xbib.elasticsearch.index.mapper.langdetect;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.settings.Settings;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.xbib.elasticsearch.common.langdetect.LangdetectService;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
public class DetectLanguageAccuracyTest extends Assert {
// Logger for this test class.
private static final Logger logger = LogManager.getLogger();
// Tolerance passed to assertEquals when comparing a measured accuracy to its expected value (see test()).
private static final double ACCURACY_DELTA = 1e-6;
// Comma-separated language codes. test() and data() split this string to fix the order of the per-language
// columns in the accuracy CSV.
// NOTE(review): the literal continues after the trailing '+', but the rest is not visible in this view.
private static final String ALL_LANGUAGES =
"af,ar,bg,bn,ca,cs,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,id,it,ja,kn,ko,lt,lv,mk,ml,mr,ne,nl,no,pa,pl,pt," +
// Default value of the "languages" setting in test(), joined from LangdetectService.DEFAULT_LANGUAGES.
private static final String DEFAULT_LANGUAGES = Joiner.on(",").join(LangdetectService.DEFAULT_LANGUAGES);
// Presumably the language codes available in the default profile -- the literal is cut off here; TODO confirm.
private static final String ALL_DEFAULT_PROFILE_LANGUAGES =
"af,ar,bg,bn,cs,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,id,it,ja,kn,ko,lt,lv,mk,ml,mr,ne,nl,no,pa,pl,pt,ro," +
// Presumably the language codes available in the "short-text" profile -- the literal is cut off here; TODO confirm.
private static final String ALL_SHORT_PROFILE_LANGUAGES =
"ar,bg,bn,ca,cs,da,de,el,en,es,et,fa,fi,fr,gu,he,hi,hr,hu,id,it,ja,ko,lt,lv,mk,ml,nl,no,pa,pl,pt,ro,ru,si,sq," +
// Dataset name -> (language code -> texts of that language); filled in setUp() and read in test().
private static Map<String, Map<String, List<String>>> multiLanguageDatasets;
// Destination of the CSV results. Only assigned in setUp() when the path.accuracies.out system property is
// non-empty; test() treats null as "assert against the expected accuracies".
private static Path outputPath;
// Parameters of a single test case, assigned once in the constructor.
private final String datasetName;
private final int substringLength;
private final int sampleSize;
private final boolean useShortProfile;
private final boolean useAllLanguages;
private final Map<String, Double> languageToExpectedAccuracy;
/**
 * Construct a test for classification accuracies on substrings of texts from a single dataset.
 *
 * For each text and substring length, this test generates a sample of substrings (drawn uniformly with
 * replacement from the set of possible substrings of the given length), runs the language identification code,
 * measures the per-language accuracy (fraction of substrings classified correctly), and fails if the accuracy
 * differs by more than {@link #ACCURACY_DELTA} from the expected accuracy for the language.
 *
 * @param datasetName multi-language dataset name, as read in the setup step (see {@link #setUp()})
 * @param substringLength substring length to test (see {@link #generateSubstringSample(String, int, int)})
 * @param sampleSize number of substrings to test (see {@link #generateSubstringSample(String, int, int)})
 * @param useShortProfile if true, the short text language profile will be used instead of the default profile
 * @param useAllLanguages if true, all supported languages will be used instead of just the default ones
 * @param languageToExpectedAccuracy mapping from language code to expected accuracy
 */
// The argument order matches the Object[] rows built by data(): slots 0-4 are the scalar parameters and
// slot 5 is the expected-accuracy map. Presumably invoked by JUnit's Parameterized runner -- the @RunWith
// annotation is not visible in this view.
public DetectLanguageAccuracyTest(String datasetName,
int substringLength,
int sampleSize,
boolean useShortProfile,
boolean useAllLanguages,
Map<String, Double> languageToExpectedAccuracy) {
// Store the parameters as given; they are only read later, by test().
this.datasetName = datasetName;
this.substringLength = substringLength;
this.sampleSize = sampleSize;
this.useShortProfile = useShortProfile;
this.useAllLanguages = useAllLanguages;
this.languageToExpectedAccuracy = languageToExpectedAccuracy;
/**
 * Perform the common setup tasks for tests of this class: read the datasets, and write the header row of the
 * output CSV if the path.accuracies.out system property is set.
 */
public static void setUp() throws IOException {
// Load both bundled datasets, keyed by the name that each test case refers to through datasetName.
multiLanguageDatasets = new HashMap<>();
multiLanguageDatasets.put("udhr", readMultiLanguageDataset("udhr.tsv"));
multiLanguageDatasets.put("wordpress-translations", readMultiLanguageDataset("wordpress-translations.tsv"));
// A non-empty path.accuracies.out property switches the class into output mode: outputPath is set, and
// test() then writes results instead of asserting on them.
String outputPathStr = System.getProperty("path.accuracies.out");
if (outputPathStr != null && !outputPathStr.isEmpty()) {
logger.warn("File argument given ({}) -- running in output mode without assertions", outputPathStr);
outputPath = Paths.get(outputPathStr);
// Write column headers
// NOTE(review): the statement below is cut off in this view; the fixed columns are followed by more text
// (presumably the per-language columns) that is not shown.
Collections.singletonList("datasetName,substringLength,sampleSize,useShortProfile,useAllLanguages," +
/**
 * Run the test according to the parameters passed to the constructor.
 *
 * If {@link #outputPath} is not null, the test always passes and the results are written to the output path.
 */
public void test() throws IOException {
// Set up the detection service according to the test's parameters
String languageSetting = DEFAULT_LANGUAGES;
// NOTE(review): the body of this branch is not visible in this view; presumably it replaces languageSetting
// with the full language list of the selected profile -- confirm against the complete file.
if (useAllLanguages) {
LangdetectService service = new LangdetectService(
.put("languages", languageSetting)
.put("profile", useShortProfile ? "short-text" : "")
// Fetch the texts of the dataset named by this test case; the TreeSet gives a sorted language iteration order.
Map<String, List<String>> languageToFullTexts = multiLanguageDatasets.get(datasetName);
Set<String> testedLanguages = new TreeSet<>(languageToFullTexts.keySet());
// Classify the texts and calculate the accuracy for each language
Map<String, Double> languageToAccuracy = new HashMap<>(testedLanguages.size());
for (String language : testedLanguages) {
// Kept as a double so the division below is floating-point rather than integer division.
double numCorrect = 0;
List<String> fullTexts = languageToFullTexts.get(language);
for (String text : fullTexts) {
for (String substring : generateSubstringSample(text, substringLength, sampleSize)) {
// A substring counts as correct when the top detected language code equals the dataset's label.
if (Objects.equals(DetectLanguageTest.getTopLanguageCode(service, substring), language)) {
// The denominator is the total number of substrings classified for this language:
// sampleSize substrings for each of its texts.
double accuracy = numCorrect / (fullTexts.size() * sampleSize);
languageToAccuracy.put(language, accuracy);
logger.debug("Language: {} Accuracy: {}", language, accuracy);
// If no output file is given, compare the obtained accuracies to the expected values. Otherwise, write the
// results to the output path without any assertions.
if (outputPath == null) {
// Comparing the sizes first makes a language that is present on only one side fail the test.
assertEquals(languageToExpectedAccuracy.size(), languageToAccuracy.size());
for (Map.Entry<String, Double> entry : languageToAccuracy.entrySet()) {
assertEquals(languageToExpectedAccuracy.get(entry.getKey()), entry.getValue(), ACCURACY_DELTA);
} else {
// Build one CSV row: the five test parameters followed by one accuracy per language in ALL_LANGUAGES order.
List<Object> row = new ArrayList<>();
Collections.addAll(row, datasetName, substringLength, sampleSize, useShortProfile, useAllLanguages);
for (String language : ALL_LANGUAGES.split(",")) {
// NaN marks a language that was not measured for this dataset; data() skips NaN values when reading.
row.add(languageToAccuracy.containsKey(language) ? languageToAccuracy.get(language) : Double.NaN);
/**
 * Read and parse the test parameters from the accuracies.csv resource.
 *
 * @return the parsed parameters
 */
// The name pattern labels each generated test with its first five parameters ({0} to {4}).
@Parameterized.Parameters(name="{0}: substringLength={1} sampleSize={2} useShortProfile={3} useAllLanguages={4}")
public static Collection<Object[]> data() throws IOException {
List<Object[]> data = new ArrayList<>();
try (BufferedReader br = getResourceReader("accuracies.csv")) {
// Skip header line
String line;
while ((line = br.readLine()) != null) {
// Each CSV line is consumed left to right through a comma-delimited Scanner.
Scanner scanner = new Scanner(line).useDelimiter(",");
// NOTE(review): the expressions that fill the array slots named below are not visible in this view.
data.add(new Object[] {
// datasetName,
// substringLength
// sampleSize
// useShortProfile
// useAllLanguages
// languageToExpectedAccuracy
// The remaining columns hold one expected accuracy per language, in ALL_LANGUAGES order.
Map<String, Double> languageToExpectedAccuracy = new HashMap<>();
for (String language : ALL_LANGUAGES.split(",")) {
double expectedAccuracy = scanner.nextDouble();
// NaN values are left out of the map, mirroring the NaN that test() writes for unmeasured languages.
if (!Double.isNaN(expectedAccuracy)) {
languageToExpectedAccuracy.put(language, expectedAccuracy);
// Slot 5 of the row just added is filled in here, once the map is complete.
data.get(data.size() - 1)[5] = languageToExpectedAccuracy;
return data;
/**
 * Read and parse a multi-language dataset from the given path.
 *
 * @param path resource path, where the file is in tab-separated format with two columns: language code and text
 * @return a mapping from each language code found in the file to the texts of this language
 */
private static Map<String, List<String>> readMultiLanguageDataset(String path) throws IOException {
Map<String, List<String>> languageToFullTexts = new HashMap<>();
try (BufferedReader br = getResourceReader(path)) {
String line;
while ((line = br.readLine()) != null) {
// Columns are tab-separated; the first column is the language code.
String[] splitLine = line.split("\t");
String language = splitLine[0];
// Create the per-language list the first time a language code is seen.
// NOTE(review): the statement that appends the text column to this list is not visible in this view.
if (!languageToFullTexts.containsKey(language)) {
languageToFullTexts.put(language, new ArrayList<String>());
return languageToFullTexts;
/**
 * Helper method to open a resource path and return it as a BufferedReader instance.
 */
private static BufferedReader getResourceReader(String path) throws IOException {
// The path is resolved through this class's getResourceAsStream.
// NOTE(review): the statement is cut off after the first InputStreamReader argument; the second argument
// (presumably the charset) is not visible in this view.
return new BufferedReader(new InputStreamReader(DetectLanguageAccuracyTest.class.getResourceAsStream(path),
* Generate a random sample of substrings from the given text.
* Sampling is performed uniformly with replacement from the set of substrings of the provided text, ignoring
* whitespace-only substrings. The random seed is set to a deterministic function of the method's parameters, so
* repeated calls to this method with the same parameters will return the same sample.
* @param text the text from which the substring sample is drawn
* @param substringLength length of each generated substring (set to zero to return a singleton list with the
* text -- sampleSize must be 1 in this case)
* @param sampleSize number of substrings to include in the sample
* @return the sample (a list of strings)
private List<String> generateSubstringSample(String text, int substringLength, int sampleSize) {
if (substringLength == 0 && sampleSize == 1) {
return Collections.singletonList(text);
if (substringLength > text.trim().length()) {
throw new IllegalArgumentException("Provided text is too short.");
Random rnd = new Random(Objects.hash(text, substringLength, sampleSize));
List<String> sample = new ArrayList<>(sampleSize);
while (sample.size() < sampleSize) {
int startIndex = rnd.nextInt(text.length() - substringLength + 1);
String substring = text.substring(startIndex, startIndex + substringLength);
if (!substring.trim().isEmpty()) {
return sample;
