Skip to content

Commit

Permalink
update to language-detector 0.5
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Apr 6, 2015
1 parent f836230 commit 81f1372
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 14 deletions.
2 changes: 1 addition & 1 deletion languagetool-core/pom.xml
Expand Up @@ -106,7 +106,7 @@
<dependency>
<groupId>com.optimaize.languagedetector</groupId>
<artifactId>language-detector</artifactId>
<version>0.4</version>
<version>0.5</version>
<exclusions>
<exclusion>
<groupId>stax</groupId>
Expand Down
Expand Up @@ -21,6 +21,7 @@
import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
Expand Down Expand Up @@ -54,7 +55,7 @@ public class LanguageIdentifier {
private static final List<String> ignoreLangCodes = Arrays.asList("ast", "gl");

// languages that we offer profiles for as they are not yet supported by language-detector:
private static final List<String> externalLangCodes = Arrays.asList("km", "eo");
private static final List<String> externalLangCodes = Arrays.asList("eo");

private final LanguageDetector languageDetector;
private final TextObjectFactory textObjectFactory;
Expand All @@ -81,8 +82,8 @@ private static List<String> getLanguageCodes() {
continue;
}
if ("zh".equals(langCode)) {
langCodes.add("zh-cn");
langCodes.add("zh-tw");
langCodes.add("zh-CN");
langCodes.add("zh-TW");
} else {
langCodes.add(langCode);
}
Expand Down Expand Up @@ -122,14 +123,11 @@ public Language detectLanguage(String text) {
@Nullable
private String detectLanguageCode(String text) {
TextObject textObject = textObjectFactory.forText(text);
Optional<String> lang = languageDetector.detect(textObject);
Optional<LdLocale> lang = languageDetector.detect(textObject);
// uncomment the following line for debugging:
//System.out.println(languageDetector.getProbabilities(textObject));
if (lang.isPresent()) {
if ("zh-cn".equals(lang.get()) || "zh-tw".equals(lang.get())) {
return "zh";
}
return lang.get();
return lang.get().getLanguage();
} else {
return null;
}
Expand Down
3 changes: 3 additions & 0 deletions languagetool-standalone/CHANGES.md
Expand Up @@ -29,6 +29,9 @@
* `AbstractCompoundRule.setShort(String)` has been removed and added as
a constructor parameter instead.

#### Internal
* updated to language-detector 0.5


## Older versions

Expand Down
Expand Up @@ -39,8 +39,7 @@ public void testDetection() {
langAssert("en", "This is an English text");
langAssert("fr", "Le mont Revard est un sommet du département français ...");
// some test sentences from the "Linux" article of Wikipedia:
// TODO: results not stable - comment in once https://github.com/optimaize/language-detector/issues/14 is resolved
/*langAssert("be", "Першапачаткова Linux распрацоўваўся і выкарыстоўваўся асобнымі аматарамі на сваіх персанальных камп'ютарах.");
langAssert("be", "Першапачаткова Linux распрацоўваўся і выкарыстоўваўся асобнымі аматарамі на сваіх персанальных камп'ютарах.");
langAssert("ca", "Aquest sistema operatiu va créixer gràcies al treball col·laboratiu de programadors de tot el món ...");
langAssert("zh", "Linux最初是作为支持英特尔x86架构的个人电脑的一个自由操作系统。目前Linux已经被移植到更多的计算机硬件平台");
langAssert("da", "Linux-distributionerne har traditionelt deres største udbredelse på servere, men er hastigt på vej på almindelige pc'er.");
Expand All @@ -64,14 +63,12 @@ public void testDetection() {
langAssert("tl", "Ang Linux ay isang operating system kernel para sa mga operating system na humahalintulad sa Unix.");
langAssert("ta", "Linux பற்றி பிற கட்டுரைகளில் தேடிப்பாருங்கள்.");
langAssert("uk", "Лі́нукс — загальна назва UNIX-подібних операційних систем на основі однойменного ядра.");
*/
// not yet in language-detector 0.4:
langAssert("km", "អ្នក\u200Bអាច\u200Bជួយ\u200Bលើក\u200Bស្ទួយ\u200Bវិគីភីឌាភាសាខ្មែរ\u200Bនេះ\u200Bឱ្យ\u200Bមាន\u200Bលក្ខណៈ");
// not yet in language-detector 0.5:
langAssert("eo", "Imperiestraj pingvenoj manĝas ĉefe krustacojn kaj malgrandajn ...");
}

@Test
@Ignore("this test isn't stable due to https://github.com/optimaize/language-detector/issues/14") // TODO: re-activate
public void testKnownLimitations() {
// not activated because it impairs detection of Spanish, so ast and gl may be mis-detected:
langAssert("es", "L'Iberorrománicu o Iberromance ye un subgrupu de llingües romances que posiblemente ..."); // ast
Expand Down

0 comments on commit 81f1372

Please sign in to comment.