In [1]:
from utils import translate_iso_639_1_to_639_3

In [2]:
# Code-switched Polish/English paragraph used as the input for the detectors
# exercised in this notebook.
SAMPLE_TEXT = (
    "Yesterday, miałem naprawdę busy day w pracy. "
    "Było mnóstwo meetings i deadlines, które musiałem dotrzymać. "
    "W połowie dnia, mój boss poprosił mnie o przygotowanie prezentacji "
    "na bardzo ważny client meeting. "
    "Spędziłem hours working on it, ensuring every detail był perfect. "
    "When I finally finished, czułem się naprawdę exhausted."
)

## langdetect
> https://github.com/Mimino666/langdetect
>
> Python port of the Java `language-detection` library (https://github.com/shuyo/language-detection). The approach is described in these slides: https://www.slideshare.net/slideshow/language-detection-library-for-java/6014274#1
>
> **TL;DR**: Naive Bayes algorithm with character n-grams as features; open source; tested on over 9,000 news articles in 49 languages with an accuracy of 99.77%; can be inaccurate for short texts.
>
> License: Apache License 2.0

In [17]:
import langdetect

# Per the langdetect README, detection is non-deterministic unless the factory
# seed is fixed; pin it so re-running the notebook reproduces the same results.
langdetect.DetectorFactory.seed = 0

In [18]:
# Map every candidate language code reported by langdetect to the probability
# it assigns to that language for SAMPLE_TEXT.
langdetect_result = dict(
    (candidate.lang, candidate.prob)
    for candidate in langdetect.detect_langs(SAMPLE_TEXT)
)
langdetect_result

{'pl': 0.8571418233629462, 'en': 0.14285650999355323}

In [19]:
# Re-key the probabilities by ISO 639-3 code so they can be compared with the
# other detectors' results.
# NOTE(review): langdetect can also emit codes that are not plain ISO 639-1
# (e.g. 'zh-cn', 'zh-tw') — confirm the helper handles those.
langdetect_result_iso_639_3 = dict(
    (translate_iso_639_1_to_639_3(lang_code), probability)
    for lang_code, probability in langdetect_result.items()
)
langdetect_result_iso_639_3

{'pol': 0.8571418233629462, 'eng': 0.14285650999355323}

In [20]:
# Final prediction: the ISO 639-3 code with the highest probability.
pred = max(
    langdetect_result_iso_639_3.items(),
    key=lambda code_and_prob: code_and_prob[1],
)[0]
pred

'pol'

In [22]:
# Shortcut: take the single language code returned by langdetect.detect() and
# translate it directly, without building the probability dict first.
pred = translate_iso_639_1_to_639_3(langdetect.detect(SAMPLE_TEXT))
pred

'pol'

## lingua-py

> https://github.com/pemistahl/lingua-py
>
> Python bindings to `Lingua` implemented in Rust
>
> **TL;DR**: suitable for short text and mixed-language text; utilizes a language model based on character n-grams (1-5); supports multithreading
>
> License: Apache License 2.0

In [7]:
from lingua import LanguageDetectorBuilder

In [8]:
# Build a detector over all languages lingua supports, with the language
# models preloaded (as the builder method's name indicates).
detector = (
    LanguageDetectorBuilder.from_all_languages()
    .with_preloaded_language_models()
    # For mostly long texts, or when resources need to be saved, uncomment the
    # next line to enable low accuracy mode, which loads only a small subset of
    # the language models into memory:
    # .with_low_accuracy_mode()
    .build()
)

In [9]:
# Compute a confidence value per language for SAMPLE_TEXT and preview the
# first five entries (they appear to come back ordered by confidence, highest
# first — see the output).
lingua_result = detector.compute_language_confidence_values(SAMPLE_TEXT)
lingua_result[:5]

[ConfidenceValue(language=Language.POLISH, value=1),
 ConfidenceValue(language=Language.TAGALOG, value=0.00000000000000000000000000000000109596124442021),
 ConfidenceValue(language=Language.LATIN, value=0.0000000000000000000000000000000000000000000000030292104252334643),
 ConfidenceValue(language=Language.AFRIKAANS, value=0.000000000000000000000000000000000000000000000000000000004528876621020498),
 ConfidenceValue(language=Language.BASQUE, value=0.00000000000000000000000000000000000000000000000000000000000000007637790572887204)]

In [31]:
# Order the confidence values from highest to lowest and preview the top five.
# NOTE(review): this rebinds lingua_result to a sorted copy of itself; the
# values already appear to be in descending order, so the sort may be
# redundant — confirm against the lingua-py docs.
lingua_result = sorted(
    lingua_result,
    key=lambda confidence: confidence.value,
    reverse=True,
)
lingua_result[:5]

[ConfidenceValue(language=Language.POLISH, value=1),
 ConfidenceValue(language=Language.TAGALOG, value=0.00000000000000000000000000000000109596124442021),
 ConfidenceValue(language=Language.LATIN, value=0.0000000000000000000000000000000000000000000000030292104252334643),
 ConfidenceValue(language=Language.AFRIKAANS, value=0.000000000000000000000000000000000000000000000000000000004528876621020498),
 ConfidenceValue(language=Language.BASQUE, value=0.00000000000000000000000000000000000000000000000000000000000000007637790572887204)]

In [13]:
# Minimum confidence a language needs in order to be kept in the result.
det_threshold = 0.01

# Keep only the languages whose confidence exceeds the threshold, keyed by
# their lowercase ISO 639-3 code.
lingua_result_iso_639_3 = dict(
    (confidence.language.iso_code_639_3.name.lower(), confidence.value)
    for confidence in lingua_result
    if confidence.value > det_threshold
)
lingua_result_iso_639_3

{'pol': 1.0}

In [11]:
# Final prediction: the ISO 639-3 code with the highest confidence value.
pred = max(
    lingua_result_iso_639_3.items(),
    key=lambda code_and_confidence: code_and_confidence[1],
)[0]
pred

'pol'