In [3]:
import fasttext
from fasttext import tokenize

In [4]:
import re

In [36]:
import os

In [37]:
# Using lid.176.bin, which is faster and slightly more accurate;
# a compressed version is available at https://fasttext.cc/docs/en/language-identification.html
model_path = os.path.join("model", "lid.176.bin")
fasttext_language_model = fasttext.load_model(model_path)



In [20]:
# Pre-compiled pattern matching any single whitespace character
# (space, tab, newline, ...); compiled once so it can be reused by every call.
white_space_pattern = re.compile(r"\s")

In [147]:
def preprocess_text_for_language_detection(text: str) -> str:
    """
    Cleans the text as per fasttext requirements.
    The requirements can be found here: https://pypi.org/project/fasttext/

    Steps: coerce to str, turn every whitespace character into a plain
    space, tokenize with fastText's tokenizer, rejoin the tokens with
    single spaces, strip any "</s>" marker and lower-case the result.

    :text: str: text to clean
    :returns: str: cleaned, lower-cased text
    """
    # fastText assumes UTF-8 encoded text; coerce non-str input to str
    text = str(text)

    # fastText is not aware of UTF-8 whitespace.
    # Replace every whitespace character (including newlines) with a space.
    # Pattern.sub takes (replacement, string) -- in that order.
    text = white_space_pattern.sub(" ", text)

    # Tokenize text with the fastText tokenizer and rejoin with single spaces
    tokens = tokenize(text)
    text = " ".join(tokens)

    # Remove the "</s>" marker, as it affects the model accuracy.
    # NOTE(review): presumably this is the end-of-sentence token the fastText
    # tokenizer emits for newlines -- confirm against the fastText docs.
    text = text.replace("</s>", "")

    return text.lower()

In [148]:
# Effect of new line on language detection
for sample in (
    "The door bell rang twice,\n but no one opened the door",
    "The door bell rang twice, but no one opened the door",
):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.44373196]))
(('__label__en',), array([0.7606498]))


In [149]:
# Effect of exclamation, at the end of sentence
# (the first call asks for the top 2 labels, the second for the default 1)
with_mark = " ".join(tokenize("The door opened!"))
without_mark = " ".join(tokenize("The door opened"))
print(fasttext_language_model.predict(with_mark, 2))
print(fasttext_language_model.predict(without_mark))

(('__label__en', '__label__nl'), array([0.58429289, 0.37814581]))
(('__label__en',), array([0.75521648]))


In [150]:
# Effect of exclamation, at the end of sentence
for sample in (
    "You scared the life out of me!",
    "You scared the life out of me",
):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.91090018]))
(('__label__en',), array([0.80553901]))


In [151]:
# Effect of exclamation, at the end of sentence
for sample in ("It's a boy!", "It's a boy"):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.98612791]))
(('__label__en',), array([0.95733058]))


In [152]:
# Effect of exclamation, at the end of sentence
for sample in ("We won!", "We won"):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.78890896]))
(('__label__en',), array([1.00000072]))


In [153]:
# Effect of question mark, at the end of sentence
for sample in ("Do you drink?", "Do you drink"):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.95603132]))
(('__label__en',), array([0.97222412]))


In [154]:
# Effect of full stop, at the end of sentence
for sample in ("John opened the door.", "John opened the door"):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.97491264]))
(('__label__en',), array([0.79377073]))


In [155]:
# Effect of quotes, at the end of sentence
# (the second sample drops both the quotation marks and the trailing full stop)
for sample in (
    'I told her, "Take the plates away."',
    'I told her, Take the plates away',
):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.98221743]))
(('__label__en',), array([0.95327079]))


In [156]:
# Effect of lowercase
original_case = "The door opened"
for sample in (original_case, original_case.lower()):
    print(fasttext_language_model.predict(" ".join(tokenize(sample))))

(('__label__en',), array([0.75521648]))
(('__label__en',), array([0.79327965]))


In [157]:
# Sample sentence used in the following cells to compare raw vs. preprocessed predictions
text = "¿Cómo estás"

In [158]:
# Run the sample through the preprocessing helper and display the cleaned string
cleaned_text = preprocess_text_for_language_detection(text)
cleaned_text

'¿cómo estás'

In [160]:
# Predictions for the raw sample and for its preprocessed form, side by side
(
    fasttext_language_model.predict(text),
    fasttext_language_model.predict(cleaned_text),
)

((('__label__es',), array([1.00004685])),
 (('__label__es',), array([1.00004709])))

In [217]:
def identify_languages(text: str, no_of_languages: int = 1, threshold: float = 0.0):
    """
    Predicts the most likely language(s) of a piece of text.

    The text is first cleaned with preprocess_text_for_language_detection
    and the cleaned text is what gets passed to the fastText model.

    :text: str: text whose language should be identified
    :no_of_languages: int: maximum number of labels to request from the model
    :threshold: float: threshold forwarded to the model's predict call
    :returns: list of (language_code, probability) tuples, in the order
              returned by the model
    """
    clean_text = preprocess_text_for_language_detection(text)

    # Predict on the cleaned text (not the raw input), so the preprocessing
    # above actually takes effect.
    labels, probabilities = fasttext_language_model.predict(
        clean_text, no_of_languages, threshold=threshold
    )

    # Strip the "__label__" prefix rather than slicing the last two
    # characters, so codes that are not exactly two letters long stay intact.
    label_prefix = "__label__"
    result = [
        (
            label[len(label_prefix):] if label.startswith(label_prefix) else label,
            probability,
        )
        for label, probability in zip(labels, probabilities)
    ]
    return result

In [218]:
# Ask for the four most likely languages of a short English sentence
identify_languages("the door is open", no_of_languages=4)

[('en', 0.8984457850456238),
 ('nl', 0.09827558696269989),
 ('ru', 0.0020195997785776854),
 ('it', 0.0004695519746746868)]