In [1]:
from wordfreq import top_n_list, random_words
import numba
import numpy as np
import random
from collections import Counter

# Fixed seed for the standard-library RNG so that every Restart & Run All
# draws the same language sequence and the same random documents.
SEED = 2020
random.seed(SEED)

In [2]:
# Somewhat similar languages, keyed by their two-letter code.
LANGUAGES = {
    'es': 'Spanish',
    'en': 'English',
    'pt': 'Portuguese',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'nl': 'Dutch'
}

# Class order used by the rest of the notebook: class k is LANG_CODES[k].
LANG_CODES = list(LANGUAGES.keys())

# Population of native speakers, in millions.
POPULATION = {
    'es': 480,
    'en': 379,
    'pt': 221,
    'fr': 77.2,
    'de': 76.1,
    'it': 64.8,
    'nl': 23.1
}

# Deliberately override the real figures with the same weight (100) for every
# language, so that documents are sampled uniformly across languages.
# Updated in place with a single statement: no per-item return values are
# produced, so the cell displays nothing.
POPULATION.update(dict.fromkeys(POPULATION, 100))

In [3]:
# Number of synthetic documents to generate.
M = 100

# Number of classes (languages).
n = len(LANG_CODES)

# Parameters handed to wordfreq's random_words for every synthetic document.
# NOTE(review): bits_per_word presumably restricts sampling to the 2**14 most
# frequent words of the language -- confirm against the wordfreq docs.
WORDS_PER_DOC = 50000
BITS_PER_WORD = 14

# Never filled: generated documents are folded straight into the counters
# below instead of being stored. The name is kept only so that any code
# expecting it to exist keeps working.
docs = []

# One word counter per language, aligned with LANG_CODES.
counters = [Counter() for _ in LANG_CODES]

# Sampling weights looked up by language code, so they line up with
# LANG_CODES regardless of the key order of POPULATION.
class_weights = [POPULATION[code] for code in LANG_CODES]

# Draw the language of each document with probability proportional to its
# weight, then generate the document and accumulate its word counts.
for k in random.choices(range(len(LANG_CODES)), weights=class_weights, k=M):

    # Language.
    lang = LANG_CODES[k]

    # Document, as a list of words.
    doc = random_words(
        lang=lang,
        wordlist='best',
        nwords=WORDS_PER_DOC,
        bits_per_word=BITS_PER_WORD,
    ).split()

    # Count.
    counters[k].update(doc)

# Vocabulary: every word seen in any language. Sorted so that the row order
# of M_jk does not depend on string hash randomisation between kernels.
unique_words = sorted(set().union(*counters))

# Vocabulary size.
N = len(unique_words)

# M_k[k]     : total number of words generated for language k.
# M_w        : total number of words over all languages.
# M_jk[j, k] : occurrences of word unique_words[j] in language k.
M_k = np.array([sum(counter.values()) for counter in counters])
M_w = M_k.sum()
M_jk = np.array([[counter[w] for counter in counters] for w in unique_words])

# Fix for zero counts (add-one smoothing): pretend every vocabulary word was
# seen once more in every language, so np.log never receives a zero. The
# totals are bumped accordingly: N extra words per language, N * n overall,
# and one extra document per language.
M += n
M_w += N * n
M_k += N
M_jk += 1

In [4]:
def classify(doc):
    """Return the code in LANG_CODES of the language that best explains `doc`.

    The text is stripped of the characters ``,.?!``, lower-cased and split on
    whitespace; words that are not in ``unique_words`` are ignored. For each
    language k the score is

        sum_j log(M_jk[j, k]) - (len(words) - 1) * log(M_k[k])

    summed over the remaining words j. Up to a term that is the same for
    every language, this is the log of (M_k / total) * prod_j (M_jk / M_k),
    i.e. a naive-Bayes score whose class prior is the language's share of
    all counted words.

    Parameters
    ----------
    doc : str
        Text to classify.

    Returns
    -------
    str
        The entry of LANG_CODES with the highest score (the first one on
        ties, as given by ``argmax``).

    Raises
    ------
    AssertionError
        If no word of `doc` is in the vocabulary.
    """
    # Remove some symbols.
    text = doc
    for symbol in ',.?!':
        text = text.replace(symbol, '')

    # Map each vocabulary word to its row in M_jk with one pass over the
    # vocabulary, instead of a linear list scan per token for both the
    # membership test and the index lookup. setdefault keeps the first
    # position of a word, matching list.index.
    row_of = {}
    for row, word in enumerate(unique_words):
        row_of.setdefault(word, row)

    # Drop the words that are not in the vocabulary.
    rows = [row_of[w] for w in text.lower().split() if w in row_of]

    assert len(rows) > 0, "no word of the document is in the vocabulary"

    # Prior / normalisation term, then one log-count row per known word.
    scores = - (len(rows) - 1) * np.log(M_k)

    for row in rows:
        scores += np.log(M_jk[row])

    k = scores.argmax()

    return LANG_CODES[k]
    

In [5]:
# Short sample sentences to run through the classifier; several end with a
# word borrowed from a different language.
DOCS = [
    "Hola, mi nombre es Jorge, hi",
    "where is my bag? da",
    "Ich hatte gerne ein cerveza,",
    "je ne sais quoi hola,",
    "eu não falo, bier, ",
    "io non parlo italiano, what",
    "Ik spreek geen nederlands, muchas gracias"
]

# Show each sample next to the language code predicted for it. map() is lazy,
# so every sample is classified right before its own line is printed.
for doc, lang in zip(DOCS, map(classify, DOCS)):
    print(f'({lang}) "{doc}"')

(es) "Hola, mi nombre es Jorge, hi"
(de) "where is my bag? da"
(de) "Ich hatte gerne ein cerveza,"
(fr) "je ne sais quoi hola,"
(pt) "eu não falo, bier, "
(it) "io non parlo italiano, what"
(nl) "Ik spreek geen nederlands, muchas gracias"
