In [2]:
from wordfreq import top_n_list, random_words
import numpy as np
import random

# Seed Python's global `random` generator so the class labels sampled with
# `random.choices` below are reproducible on a fresh kernel.
# NOTE(review): wordfreq's `random_words` presumably draws from this same
# global generator, which would make the documents reproducible too - confirm.
random.seed(2020)

In [3]:
# Somewhat similar languages, kept in a single table so that codes, names and
# populations can never drift out of order with respect to each other.
# Each row: (two-letter language code, English name, native speakers in millions).
_LANGUAGE_TABLE = (
    ('es', 'Spanish', 480),
    ('en', 'English', 379),
    ('pt', 'Portuguese', 221),
    ('fr', 'French', 77.2),
    ('de', 'German', 76.1),
    ('it', 'Italian', 64.8),
    ('nl', 'Dutch', 23.1),
)

# Language code -> human-readable name.
LANGUAGES = {code: name for code, name, _ in _LANGUAGE_TABLE}

# Class index k -> language code (the position in this list is the class id).
LANG_CODES = [code for code, _, _ in _LANGUAGE_TABLE]

# Population of native speakers in millions, keyed by language code.
POPULATION = {code: millions for code, _, millions in _LANGUAGE_TABLE}

# Disabled experiment: overriding every POPULATION value with the same number
# (e.g. 100) would make all classes equally likely when sampling documents.

In [4]:
# Number of documents.
M = 10000

# Number of classes (one per language code).
n = len(LANG_CODES)

# Generated documents, stored as (class index, text) pairs.
docs = []

# Collect the unique words seen across all documents.
unique_words = set()

# Sampling weights listed in LANG_CODES order, so class index k always refers
# to LANG_CODES[k] regardless of the key order inside POPULATION.
class_weights = [POPULATION[code] for code in LANG_CODES]

# Create documents in proportion to each language's native-speaker population.
# All M labels are drawn up front, before any document is generated, which
# keeps the order of draws from the global random generator unchanged.
for k in random.choices(range(n), weights=class_weights, k=M):

    # Pick 10 random words; per wordfreq's documentation, bits_per_word=8
    # restricts the choice to the 2**8 most common words of the language.
    doc = random_words(lang=LANG_CODES[k], wordlist='best', nwords=10, bits_per_word=8)
    docs.append((k, doc))

    # Grow the vocabulary in place instead of rebuilding the set every time.
    unique_words.update(doc.split())

# Number of distinct words (vocabulary size).
N = len(unique_words)

# Parameters.
# M_k[k]     : number of documents belonging to class k.
# M_jk[j, k] : number of documents of class k that contain word j.
M_k = np.zeros(n)
M_jk = np.zeros((N, n))

# Turn the unique words into a list. Sorting makes the row order of M_jk
# deterministic (plain set iteration order for strings can change from one
# interpreter run to the next because of hash randomisation).
unique_words = sorted(unique_words)

# Word -> row index, so each lookup is O(1) instead of a linear list scan.
word_index = {word: j for j, word in enumerate(unique_words)}

# Accumulate the counts over the generated documents.
for k, doc in docs:

    # One more document in this class.
    M_k[k] += 1

    # Bernoulli model: a word counts once per document, however often it
    # is repeated inside that document.
    for word in set(doc.split()):
        M_jk[word_index[word], k] += 1


# Fix for zero counts: pretend every class also received N extra documents,
# each one containing a different vocabulary word. Every M_jk entry then
# becomes >= 1 and every M_k - M_jk difference stays positive (for N > 1),
# so the logarithms taken at classification time are always finite.
M += N * n
M_k += N
M_jk += 1

In [5]:
def phi_b(doc):
    """Return the binary bag-of-words vector of ``doc``.

    The text is split on whitespace. Entry ``j`` of the returned float array
    of length ``N`` is 1 when ``unique_words[j]`` occurs among the tokens and
    0 otherwise. Tokens that are not in ``unique_words`` are ignored.
    """
    tokens = set(doc.split())

    # Positions of the vocabulary words that appear in the document.
    hits = [pos for pos, term in enumerate(unique_words) if term in tokens]

    indicator = np.zeros(N)
    # Explicit integer dtype keeps the indexing valid when there are no hits.
    indicator[np.array(hits, dtype=np.intp)] = 1

    return indicator

In [6]:
def classify(doc):
    """Return the language code that best explains ``doc``.

    The document is scored with a Bernoulli naive Bayes model built from the
    module-level counts ``M_k`` (documents per class) and ``M_jk`` (documents
    per class containing each word). For class ``k`` the score is

        r_k = sum_j log(c_jk) - (N - 1) * log(M_k)

    where ``c_jk = M_jk[j, k]`` if word ``j`` occurs in ``doc`` and
    ``M_k[k] - M_jk[j, k]`` otherwise. This equals
    ``log(M_k) + sum_j log(c_jk / M_k)``: the log class prior plus the
    per-word log likelihoods, up to a constant that is the same for every
    class and therefore does not affect the argmax.

    Parameters
    ----------
    doc : str
        Free text to classify.

    Returns
    -------
    str
        The element of ``LANG_CODES`` with the highest score.
    """
    # Normalise case before matching: the vocabulary comes from wordfreq's
    # word lists, which are case-folded (per wordfreq's tokenization docs),
    # so a capitalised "Hola" or "Ich" would otherwise never match.
    doc = doc.casefold()

    # Remove some symbols.
    for s in ',.?!':
        doc = doc.replace(s, '')

    # Binary (Bernoulli) representation of the document, shape (N,).
    d = phi_b(doc)

    # For every (word, class) pair pick the count that agrees with the
    # document: M_jk when the word is present, M_k - M_jk when it is absent.
    # d[:, None] broadcasts the (N,) indicator against the (N, n) counts.
    word_present = d[:, None].astype(bool)
    agreeing_counts = np.where(word_present, M_jk, M_k - M_jk)

    # Unnormalised log posterior per class.
    r = np.log(agreeing_counts).sum(axis=0) - (N - 1) * np.log(M_k)

    k = r.argmax()

    return LANG_CODES[k]
    

In [7]:
# Demo sentences, one per supported language. All but the first end with a
# word borrowed from another language, to check that a single foreign token
# does not flip the prediction.
DOCS = [
    "Hola, mi nombre es Jorge",
    "where is my bag? da",
    "Ich hatte gerne ein cerveza",
    "je ne sais quoi hola",
    "eu no falo, bier",
    "io non parlo italiano, what",
    "Ik spreek geen nederlands, si"
]

# Classify every sentence first, then report each prediction next to its text.
predictions = [(classify(doc), doc) for doc in DOCS]

for lang, doc in predictions:
    print(f'({lang}) "{doc}"')

(es) "Hola, mi nombre es Jorge"
(en) "where is my bag? da"
(de) "Ich hatte gerne ein cerveza"
(fr) "je ne sais quoi hola"
(pt) "eu no falo, bier"
(it) "io non parlo italiano, what"
(nl) "Ik spreek geen nederlands, si"
