In [47]:
import re 
import pandas as pd 
from nltk.tokenize import WhitespaceTokenizer
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import bigrams
from nltk.lm import MLE,Laplace

# Regex and MLE/Laplace perplexity
--- 

In [9]:
# Load the Stack Overflow question dumps, one CSV per language, and tag every
# row with the language it came from (the label the per-language split relies on).
dadosPT = pd.read_csv('../bases/stackoverflow_portugues.csv').assign(idioma='portugues')
dadosEN = pd.read_csv('../bases/stackoverflow_ingles.csv').assign(idioma='ingles')
dadosES = pd.read_csv('../bases/stackoverflow_espanhol.csv').assign(idioma='espanhol')

# Keep only the question text and its label. `ignore_index=True` already
# produces a fresh 0..n-1 index, so no extra reset_index() is needed.
dados = (
    pd.concat([dadosEN, dadosPT, dadosES], ignore_index=True)
    .loc[:, ['Questão', 'idioma']]
    .rename(columns={'Questão': 'question'})
)
dados.head(2)

Unnamed: 0,question,idioma
0,<p>Here is a piece of C++ code that seems very...,ingles
1,<p>I accidentally committed the wrong files to...,ingles


In [18]:
def removeWords(word, regex, sub=""):
    """Replace every match of ``regex`` with ``sub``.

    Parameters
    ----------
    word : str or iterable of str
        A single text, or a collection of texts (e.g. a list or a pandas
        Series of strings).
    regex : compiled pattern
        Object exposing ``sub(repl, string)``, such as the result of
        ``re.compile``.
    sub : str, optional
        Replacement text; the default ``""`` deletes the matches.

    Returns
    -------
    str or list of str
        The cleaned text when ``word`` is a string; otherwise a list holding
        the cleaned version of each element, in iteration order.
    """
    # isinstance (rather than an exact type comparison) so that str subclasses
    # are treated as one text instead of being iterated character by character.
    if isinstance(word, str):
        return regex.sub(sub, word)
    return [regex.sub(sub, w) for w in word]
    
# Cleaning steps, applied in order, as (compiled pattern, replacement) pairs.
cleaning_steps = [
    # Drop <code>...</code> sections entirely. DOTALL lets "." cross line
    # breaks, and the lazy "*?" stops at the first closing tag, so each code
    # section is removed on its own instead of everything between the first
    # opening and the last closing tag.
    (re.compile(r"<code.*?/code>", re.DOTALL), ""),
    # Remove the remaining HTML tags.
    (re.compile(r"<.*?>"), ""),
    # Drop everything that is neither a word character (letters, digits,
    # underscore) nor whitespace.
    (re.compile(r"[^\w\s]"), ""),
    # Drop digits (raw string: "\d" is not a valid string escape).
    (re.compile(r"\d+"), ""),
    # Collapse any run of whitespace, line breaks included, into one space.
    (re.compile(r"\s+"), " "),
]

for regex, replacement in cleaning_steps:
    dados['question'] = removeWords(dados['question'], regex, sub=replacement)

# Trim leading/trailing spaces and normalise everything to lowercase.
dados['question'] = dados['question'].str.strip().str.lower()

dados['question'].sample()

1471    tengo una librería en c que se conecta con una...
Name: question, dtype: object

In [19]:
from sklearn.model_selection import train_test_split

In [37]:
# For every language, split its cleaned questions into train/test parts
# (33% held out, fixed seed) and store them under the 'train' and 'test' keys.
models = {
    idioma: dict(zip(
        ('train', 'test'),
        train_test_split(
            dados.loc[dados['idioma'] == idioma, 'question'],
            test_size=0.33,
            random_state=42,
        ),
    ))
    for idioma in dados['idioma'].unique()
}

In [42]:
def fit_model(train, model):
    """Fit ``model`` on the texts in ``train`` and return the same object.

    All texts are joined with single spaces and split on whitespace; the
    resulting flat list of words is handed to ``padded_everygram_pipeline``
    with order 2, and its two outputs are passed to ``model.fit``.

    NOTE(review): since the pipeline receives plain strings rather than token
    lists, each word is presumably processed character by character (the
    ``pad_both_ends("alura", 2)`` demo in this notebook behaves that way) —
    confirm this is the intended granularity.

    Parameters
    ----------
    train : iterable of str
        Training texts.
    model : object with a ``fit(ngrams, vocabulary)`` method
        The language model to train, e.g. ``MLE(2)``.

    Returns
    -------
    The ``model`` argument, after fitting.
    """
    joined_text = ' '.join(train)
    words = WhitespaceTokenizer().tokenize(joined_text)
    padded_ngrams, padded_vocab = padded_everygram_pipeline(2, words)
    model.fit(padded_ngrams, padded_vocab)
    return model

In [44]:
# Fit one order-2 MLE model per language on that language's training split and
# keep it alongside the split under the 'modelMLE' key.
for idioma in models:
    models[idioma].update(modelMLE=fit_model(models[idioma]['train'], MLE(2)))

In [53]:
def calculate_perplexity(model, text, tokenizer=WhitespaceTokenizer()):
    """Score ``text`` under ``model`` as the sum of its per-word perplexities.

    ``text`` is split with ``tokenizer``; every word is padded on both ends
    (``n=2``), turned into bigrams, and scored with ``model.perplexity``.
    The individual scores are added up.

    Parameters
    ----------
    model : object with a ``perplexity(ngrams)`` method
        A fitted language model.
    text : str
        Text to score.
    tokenizer : object with a ``tokenize(text)`` method, optional
        Splits ``text`` into words; defaults to a ``WhitespaceTokenizer``.

    Returns
    -------
    The summed perplexity; ``0`` when the tokenizer yields no words.
    """
    total = 0
    for word in tokenizer.tokenize(text):
        padded_word = list(pad_both_ends(word, n=2))
        total += model.perplexity(list(bigrams(padded_word)))
    return total

In [55]:
# Sanity check: score a short English phrase with the model stored under 'ingles'.
print(calculate_perplexity(models['ingles']['modelMLE'], "good morning"))

32.96559450762038


In [64]:
def perplexity_argmax(text):
    """Return the language whose model gives ``text`` the lowest perplexity.

    Despite the "argmax" in the name, the winner is the *minimum* score:
    ``text`` is scored with ``calculate_perplexity`` against the 'modelMLE'
    entry of every language in the module-level ``models`` dict, and the key
    with the smallest value is returned.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    The key of ``models`` with the lowest score.
    """
    scores = pd.Series({
        idioma: calculate_perplexity(entry['modelMLE'], text)
        for idioma, entry in models.items()
    })
    return scores.idxmin()

In [69]:
# Try the classifier on a short phrase; the cell output is the winning language key.
perplexity_argmax("deixa os garoto brinca")

'portugues'

In [11]:
# Scratch demo of the two NLTK helpers used by calculate_perplexity.
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends

# Bigrams over a list of tokens. This value is computed but not shown: a cell
# only renders its last expression.
list(bigrams(["nick","joe","jonas"]))

# On a plain string, padding and bigram extraction work character by
# character, as the displayed result shows.
list(bigrams(pad_both_ends("alura",2)))


[('<s>', 'a'), ('a', 'l'), ('l', 'u'), ('u', 'r'), ('r', 'a'), ('a', '</s>')]