In [29]:
import re 
import pandas as pd 
from nltk.tokenize import WhitespaceTokenizer
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import bigrams
from nltk.lm import MLE,Laplace

# Regex and MLE/Laplace perplexity
--- 

In [30]:
# Load one question dump per language (judging by the file names, Stack Overflow
# exports) and tag every row with the language it belongs to.
dadosPT = pd.read_csv('../bases/stackoverflow_portugues.csv').assign(idioma='portugues')
dadosEN = pd.read_csv('../bases/stackoverflow_ingles.csv').assign(idioma='ingles')
dadosES = pd.read_csv('../bases/stackoverflow_espanhol.csv').assign(idioma='espanhol')

# Stack the three frames in the order English, Portuguese, Spanish and keep only the
# question text and its language label. `ignore_index=True` already yields a fresh
# 0..n-1 index, so no extra reset_index is needed.
dados = (
    pd.concat([dadosEN, dadosPT, dadosES], ignore_index=True)[['Questão', 'idioma']]
    .rename(columns={'Questão': 'question'})
)
dados.head(2)

Unnamed: 0,question,idioma
0,<p>Here is a piece of C++ code that seems very...,ingles
1,<p>I accidentally committed the wrong files to...,ingles


In [31]:
def removeWords(word,regex,sub=""):
    """Replace every match of ``regex`` with ``sub`` in a string or in each item of an iterable.

    Parameters
    ----------
    word : str or iterable of str
        A single string, or any iterable of strings (for example a list or a
        pandas Series).
    regex : compiled pattern
        Object exposing ``sub(repl, string)``, such as the result of ``re.compile``.
    sub : str, optional
        Replacement text; defaults to the empty string, which deletes the matches.

    Returns
    -------
    str or list of str
        The cleaned string when ``word`` is a string, otherwise a list with one
        cleaned string per item of ``word``.
    """
    # isinstance (rather than an exact type comparison) also treats str subclasses
    # as a single string instead of iterating over their characters.
    if isinstance(word, str):
        return regex.sub(sub, word)
    return [regex.sub(sub, item) for item in word]
    
# Cleaning pipeline: each step is (compiled pattern, replacement), applied in this order.
# Raw strings keep regex escapes such as \d from being read as string escapes.
cleaning_steps = [
    # Remove <code>...</code> blocks, including ones spanning several lines (DOTALL lets
    # "." match newlines). The lazy "*?" stops at the first closing tag, so the text
    # between two separate code blocks is kept.
    (re.compile(r"<code.*?/code>", re.DOTALL), ""),
    # Remove all remaining HTML tags.
    (re.compile(r"<.*?>"), ""),
    # Drop everything except word characters (letters, digits, underscore) and whitespace.
    (re.compile(r"[^\w\s]"), ""),
    # Drop numbers.
    (re.compile(r"\d+"), ""),
    # Collapse any run of whitespace (spaces, tabs, newlines) into a single space.
    (re.compile(r"\s+"), " "),
]
for regex, replacement in cleaning_steps:
    dados['question'] = removeWords(dados['question'], regex, sub=replacement)

# Strip leading/trailing blanks, then lowercase everything.
dados['question'] = dados['question'].str.strip().str.lower()

# Show one random cleaned question as a quick visual check.
dados['question'].sample()

1011    hice un commit e inmediatamente después noté q...
Name: question, dtype: object

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# For every language, split its questions 80/20 into train and test (fixed seed) and
# store the two parts under the 'train' and 'test' keys.
models = {
    idioma: dict(zip(
        ('train', 'test'),
        train_test_split(
            dados.loc[dados['idioma'] == idioma, 'question'],
            test_size=0.2,
            random_state=42,
        ),
    ))
    for idioma in dados['idioma'].unique()
}

In [34]:
def fit_model(train,model):
    """Fit ``model`` on the character bigrams of every word in ``train`` and return it.

    The texts are joined with spaces and split on whitespace, and each resulting
    word is handed to ``padded_everygram_pipeline`` as one sequence. Because a word
    is a plain string, the sequence items are its characters, so the model learns
    padded character n-grams up to order 2.

    Parameters
    ----------
    train : iterable of str
        Training texts.
    model : language model
        Object exposing ``fit(ngrams, vocabulary)``; it is fitted in place.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    words = WhitespaceTokenizer().tokenize(' '.join(train))
    # The pipeline returns (training n-grams, padded symbol stream), which are exactly
    # the two positional arguments fit() expects.
    model.fit(*padded_everygram_pipeline(2, words))
    return model

In [45]:
# Fit one MLE and one Laplace model of order 2 per language on that language's
# training split, storing them next to the 'train'/'test' entries.
for idioma in models:
    models[idioma].update(
        modelMLE=fit_model(models[idioma]['train'], MLE(2)),
        modelLaplace=fit_model(models[idioma]['train'], Laplace(2)),
    )

In [46]:
def calculate_perplexity(model,text,tokenizer=None):
    """Sum the per-word character-bigram perplexity of ``text`` under ``model``.

    Each token produced by ``tokenizer`` is padded on both ends, turned into the
    list of bigrams over its characters, and scored with ``model.perplexity``;
    the scores of all tokens are added up.

    Parameters
    ----------
    model : language model
        Object exposing ``perplexity(ngrams)``.
    text : str
        Text to score.
    tokenizer : object with ``tokenize(text)``, optional
        Splits ``text`` into words. When omitted, a ``WhitespaceTokenizer`` is
        created for the call.

    Returns
    -------
    number
        Sum of the per-word perplexities; ``0`` when ``text`` yields no tokens.
    """
    # Build the default lazily instead of in the signature, so no tokenizer instance
    # is created at definition time and shared by every call.
    if tokenizer is None:
        tokenizer = WhitespaceTokenizer()

    total = 0
    for word in tokenizer.tokenize(text):
        padded_chars = list(pad_both_ends(word, n=2))
        total += model.perplexity(list(bigrams(padded_chars)))
    return total

In [47]:
# Sanity check: score the same English phrase with both English models
# (MLE on the first line, Laplace on the second).
print(
    calculate_perplexity(models['ingles']['modelMLE'], "good morning"),
    calculate_perplexity(models['ingles']['modelLaplace'], "good morning"),
    sep="\n",
)

33.8325426888377
33.915936843206005


In [48]:
def perplexity_argmax(text,use_model='modelMLE'):
    """Return the language whose model scores ``text`` with the lowest perplexity.

    Despite the name, the result is the arg*min*: the key of ``models`` with the
    smallest ``calculate_perplexity`` value.

    Parameters
    ----------
    text : str
        Text to classify.
    use_model : str, optional
        Key of the fitted model to use inside each language entry of ``models``
        (``'modelMLE'`` by default).

    Returns
    -------
    The key of ``models`` whose selected model gives the smallest score.
    """
    languages = list(models)
    scores = [
        calculate_perplexity(models[language][use_model], text)
        for language in languages
    ]
    return pd.Series(scores, index=languages).idxmin()

In [49]:
# Spot-check the classifier on an informal Portuguese sentence, using the default
# ('modelMLE') models.
perplexity_argmax("deixa os garoto brinca power ranger ")

'portugues'

In [50]:
# Per-language accuracy on the held-out test split, first with the MLE models and then
# with the Laplace ones. `idioma` and `predicted` stay bound after the loops to the last
# language evaluated with the Laplace models.
for heading, model_key in (("Using MLE", 'modelMLE'), ("Using Laplace", 'modelLaplace')):
    print(heading)
    for idioma in models.keys():
        # Extra keyword arguments of Series.apply are forwarded to the function.
        predicted = models[idioma]['test'].apply(perplexity_argmax, use_model=model_key)
        hits = (predicted == idioma).sum()
        print(f"{idioma} accuracy: {round(hits/predicted.shape[0],2)}")

Using MLE
ingles accuracy: 0.98
portugues accuracy: 0.8
espanhol accuracy: 0.86
Using Laplace
ingles accuracy: 1.0
portugues accuracy: 1.0
espanhol accuracy: 0.97


In [41]:
# Show one misclassified test question together with the label predicted for that row.
# `idioma` and `predicted` are the values left over from the last iteration of the
# evaluation loop. Sampling once and reusing the sampled index keeps the text and the
# label paired; two independent samples could come from different rows.
wrong_prediction = predicted[predicted != idioma].sample()
models[idioma]['test'].loc[wrong_prediction.index[0]], wrong_prediction.iloc[0]

('tengo una serie de en mi vista cada muestra una unidad de medida y en caso de querer editarla se reemplaza via javascript por un via boton nota tanto las labels como los input son elementos con classccc_unidad para iterarlos posteriormente el resultante simplificado quedaria algo asi como véis en el snippet intento coger el valor de la label si no lo tiene busco el valor del input pero no esta funcionando correctamente cómo puedo diferenciar que tipo de elemento es la variable para así coger el atributo correspondiente a cada tipo',
 'ingles')