# Char level models

This code uses a special kind of neural network called an "N-gram-based neural language model" to do its work. It uses a tool called PyTorch to do this. It has custom or pre-trained embeddings.
Classical training with early stopping.
This tool can be used to compare how similar two chars are and how precise they are.

In [97]:
#Tools
import os
import time
import shutil
import random
from typing import Tuple #Tiping es una libreria que permite definir tipos de variables
from argparse import Namespace #Permite definir argumentos de entrada
import matplotlib.pyplot as plt

#Prepocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
import numpy as np
from nltk import FreqDist
import pandas as pd
from nltk.tokenize import TweetTokenizer
import math


#Pythorch
from torch.utils.data import DataLoader,TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

#scikit-learn
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_recall_fscore_support




from sklearn.feature_selection import SelectKBest,chi2 #esta se tiene que cambiar por una a mano
from sklearn.manifold import TSNE



#Importar librerias de sklearn
from sklearn import svm 
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 


from IPython.display import display

In [2]:

from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk import FreqDist

In [3]:
seed=1111
random.seed(seed) #phyton seed
np.random.seed(seed) #numpy seed
torch.manual_seed(seed) #torch seed
torch.backends.cudnn.deterministic = False #Se refiere a la implementacion de cudnn, que es la libreria de pytorch que permite hacer calculos en la GPU, False para que no sea determinista

In [4]:
X_train=pd.read_csv('mex20_train.txt',sep='\r\n',engine='python',header=None).loc[:,0].values.tolist()
X_val=pd.read_csv('mex20_val.txt',sep='\r\n',engine='python',header=None).loc[:,0].values.tolist()


In [5]:
args=Namespace()
args.N=4

In [6]:
class NgramData():
    def __init__(self, N:int,vocab_max:int=5000,tokenizer=None, embeddings_model=None):
        self.tokenizer=tokenizer if tokenizer else self.default_tokenizer()
        self.punct=set(['.','?','!',',',';',':','^','*','+','/','\\','"','´','`','¨','~','{','}','[',']','(',')','_','-','&','%','$','#','@','¿','?','¡','!','<','>','=','|','°','¬','¦','ª','º','©','®','«','»','“','”','‘','’','…','–','—','•','·','»','«','…','‘','’','“','”','–','—','•','·','¡','¿','<url>','@usuario','...'])
        self.N=N
        self.vocab_max=vocab_max
        self.UNK="<unk>"
        self.SOS='<s>'
        self.EOS='</s>'
        self.embeddings_model=embeddings_model
        
    def get_vocab_size(self) -> int:
        return len(self.vocab)
        
    def default_tokenizer(self,doc:str) -> list: #-> es para definir el tipo de variable que regresa
        return doc.split(" ")
    
    
    def remove_word(self,word:str) -> bool:
        word=word.lower()
        is_punct=True if word in self.punct else False
        is_digit=word.isnumeric()
        return is_punct or is_digit
    
    def get_vocabulary(self,corpus:list) -> set:
        freq_dist=FreqDist([w.lower() for sentence in corpus for w in self.tokenizer(sentence) if not  self.remove_word(w)]) #genera un diccionario de palabras y su frecuencia
        sorted_words=self.sortFreqDict(freq_dist)[:self.vocab_max-3] #selecciona las palabras mas frecuentes del diccionario creado, el -3 es para dejar espacio para los tokens especiales
        return set(sorted_words)
    
    def sortFreqDict(self,freq_dist) -> list:
        freq_dict=dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)
    
    
 
        
    def fit(self,corpus:list) -> None:
        self.vocab=self.get_vocabulary(corpus)#tokens especiales
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        
        #mapeo id a palabra y viceversa
        self.w2id={}
        self.id2w={}
        #embeddings preentrenados
        #vocab = sorted(self.vocab)  #
        if self.embeddings_model is not None:
            self.embeddings_matrix=np.empty((len(self.vocab),self.embeddings_model.vector_size))
            
        id=0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_=word.lower()
                if word_ in self.vocab and not word_ in self.w2id:
                    self.w2id[word_]=id
                    self.id2w[id]=word_
                    #solo crear la sub matriz de los que importan
                    
                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embeddings_matrix[id]=self.embeddings_model[word_]
                        else:
                            self.embeddings_matrix[id]=np.random.rand(self.embeddings_model.vector_size)
                    id+=1
        #siempre agregar los tokens especiales  
        self.w2id.update({self.UNK:id,self.SOS:id+1,self.EOS:id+2})
        self.id2w.update({id:self.UNK,id+1:self.SOS,id+2:self.EOS})
        
        
        
        
    
    def transform(self,corpus:list)-> Tuple [np.array,np.array]:
        X_ngrams=[]
        y=[]
        for doc in corpus:
            doc_ngram=self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                #print("words_window:", words_window)
                words_window_ids = [self.w2id[w] if w in self.w2id else self.w2id["<unk>"] for w in words_window]

                #words_window_ids=[self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
        return np.array(X_ngrams),np.array(y)
    
    

    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]  # Corrección aquí
        return list(ngrams(doc_tokens, self.N))
    
    def replace_unk(self,doc_tokens:list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab: #si el token no essta en minuscula entonces reemplazar por UNK
                doc_tokens[i]=self.UNK
        return doc_tokens #regresa tokens ya procesados
    
        

In [7]:
tk=TweetTokenizer()
ngram_data=NgramData(args.N,5000,tk.tokenize)
ngram_data.fit(X_train)

In [8]:
X_ngram_train,y_ngram_train=ngram_data.transform(X_train)
X_ngram_val,y_ngram_val=ngram_data.transform(X_val)

In [9]:
print(f'Training observations (X): {X_ngram_train.shape},y: {y_ngram_train.shape}')
print(f'Validation observations (X): {X_ngram_val.shape},y: {y_ngram_val.shape}')

Training observations (X): (102751, 3),y: (102751,)
Validation observations (X): (11558, 3),y: (11558,)


In [10]:
#Set batch size in args
args.batch_size=64
#Num workers 
args.num_workers=0

#Train
train_dataset=TensorDataset(torch.tensor(X_ngram_train,dtype=torch.int64),torch.tensor(y_ngram_train,dtype=torch.int64))
train_loader=DataLoader(dataset=train_dataset,batch_size=args.batch_size,shuffle=True,num_workers=args.num_workers)

#Val
val_dataset=TensorDataset(torch.tensor(X_ngram_val,dtype=torch.int64),torch.tensor(y_ngram_val,dtype=torch.int64))
val_loader=DataLoader(dataset=val_dataset,batch_size=args.batch_size,shuffle=False,num_workers=args.num_workers)


In [11]:
batch=next(iter(train_loader))

In [12]:
#Vocab size
args.vocab_size=ngram_data.get_vocab_size()

#dimension of word embeddings
args.d=50
args.d_h=100
args.dropout=0.1

In [13]:
class NeuralLM2(nn.Module):
    def __init__(self,args):
        super(NeuralLM2,self).__init__()
        self.window_size=args.N-1
        self.embedding_dim=args.d
        
        self.emb=nn.Embedding(args.vocab_size,args.d)
        self.fc1=nn.Linear(args.d*(args.N-1),args.d_h)
        self.drop1=nn.Dropout(p=args.dropout) #apagar neuronas, en este caso bajo para evitar el sobreajuste
        self.fc2=nn.Linear(args.d_h,args.vocab_size,bias=False)
        
    def forward(self,x):
        x=self.emb(x)
        x=x.view(-1,self.window_size*self.embedding_dim)
        h=F.relu(self.fc1(x))
        #quitar algunos elemntos para que no se sobreentrene
        h=self.drop1(h)
        return self.fc2(h)
    

In [14]:
class NeuralLM(nn.Module):
    def __init__(self,args, pretrained_embeddings=None):
        super(NeuralLM,self).__init__()
        self.window_size=args.N-1
        self.embedding_dim=args.d
        
        #self.emb = nn.Embedding.from_pretrained(
        #    torch.tensor(pretrained_embeddings, dtype=torch.float32),
        #    freeze=False  # Debe ser False para que los embeddings se actualicen
        #    )

        
        # Si hay embeddings preentrenados, los usamos
        if pretrained_embeddings is not None:
            self.emb = nn.Embedding.from_pretrained(
            torch.tensor(pretrained_embeddings, dtype=torch.float32),
            freeze=False  # Debe ser False para que los embeddings se actualicen
            )
        #    self.emb = nn.Embedding.from_pretrained(torch.tensor(pretrained_embeddings, dtype=torch.float32), freeze=False)
        else:
            self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1=nn.Linear(args.d*(args.N-1),args.d_h)
        #self.fc1=nn.Linear(args.d * self.window_size, args.d_h)
        self.drop1=nn.Dropout(p=args.dropout)
        self.fc2=nn.Linear(args.d_h, args.vocab_size, bias=False)
        
    def forward(self,x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_dim)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)


In [15]:
#normalizar la red para obtener la probabilidad 1
def get_preds(raw_logits):
    probs=F.softmax(raw_logits.detach(),dim=1) #solamente a los valores de la matriz del tensor
    y_pred=torch.argmax(probs,dim=1).cpu().numpy() #argmax para indice
    
    return y_pred

In [16]:
def model_eval(data,model,gpu=False):
    with torch.no_grad():
        preds,tgts=[],[]
        for window_words,labels in data:
            if gpu:
                window_words=window_words.cuda()
            outputs=model(window_words)
            
            #Get predictions
            y_pred=get_preds(outputs)
            tgt=labels.numpy()
            tgt_list = tgt.tolist()  # Convertir a lista de Python
            tgts.append(tgt_list)  # Ahora sí puedes hacer append()

            #tgt.append(tgt)
            preds.append(y_pred)
    tgts=[e for l in tgts for e in l]
    preds=[e for l in preds for e in l]
    
    return accuracy_score(tgts,preds)

In [17]:
def save_checkpoint(state,is_best,checkpoint_path,filename='checkpoint.pt'):
    filename=os.path.join(checkpoint_path,filename)
    torch.save(state,filename)
    if is_best:
        shutil.copyfile(filename,os.path.join(checkpoint_path,'model_best.pt')) #shutil es una libreria que permite copiar archivos

In [94]:
def log_likehood(model,text,ngram_model):
    X,y=ngram_data.transform([text])
    X,y=X[2:],y[2:]
    X=torch.LongTensor(X).unsqueeze(0)
    
    logits=model(X).detach()
    probs=F.softmax(logits,dim=1).numpy()
    
    return (np.sum([np.log(probs[i][w]) for i,w in enumerate(y)]))
    

In [161]:
args = Namespace()
args.N = 6

In [163]:
tokenizer_char = lambda x: list(x)
ngram_data_char = NgramData( args.N, 5000, tokenizer_char )
ngram_data_char.fit( X_train)

In [166]:
X_ngram_train,y_ngram_train=ngram_data_char.transform(X_train)
X_ngram_val,y_ngram_val=ngram_data_char.transform(X_val)

In [None]:
args.batch_size = 64
args.num_workers = 0

train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype=torch.int64),
                              torch.tensor(y_ngram_train, dtype=torch.int64))

train_loader = DataLoader(train_dataset,
                          batch_size= args.batch_size,
                          num_workers= args.num_workers,
                          shuffle= True)

val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype=torch.int64),
                            torch.tensor(y_ngram_val, dtype=torch.int64))


val_loader = DataLoader(val_dataset,
                          batch_size= args.batch_size,
                          num_workers= args.num_workers,
                          shuffle= True)

batch = next(iter(train_loader))

In [192]:
args.vocab_size = ngram_data_char.get_vocab_size()
args.d = 100 
args.d_h = 200 
args.dropout = 0.1

# Training
args.lr = 1e-3 
args.num_epochs = 100
args.patience = 20 

# Scheduler
args.lr_patience = 10
args.lr_factor = 0.5

# Save
args.save_dir = 'char_model'
os.makedirs(args.save_dir, exist_ok=True)
args.use_gpu=torch.cuda.is_available()

In [195]:
model_char = NeuralLM( args )

In [196]:
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.SGD(model_char.parameters(),lr=args.lr)

In [197]:
start_time=time.time()
best_metric=0
metric_history=[]
train_metric_history=[]


for epoch in range(args.num_epochs):
    epoch_start_time=time.time()
    loss_epoch=[]
    training_metric=[]
    model_char.train()
    
    for window_words,labels in train_loader:
        
        if args.use_gpu:
            window_words=window_words.cuda()
            labels=labels.cuda()
            
        outputs=model_char(window_words)
        loss=criterion(outputs,labels)  
        loss_epoch.append(loss.item())
        
        y_pred=get_preds(outputs)
        tgt=labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt,y_pred))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    mean_epoch_metric=np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)
    
    model_char.eval()
    tuning_metric=model_eval(val_loader,model_char,gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)
    
    scheduler.step(tuning_metric)
    
    is_improvement=tuning_metric>best_metric
    if is_improvement:
        best_metric=tuning_metric
        n_no_improve=0
    else:
        n_no_improve+=1
    
    
    save_checkpoint({
        "epoch":epoch+1,
        "state_dict":model_char.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "best_metric": best_metric,
    },is_improvement,args.save_dir)
    
    
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop")
        break
    print('Train acc: {}'.format(mean_epoch_metric))
    print('Epochh [{}/{}], Loss: {:.4f} - Val accuracy:{:.4f} - Epoch time:{:.2f}'
          .format(epoch+1,args.num_epochs,np.mean(loss_epoch),tuning_metric,(time.time() - epoch)))
    
print("--- %s seconds ---" % (time.time() - start_time))

Train acc: 0.25377462767607817
Epochh [1/100], Loss: 3.1522 - Val accuracy:0.3211 - Epoch time:1742440606.36
Train acc: 0.32609525286999685
Epochh [2/100], Loss: 2.3803 - Val accuracy:0.3515 - Epoch time:1742440611.10
Train acc: 0.34730666304685076
Epochh [3/100], Loss: 2.2644 - Val accuracy:0.3661 - Epoch time:1742440615.51
Train acc: 0.3630930034129693
Epochh [4/100], Loss: 2.2007 - Val accuracy:0.3802 - Epoch time:1742440620.61
Train acc: 0.37441203847347193
Epochh [5/100], Loss: 2.1572 - Val accuracy:0.3922 - Epoch time:1742440625.25
Train acc: 0.38338737201365186
Epochh [6/100], Loss: 2.1243 - Val accuracy:0.3985 - Epoch time:1742440629.61
Train acc: 0.39004111076636677
Epochh [7/100], Loss: 2.0961 - Val accuracy:0.4019 - Epoch time:1742440634.43
Train acc: 0.3953073611542042
Epochh [8/100], Loss: 2.0724 - Val accuracy:0.4067 - Epoch time:1742440638.50
Train acc: 0.40004401954700586
Epochh [9/100], Loss: 2.0524 - Val accuracy:0.4118 - Epoch time:1742440642.90
Train acc: 0.40469360

In [198]:
best_char_model = NeuralLM(args)


In [None]:
def parse_text(text, tokenizer):
    all_tokens = tokenizer(text) 
    all_tokens = [w.lower() if w in ngram_data_char.w2id else '<unk>' for w in all_tokens]  
    tokens_ids = [ngram_data_char.w2id.get(w.lower(), ngram_data_char.w2id["<unk>"]) for w in all_tokens]
    return all_tokens, tokens_ids


In [226]:
test_text = "hola mundo!"
tokens, ids = parse_text(test_text, tokenizer_char)

print("Tokens:", tokens)
print("Token IDs:", ids)


Tokens: ['h', 'o', 'l', 'a', ' ', 'm', 'u', 'n', 'd', 'o', '<unk>']
Token IDs: [25, 5, 11, 2, 6, 12, 0, 14, 10, 5, 341]


In [None]:
def generate_sentence(model,initial_text,tokenizer): 
    #se cambio para que detecte los caracteres
    all_tokens,window_word_ids=parse_text(initial_text,tokenizer)
    for i in range(100):
        y_pred=predict_next_token(model,window_word_ids) 
        next_word=ngram_data.id2w[y_pred]
        all_tokens.append(next_word)
        
        if next_word == '<\s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
    return " ".join(all_tokens)

In [None]:
def clean_generated_text(text):
    import re
    return re.sub(r'[^a-zA-Z0-9áéíóúüñÁÉÍÓÚÜÑ.,!? ]', '', text)

In [None]:
def generate_sentence(model, initial_text, tokenizer, max_length=300):
    model.eval()
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)

    while len(window_word_ids) < args.N - 1:
        window_word_ids.insert(0, ngram_data_char.w2id["<unk>"])

    for _ in range(max_length):
        y_pred = predict_next_token(model, window_word_ids)  
        next_word = ngram_data_char.id2w.get(y_pred, "<unk>")  

        all_tokens.append(next_word)
        
        if next_word == '</s>':
            break
        else:
            window_word_ids.pop(0) 
            window_word_ids.append(y_pred)

    generated_text= "".join(all_tokens) 
    return clean_generated_text(generated_text)

In [216]:
input_texts = ["hola", "como", "estas"]

for inpu in input_texts:
    generated_text = generate_sentence(model_char, inpu, tokenizer_char)
    print(f"Texto generado para '{inpu}':\n{generated_text}\n")


Texto generado para 'hola':
holaé úuxtjies

Texto generado para 'como':
comocqsúübxdyxy u nfunksevvéxyqówks

Texto generado para 'estas':
estas  uizpmmgpueiáhjgeunkunkywpfqjgóanipunk



In [227]:
sentences = [
    "coca",
    "cola",
    "es",
    "amor",
    "cafe"
]
for sentence in sentences:
    likelihood = log_likehood(model_char, sentence, ngram_data_char)
    print(f"Likelihood de '{sentence}': {likelihood}")

Likelihood de 'coca': -13.142048835754395
Likelihood de 'cola': -11.808069229125977
Likelihood de 'es': -8.629572868347168
Likelihood de 'amor': -9.064281463623047
Likelihood de 'cafe': -13.684392929077148


In [None]:
def evaluate_permutations(model, word, ngram_data_char):
    char_list = list(word)  
    perms = [''.join(perm) for perm in permutations(char_list)]
    
    print("-" * 30)
    print("Las más probables")
    for p, t in sorted([(log_likehood(model, text, ngram_data_char), text) for text in perms], reverse=True)[:5]:
        print(p, t)
    
    print("-" * 30)
    print("Las menos probables")
    for p, t in sorted([(log_likehood(model, text, ngram_data_char), text) for text in perms], reverse=True)[-5:]:
        print(p, t)

In [223]:
evaluate_permutations(model_char, "madre", ngram_data_char)

------------------------------
Las más probables
-4.7533803 madre
-9.330919 mdare
-9.621315 mdera
-10.010485 darme
-10.522482 marde
------------------------------
Las menos probables
-21.793102 aedmr
-21.861979 rmdae
-22.092697 edmra
-23.233648 eadmr
-23.982748 daemr


#

In [224]:
perplexity_char = perplexity(model_char, val_loader, ngram_data_char, use_gpu=args.use_gpu)
print("Modelo a nivel caracter", perplexity_char)

Modelo a nivel caracter 5.550668298053085
