# Word models

This code uses a special kind of neural network called an "N-gram-based neural language model" to do its work. It uses a tool called PyTorch to do this. It has custom or pre-trained embeddings.
Classical training with early stopping.
This tool can be used to compare how similar two words are and how precise they are.

In [None]:
#Tools
import os
import time
import shutil
import random
from typing import Tuple #Tiping es una libreria que permite definir tipos de variables
from argparse import Namespace #Permite definir argumentos de entrada
import matplotlib.pyplot as plt

#Prepocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
import numpy as np
from nltk import FreqDist
import pandas as pd
from nltk.tokenize import TweetTokenizer
import math


#Pythorch
from torch.utils.data import DataLoader,TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

#scikit-learn
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_recall_fscore_support




from sklearn.feature_selection import SelectKBest,chi2 #esta se tiene que cambiar por una a mano
from sklearn.manifold import TSNE



#Importar librerias de sklearn
from sklearn import svm 
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 


In [3]:
seed=1111
random.seed(seed) #phyton seed
np.random.seed(seed) #numpy seed
torch.manual_seed(seed) #torch seed
torch.backends.cudnn.deterministic = False #Se refiere a la implementacion de cudnn, que es la libreria de pytorch que permite hacer calculos en la GPU, False para que no sea determinista

In [4]:
X_train=pd.read_csv('mex20_train.txt',sep='\r\n',engine='python',header=None).loc[:,0].values.tolist()
X_val=pd.read_csv('mex20_val.txt',sep='\r\n',engine='python',header=None).loc[:,0].values.tolist()


In [5]:
args=Namespace()
args.N=4

In [6]:
class NgramData():
    def __init__(self, N:int,vocab_max:int=5000,tokenizer=None, embeddings_model=None):
        self.tokenizer=tokenizer if tokenizer else self.default_tokenizer()
        self.punct=set(['.','?','!',',',';',':','^','*','+','/','\\','"','´','`','¨','~','{','}','[',']','(',')','_','-','&','%','$','#','@','¿','?','¡','!','<','>','=','|','°','¬','¦','ª','º','©','®','«','»','“','”','‘','’','…','–','—','•','·','»','«','…','‘','’','“','”','–','—','•','·','¡','¿','<url>','@usuario','...'])
        self.N=N
        self.vocab_max=vocab_max
        self.UNK="<unk>"
        self.SOS='<s>'
        self.EOS='</s>'
        self.embeddings_model=embeddings_model
        
    def get_vocab_size(self) -> int:
        return len(self.vocab)
        
    def default_tokenizer(self,doc:str) -> list: #-> es para definir el tipo de variable que regresa
        return doc.split(" ")
    
    
    def remove_word(self,word:str) -> bool:
        word=word.lower()
        is_punct=True if word in self.punct else False
        is_digit=word.isnumeric()
        return is_punct or is_digit
    
    def get_vocabulary(self,corpus:list) -> set:
        freq_dist=FreqDist([w.lower() for sentence in corpus for w in self.tokenizer(sentence) if not  self.remove_word(w)]) #genera un diccionario de palabras y su frecuencia
        sorted_words=self.sortFreqDict(freq_dist)[:self.vocab_max-3] #selecciona las palabras mas frecuentes del diccionario creado, el -3 es para dejar espacio para los tokens especiales
        return set(sorted_words)
    
    def sortFreqDict(self,freq_dist) -> list:
        freq_dict=dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)
    
    
 
        
    def fit(self,corpus:list) -> None:
        self.vocab=self.get_vocabulary(corpus)#tokens especiales
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        
        #mapeo id a palabra y viceversa
        self.w2id={}
        self.id2w={}
        #embeddings preentrenados
        #vocab = sorted(self.vocab)  #
        if self.embeddings_model is not None:
            self.embeddings_matrix=np.empty((len(self.vocab),self.embeddings_model.vector_size))
            
        id=0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_=word.lower()
                if word_ in self.vocab and not word_ in self.w2id:
                    self.w2id[word_]=id
                    self.id2w[id]=word_
                    #solo crear la sub matriz de los que importan
                    
                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embeddings_matrix[id]=self.embeddings_model[word_]
                        else:
                            self.embeddings_matrix[id]=np.random.rand(self.embeddings_model.vector_size)
                    id+=1
        #siempre agregar los tokens especiales  
        self.w2id.update({self.UNK:id,self.SOS:id+1,self.EOS:id+2})
        self.id2w.update({id:self.UNK,id+1:self.SOS,id+2:self.EOS})
        
        
        
        
    
    def transform(self,corpus:list)-> Tuple [np.array,np.array]:
        X_ngrams=[]
        y=[]
        for doc in corpus:
            doc_ngram=self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                #print("words_window:", words_window)
                words_window_ids = [self.w2id[w] if w in self.w2id else self.w2id["<unk>"] for w in words_window]

                #words_window_ids=[self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
        return np.array(X_ngrams),np.array(y)
    
    

    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]  # Corrección aquí
        return list(ngrams(doc_tokens, self.N))
    
    def replace_unk(self,doc_tokens:list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab: #si el token no essta en minuscula entonces reemplazar por UNK
                doc_tokens[i]=self.UNK
        return doc_tokens #regresa tokens ya procesados
    
        

In [7]:
tk=TweetTokenizer()
ngram_data=NgramData(args.N,5000,tk.tokenize)
ngram_data.fit(X_train)

In [8]:
X_ngram_train,y_ngram_train=ngram_data.transform(X_train)
X_ngram_val,y_ngram_val=ngram_data.transform(X_val)

In [9]:
print(f'Training observations (X): {X_ngram_train.shape},y: {y_ngram_train.shape}')
print(f'Validation observations (X): {X_ngram_val.shape},y: {y_ngram_val.shape}')

Training observations (X): (102751, 3),y: (102751,)
Validation observations (X): (11558, 3),y: (11558,)


In [10]:
#Set batch size in args
args.batch_size=64
#Num workers 
args.num_workers=0

#Train
train_dataset=TensorDataset(torch.tensor(X_ngram_train,dtype=torch.int64),torch.tensor(y_ngram_train,dtype=torch.int64))
train_loader=DataLoader(dataset=train_dataset,batch_size=args.batch_size,shuffle=True,num_workers=args.num_workers)

#Val
val_dataset=TensorDataset(torch.tensor(X_ngram_val,dtype=torch.int64),torch.tensor(y_ngram_val,dtype=torch.int64))
val_loader=DataLoader(dataset=val_dataset,batch_size=args.batch_size,shuffle=False,num_workers=args.num_workers)


In [11]:
batch=next(iter(train_loader))

In [12]:
#Vocab size
args.vocab_size=ngram_data.get_vocab_size()

#dimension of word embeddings
args.d=50
args.d_h=100
args.dropout=0.1

In [14]:
class NeuralLM(nn.Module):
    def __init__(self,args, pretrained_embeddings=None):
        super(NeuralLM,self).__init__()
        self.window_size=args.N-1
        self.embedding_dim=args.d
        
        #self.emb = nn.Embedding.from_pretrained(
        #    torch.tensor(pretrained_embeddings, dtype=torch.float32),
        #    freeze=False  # Debe ser False para que los embeddings se actualicen
        #    )

        
        # Si hay embeddings preentrenados, los usamos
        if pretrained_embeddings is not None:
            self.emb = nn.Embedding.from_pretrained(
            torch.tensor(pretrained_embeddings, dtype=torch.float32),
            freeze=False  # Debe ser False para que los embeddings se actualicen
            )
        #    self.emb = nn.Embedding.from_pretrained(torch.tensor(pretrained_embeddings, dtype=torch.float32), freeze=False)
        else:
            self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1=nn.Linear(args.d*(args.N-1),args.d_h)
        #self.fc1=nn.Linear(args.d * self.window_size, args.d_h)
        self.drop1=nn.Dropout(p=args.dropout)
        self.fc2=nn.Linear(args.d_h, args.vocab_size, bias=False)
        
    def forward(self,x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_dim)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)


In [15]:
#normalizar la red para obtener la probabilidad 1
def get_preds(raw_logits):
    probs=F.softmax(raw_logits.detach(),dim=1) #solamente a los valores de la matriz del tensor
    y_pred=torch.argmax(probs,dim=1).cpu().numpy() #argmax para indice
    
    return y_pred

In [16]:
def model_eval(data,model,gpu=False):
    with torch.no_grad():
        preds,tgts=[],[]
        for window_words,labels in data:
            if gpu:
                window_words=window_words.cuda()
            outputs=model(window_words)
            
            #Get predictions
            y_pred=get_preds(outputs)
            tgt=labels.numpy()
            tgt_list = tgt.tolist()  # Convertir a lista de Python
            tgts.append(tgt_list)  # Ahora sí puedes hacer append()

            #tgt.append(tgt)
            preds.append(y_pred)
    tgts=[e for l in tgts for e in l]
    preds=[e for l in preds for e in l]
    
    return accuracy_score(tgts,preds)

In [17]:
def save_checkpoint(state,is_best,checkpoint_path,filename='checkpoint.pt'):
    filename=os.path.join(checkpoint_path,filename)
    torch.save(state,filename)
    if is_best:
        shutil.copyfile(filename,os.path.join(checkpoint_path,'model_best.pt')) #shutil es una libreria que permite copiar archivos

In [None]:

file_path = "word2vec_col.txt"
word_to_idx = {}
embeddings_list = []

with open(file_path, "r", encoding="utf-8") as f:
    first_line = f.readline().strip().split()
    vocab_size = int(first_line[0])  
    emb_dim = int(first_line[1])  

    for line in f:
        parts = line.strip().split()
        word = parts[0]  
        vector = np.array(parts[1:], dtype=np.float32) 
        word_to_idx[word] = len(word_to_idx)  
        embeddings_list.append(vector)

embedding_matrix_pre = np.vstack(embeddings_list)

vocab_size, emb_dim, embedding_matrix_pre.shape

pretrained_embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix_pre, dtype=torch.float32), freeze=False)



In [19]:
#Model hyperparameters
args.vocab_size=ngram_data.get_vocab_size()
args.d=100 #dimension of word embeddings
args.d_h=200 #dimension of hidden layer
args.dropout=0.1

#Training hyperparameters
args.lr=2.3e-1
args.num_epochs=100
args.patience=20 #si despues de 20 epocas no mejora el modelo entonces se detiene

#scheduler hyperparameters
args.lr_patience=10
args.lr_factor=0.5

#Saving directory
args.save_dir='model'
os.makedirs(args.save_dir,exist_ok=True)

----------------------------------------------
## 1.1 Embeddings pre entrenados

In [None]:
# Con embeddings preentrenados
model1 = NeuralLM(args, pretrained_embeddings=embedding_matrix_pre)


In [None]:
best_metric = 0
n_no_improve = 0  
metric_history = []
train_metric_history = []

args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model1 = model1.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model1.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=args.lr_factor, patience=args.lr_patience, verbose=True)

start_time = time.time()


In [24]:
for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model1.train()
    
    for window_words, labels in train_loader:
        if args.use_gpu:
            window_words, labels = window_words.cuda(), labels.cuda()
            
        outputs = model1(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))
    
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    model1.eval()
    tuning_metric = model_eval(val_loader, model1, gpu=args.use_gpu)
    metric_history.append(tuning_metric)  

    scheduler.step(tuning_metric)
    
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
    
    save_checkpoint({
        "epoch": epoch + 1,
        "state_dict": model1.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "best_metric": best_metric,
    }, is_improvement, args.save_dir)
    
    if n_no_improve >= args.patience:
        print("No improvement. Breaking training loop.")
        break
    
    print(f'Epoch [{epoch+1}/{args.num_epochs}], Loss: {np.mean(loss_epoch):.4f}, '
          f'Val Acc: {tuning_metric:.4f}, Train Acc: {mean_epoch_metric:.4f}, '
          f'Time: {time.time() - epoch_start_time:.2f}s')

print("--- Training completed in %s seconds ---" % (time.time() - start_time))


Epoch [1/100], Loss: 5.7847, Val Acc: 0.1168, Train Acc: 0.1453, Time: 54.40s
Epoch [2/100], Loss: 5.3253, Val Acc: 0.2020, Train Acc: 0.1641, Time: 55.19s
Epoch [3/100], Loss: 5.1008, Val Acc: 0.2041, Train Acc: 0.1700, Time: 52.82s
Epoch [4/100], Loss: 4.9338, Val Acc: 0.1473, Train Acc: 0.1721, Time: 52.32s
Epoch [5/100], Loss: 4.7812, Val Acc: 0.1726, Train Acc: 0.1757, Time: 52.22s
Epoch [6/100], Loss: 4.6538, Val Acc: 0.2174, Train Acc: 0.1773, Time: 57.83s
Epoch [7/100], Loss: 4.5293, Val Acc: 0.1844, Train Acc: 0.1813, Time: 58.72s
Epoch [8/100], Loss: 4.4258, Val Acc: 0.1492, Train Acc: 0.1830, Time: 64.94s
Epoch [9/100], Loss: 4.3332, Val Acc: 0.1602, Train Acc: 0.1880, Time: 63.49s
Epoch [10/100], Loss: 4.2494, Val Acc: 0.1760, Train Acc: 0.1918, Time: 58.34s
Epoch [11/100], Loss: 4.1723, Val Acc: 0.1559, Train Acc: 0.1954, Time: 55.12s
Epoch [12/100], Loss: 4.1005, Val Acc: 0.1602, Train Acc: 0.2007, Time: 56.77s
Epoch [13/100], Loss: 3.6360, Val Acc: 0.1874, Train Acc: 0.2

In [27]:

best_model_pre = NeuralLM(args, pretrained_embeddings=embedding_matrix_pre)
checkpoint = torch.load('model/model_best.pt', map_location=torch.device('cpu'))  
best_model_pre.load_state_dict(checkpoint['state_dict'])
best_model_pre.eval()


  checkpoint = torch.load('model/model_best.pt', map_location=torch.device('cpu'))  # Cargar en CPU por defecto


NeuralLM(
  (emb): Embedding(973265, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
)

In [34]:
def print_closest_words(embedding, ngram_data, word, n):
    if word not in ngram_data.w2id:
        print(f"La palabra '{word}' no está en el vocabulario.")
        return
    
    word_id = torch.LongTensor([ngram_data.w2id[word]])
    word_embed = embedding(word_id)
    dists = torch.norm(embedding.weight - word_embed, dim=1).detach()
    lst = sorted(enumerate(dists.numpy()), key=lambda x: x[1])

    print(f"{'-'*25} Palabras más similares a '{word}' {'-'*25}")
    for idx, difference in lst[1:n+1]:
        word_closest = ngram_data.id2w.get(idx, "<unk>")  # Usa "<unk>" si no encuentra la palabra
        print(word_closest, difference)


In [68]:
word = 'vez'  

if word in ngram_data.w2id:
    print(f"La palabra '{word}' está en el vocabulario.")
    print_closest_words(best_model_pre.emb, ngram_data, word, 10)
else:
    print(f"La palabra '{word}' NO está en el vocabulario.")

La palabra 'vez' está en el vocabulario.
------------------------- Palabras más similares a 'vez' -------------------------
😪 18.096907
dejan 21.678274
<unk> 23.59277
<unk> 24.248632
tuyo 24.355232
agresivo 24.852814
votando 25.206497
irán 25.450642
<unk> 26.123274
<unk> 26.426844


In [36]:
def parse_text(text,tokenizer):
    all_tokens=[w.lower() if w in ngram_data.w2id else '<unk>' for w in tokenizer.tokenize(text)] #verificar que este en el diccionario
    tokens_ids=[ngram_data.w2id[word.lower()] for word in all_tokens]
    return all_tokens,tokens_ids

In [37]:
def sample_next_word(logits,temperature=1.0):
    
    logits=np.array(logits).astype('float64')
    
    preds=logits/temperature
    exp_preds=np.exp(preds)
    preds=exp_preds/np.sum(exp_preds)
    
    probas=np.random.multinomial(1,preds)
    return np.argmax(probas)
    

In [None]:
def predict_next_token(model,token_ids):
    words_ids_tensor=torch.LongTensor(token_ids).unsqueeze(0)
    y_raw_pred=model(words_ids_tensor).squeeze(0).detach().numpy()
    
    
    y_pred=sample_next_word(y_raw_pred,1.0)
    return y_pred

In [39]:
def generate_sentence(model,initial_text,tokenizer): #entrada de 3 tokens
    all_tokens,window_word_ids=parse_text(initial_text,tokenizer)
    for i in range(100):
        y_pred=predict_next_token(model,window_word_ids) #numero
        next_word=ngram_data.id2w[y_pred]
        all_tokens.append(next_word)
        
        if next_word == '<\s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
    return " ".join(all_tokens)

In [108]:
initial_tokens= " Me importa tu "

print("-"*30)
print("Secuencia generada")
print("-"*30)
print(generate_sentence(best_model_pre,initial_tokens,tk))

------------------------------
Secuencia generada
------------------------------
me importa tu dentro cobijas decir aaaaaah perdonar app responsabilidad llevamos vídeos bailar 🎵 pronta io sabemos sacarle hacerle europa ruco reputisima técnico ronchas bastante pájaros limpia trabaja victor antojos capaz serían quedé supongo zaragoza 🤦🏾‍♂ máquina trastornado quedas instinto esposo samudio da iletradas sacarse tiempo 👅 ine ideología mostraste llegue putazos banca trabajan lamemelapanucha chilenos spotify mia tarjetas comido dignidad policías #una hahaha fachas curioso solo reputa tolerar baja rodillas jajajajajajajajajaja #ruggeropasquarelli izquierdos bebe tecates tonta word cambiando fascina modelos cejas trump joto chamba saber osorio circulan pega contrario love iba hola 👍 valí pelar ni ardida situación lee 👌🏼 pasará ¯


In [94]:
def log_likehood(model,text,ngram_model):
    X,y=ngram_data.transform([text])
    X,y=X[2:],y[2:]
    X=torch.LongTensor(X).unsqueeze(0)
    
    logits=model(X).detach()
    probs=F.softmax(logits,dim=1).numpy()
    
    return (np.sum([np.log(probs[i][w]) for i,w in enumerate(y)]))
    

In [59]:
from itertools import permutations
from random import shuffle

word_list="estamos en la clase de lenguaje natural".split(' ')
perms=[' '.join(perm) for perm in permutations(word_list)]
print("-" * 30)
print("Las más probables")

for p,t in sorted([(log_likehood(best_model_pre,text,ngram_data),text) for text in perms], reverse=True)[:5]:
    print(p,t)
    
print("-" * 30)
print("Las menos probables")

for p,t in sorted([(log_likehood(best_model_pre,text,ngram_data),text) for text in perms], reverse=True)[-5:]:
    print(p,t)

------------------------------
Las más probables
-19.440208 natural estamos en la clase de lenguaje
-20.465725 estamos natural en la clase de lenguaje
-23.164532 estamos natural de clase en la lenguaje
-24.567703 natural estamos en la de clase lenguaje
-24.582191 natural estamos de clase en la lenguaje
------------------------------
Las menos probables
-83.05227 en de lenguaje clase estamos natural la
-84.606544 lenguaje en de clase estamos natural la
-85.820175 lenguaje de natural estamos clase la en
-86.08365 de lenguaje estamos natural clase la en
-89.090355 de lenguaje clase estamos natural la en


In [95]:
sentences = [
    "coca cola es",
    "me gusta el",
    "me trata mejor",
    "quiero unos tacos",
    "cafe con leche"
]

likelihoods = {sentence: log_likehood(best_model_pre, sentence, ngram_data) for sentence in sentences}
likelihoods

{'coca cola es': -19.389812,
 'me gusta el': -14.146886,
 'me trata mejor': -18.892828,
 'quiero unos tacos': -12.596671,
 'cafe con leche': -15.369717}

In [99]:

def perplexity(model, data_loader, ngram_data, use_gpu=False):
    total_log_likelihood = 0
    total_word_count = 0

    with torch.no_grad():
        for window_words, labels in data_loader:
            if use_gpu:
                window_words = window_words.cuda()
                labels = labels.cuda()
            
            outputs = model(window_words)
            log_probs = F.log_softmax(outputs, dim=1)

            batch_log_likelihood = log_probs[range(labels.shape[0]), labels].sum().item()
            total_log_likelihood += batch_log_likelihood
            total_word_count += labels.shape[0]

    avg_log_likelihood = total_log_likelihood / total_word_count
    perplexity = math.exp(-avg_log_likelihood)
    
    return perplexity

perplexity_pretrained = perplexity(best_model_pre, val_loader, ngram_data, use_gpu=args.use_gpu)
print("Modelo con embeddings preentrenados", perplexity_pretrained)

Modelo con embeddings preentrenados 256.64211944881106


---------------
## 1.2 Modelo sin embeddings pre entrenados

In [103]:
model_no_pre = NeuralLM(args)  # Modelo sin embeddings preentrenados
torch.save(model_no_pre.state_dict(), 'model/model_no_embeddings.pt')
best_metric = 0
n_no_improve = 0  
metric_history = []
train_metric_history = []

args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model_no_pre = model_no_pre.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_no_pre.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=args.lr_factor, patience=args.lr_patience, verbose=True)

start_time = time.time()
for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model_no_pre.train()
    
    for window_words, labels in train_loader:
        if args.use_gpu:
            window_words, labels = window_words.cuda(), labels.cuda()
            
        outputs = model_no_pre(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))
    
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    model_no_pre.eval()
    tuning_metric = model_eval(val_loader, model_no_pre, gpu=args.use_gpu)
    metric_history.append(tuning_metric)  

    scheduler.step(tuning_metric)
    
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
    
    save_checkpoint({
        "epoch": epoch + 1,
        "state_dict": model_no_pre.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "best_metric": best_metric,
    }, is_improvement, args.save_dir)
    
    if n_no_improve >= args.patience:
        print("No improvement. Breaking training loop.")
        break
    
    print(f'Epoch [{epoch+1}/{args.num_epochs}], Loss: {np.mean(loss_epoch):.4f}, '
          f'Val Acc: {tuning_metric:.4f}, Train Acc: {mean_epoch_metric:.4f}, '
          f'Time: {time.time() - epoch_start_time:.2f}s')

print("--- Training completed in %s seconds ---" % (time.time() - start_time))




Epoch [1/100], Loss: 5.4366, Val Acc: 0.2342, Train Acc: 0.1872, Time: 5.49s
Epoch [2/100], Loss: 5.0023, Val Acc: 0.1765, Train Acc: 0.1976, Time: 4.85s
Epoch [3/100], Loss: 4.7895, Val Acc: 0.2211, Train Acc: 0.2019, Time: 4.68s
Epoch [4/100], Loss: 4.6274, Val Acc: 0.2319, Train Acc: 0.2065, Time: 4.59s
Epoch [5/100], Loss: 4.4834, Val Acc: 0.2229, Train Acc: 0.2095, Time: 4.63s
Epoch [6/100], Loss: 4.3494, Val Acc: 0.2256, Train Acc: 0.2123, Time: 4.51s
Epoch [7/100], Loss: 4.2340, Val Acc: 0.2099, Train Acc: 0.2151, Time: 4.59s
Epoch [8/100], Loss: 4.1221, Val Acc: 0.2272, Train Acc: 0.2165, Time: 4.51s
Epoch [9/100], Loss: 4.0126, Val Acc: 0.1589, Train Acc: 0.2195, Time: 4.56s
Epoch [10/100], Loss: 3.9101, Val Acc: 0.1589, Train Acc: 0.2237, Time: 4.54s
Epoch [11/100], Loss: 3.8195, Val Acc: 0.1907, Train Acc: 0.2265, Time: 4.69s
Epoch [12/100], Loss: 3.7332, Val Acc: 0.1761, Train Acc: 0.2310, Time: 5.77s
Epoch [13/100], Loss: 3.6453, Val Acc: 0.2293, Train Acc: 0.2408, Time: 6

In [107]:
best_model_no_pre = NeuralLM(args)


#