In [1]:
# prueba del transformer
import numpy as np
import torch  # libreria principal de python
import torch.nn as nn  # modulo para las redes neuronales
import torch.optim as optim  # modulo para algoritmos de optimizacion en redes neuronales
import torch.utils.data as data  # modulo para tratar con los datasets
import math  # operaciones matematicas
import copy  # para copiar objetos y estructuras

import pandas as pd
from Tokenizador.Tokenizador import TokenizadorBatch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = pd.read_csv("dataset_train/merge_small.csv", sep=",", encoding="utf-8")
dataset.head()

Unnamed: 0,idioma,traduccion
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


Al tratarse de un dataset donde le voy a pasar tanto ingles como español, voy a crearme dos datasets, uno para cada idioma. Luego los unire

In [3]:
dataset.tail()

Unnamed: 0,idioma,traduccion
1995,Lo dejó.,He gave up.
1996,Cedió.,He gave up.
1997,Tiró la toalla.,He gave up.
1998,Descolgó.,He hung up.
1999,Él cogió el teléfono.,He hung up.


In [4]:
#dataset_reducido = dataset.head(1500)
#dataset_reducido

In [5]:
# definimos objeto tokenizador (secuencia maxima de 128 tokens y batch de 1024 elementos)
tokenizador = TokenizadorBatch(max_length=128, batch_size=1024)
# le pasamos el dataset
dataset_tokenizado = tokenizador.tokenizar_dataframe(dataset)

In [6]:
dataset.shape

(2000, 4)

In [7]:
dataset.info

<bound method DataFrame.info of                      idioma   traduccion  \
0                       Go.          Ve.   
1                       Go.        Vete.   
2                       Go.        Vaya.   
3                       Go.      Váyase.   
4                       Hi.        Hola.   
...                     ...          ...   
1995               Lo dejó.  He gave up.   
1996                 Cedió.  He gave up.   
1997        Tiró la toalla.  He gave up.   
1998              Descolgó.  He hung up.   
1999  Él cogió el teléfono.  He hung up.   

                                        entrada_encoder  \
0     [[tensor(2016, device='mps:0'), tensor(5, devi...   
1     [[tensor(2016, device='mps:0'), tensor(5, devi...   
2     [[tensor(2016, device='mps:0'), tensor(5, devi...   
3     [[tensor(2016, device='mps:0'), tensor(5, devi...   
4     [[tensor(2673, device='mps:0'), tensor(5, devi...   
...                                                 ...   
1995  [[tensor(2091, devic

In [8]:
#dataset_reducido.tail(20)
dataset.head()

Unnamed: 0,idioma,traduccion,entrada_encoder,entrada_decoder
0,Go.,Ve.,"[[tensor(2016, device='mps:0'), tensor(5, devi...","[[tensor(250004, device='mps:0'), tensor(2609,..."
1,Go.,Vete.,"[[tensor(2016, device='mps:0'), tensor(5, devi...","[[tensor(250004, device='mps:0'), tensor(2609,..."
2,Go.,Vaya.,"[[tensor(2016, device='mps:0'), tensor(5, devi...","[[tensor(250004, device='mps:0'), tensor(3453,..."
3,Go.,Váyase.,"[[tensor(2016, device='mps:0'), tensor(5, devi...","[[tensor(250004, device='mps:0'), tensor(43287..."
4,Hi.,Hola.,"[[tensor(2673, device='mps:0'), tensor(5, devi...","[[tensor(250004, device='mps:0'), tensor(47958..."


In [9]:
# Device
device = torch.device("cpu")

In [10]:
# Hyperparámetros
batch_size = 64           # batch más pequeño, más rápido en CPU
num_epochs = 50
max_seq_length = 128      # truncamos secuencias largas
src_vocab_size = 250006
tgt_vocab_size = 250006
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
dropout = 0.1

In [11]:
# Inicializar modelo, loss y optimizador
from TFM.Transformer_Bloque.Transformer import Transformer

transformer = Transformer( src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

In [12]:
# Función para convertir a tensores con padding dinámico
# padding dinámico -> calcula la secuencia máxima del batch, no del texto entero (padding estático)
def preparar_batch(batch_encoder, batch_decoder):
    # Encontramos la longitud máxima de este batch
    max_len_enc = max(len(seq) for seq in batch_encoder)
    max_len_dec = max(len(seq) for seq in batch_decoder)

    # Padding dinámico (rellenamos con 0s)
    encoder_padded = [seq + [0]*(max_len_enc - len(seq)) for seq in batch_encoder]
    decoder_padded = [seq + [0]*(max_len_dec - len(seq)) for seq in batch_decoder]

    # Convertir a tensores de pytorch
    entrada_encoder = torch.tensor(encoder_padded, dtype=torch.long, device=device)
    entrada_decoder = torch.tensor(decoder_padded, dtype=torch.long, device=device)
    return entrada_encoder, entrada_decoder


In [13]:
# Crear batches desde dataset
def generar_batches(encoder_list, decoder_list, batch_size):
    for i in range(0, len(encoder_list), batch_size):
        batch_enc = encoder_list[i:i+batch_size]
        batch_dec = decoder_list[i:i+batch_size]
        yield preparar_batch(batch_enc, batch_dec)


In [14]:
# Preparar listas de IDs
encoder_ids = [fila.squeeze(0).tolist()[:max_seq_length] for fila in dataset["entrada_encoder"]]
decoder_ids = [fila.squeeze(0).tolist()[:max_seq_length] for fila in dataset["entrada_decoder"]]


In [15]:
encoder_ids

[[2016, 5],
 [2016, 5],
 [2016, 5],
 [2016, 5],
 [2673, 5],
 [28398, 38],
 [28398, 5],
 [40469, 32],
 [31143, 38],
 [31143, 38],
 [31143, 38],
 [39527, 38],
 [39527, 38],
 [39527, 38],
 [150508, 38],
 [150508, 5],
 [42284, 38],
 [42284, 38],
 [42284, 38],
 [175254, 38],
 [175254, 5],
 [2016, 98, 5],
 [2016, 98, 5],
 [35378, 38],
 [87, 13028, 5],
 [87, 13028, 5],
 [87, 9790, 5],
 [87, 23742, 38],
 [10160, 110, 38],
 [169168, 5],
 [172815, 5],
 [188515, 38],
 [188515, 38],
 [20779, 1257, 5],
 [2016, 5036, 5],
 [53983, 442, 38],
 [53983, 442, 32],
 [53983, 442, 32],
 [1529, 13028, 5],
 [101082, 23, 5],
 [99926, 163, 5],
 [87, 77736, 5],
 [87, 3714, 5],
 [87, 25737, 5],
 [87, 110942, 5],
 [87, 72856, 5],
 [87, 569, 18, 5],
 [87, 569, 18, 5],
 [87, 4488, 5],
 [87, 25, 39, 6529],
 [87, 25, 39, 1257, 5],
 [159366, 5],
 [159366, 5],
 [159366, 5],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438, 3917, 38],
 [438

In [16]:
decoder_ids

[[250004, 2609, 5, 2],
 [250004, 2609, 67, 5, 2],
 [250004, 3453, 395, 5, 2],
 [250004, 43287, 395, 184, 5, 2],
 [250004, 47958, 5, 2],
 [250004, 14701, 50886, 107, 38, 2],
 [250004, 46348, 71, 5, 2],
 [250004, 3936, 219349, 32, 2],
 [250004, 14701, 80163, 3081, 38, 2],
 [250004, 14701, 4153, 120173, 31, 38, 2],
 [250004, 14701, 44456, 6276, 71, 38, 2],
 [250004, 14701, 284, 53, 6874, 38, 2],
 [250004, 14701, 8912, 67070, 31, 38, 14701, 284, 6077, 46144, 38, 2],
 [250004, 14701, 284, 6077, 46144, 38, 2],
 [250004, 14701, 294, 9180, 38, 2],
 [250004, 6565, 67, 5, 2],
 [250004, 14701, 55292, 71, 38, 2],
 [250004, 14701, 55292, 38, 2],
 [250004, 14701, 28636, 13, 38, 2],
 [250004, 14701, 63723, 1704, 38, 2],
 [250004, 73166, 33, 5, 2],
 [250004, 137425, 4096, 11, 5, 2],
 [250004, 137425, 4096, 13, 5, 2],
 [250004, 47958, 5, 2],
 [250004, 5631, 10605, 5, 2],
 [250004, 5631, 7239, 5, 2],
 [250004, 2091, 111897, 5, 2],
 [250004, 14701, 13025, 1756, 1138, 38, 2],
 [250004, 14701, 56886, 4, 11

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

# Entrenamiento
transformer.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0

    for src_batch, tgt_batch in generar_batches(encoder_ids, decoder_ids, batch_size):
        optimizer.zero_grad()
        output = transformer(src_batch, tgt_batch[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size),
                         tgt_batch[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/num_batches:.4f}")

Epoch 1/50, Loss: 8.4467
Epoch 2/50, Loss: 6.6588
Epoch 3/50, Loss: 5.5728
Epoch 4/50, Loss: 4.6921
Epoch 5/50, Loss: 4.1585
Epoch 6/50, Loss: 3.8454
Epoch 7/50, Loss: 3.5449
Epoch 8/50, Loss: 3.2240
Epoch 9/50, Loss: 2.9471
Epoch 10/50, Loss: 2.6748
Epoch 11/50, Loss: 2.4345
Epoch 12/50, Loss: 2.2225
Epoch 13/50, Loss: 2.0006
Epoch 14/50, Loss: 1.7897
Epoch 15/50, Loss: 1.6083
Epoch 16/50, Loss: 1.4410
Epoch 17/50, Loss: 1.2890
Epoch 18/50, Loss: 1.1688
Epoch 19/50, Loss: 1.0500
Epoch 20/50, Loss: 0.9286
Epoch 21/50, Loss: 0.8199
Epoch 22/50, Loss: 0.7226
Epoch 23/50, Loss: 0.6439
Epoch 24/50, Loss: 0.5781
Epoch 25/50, Loss: 0.5275
Epoch 26/50, Loss: 0.4618
Epoch 27/50, Loss: 0.3966
Epoch 28/50, Loss: 0.3470
Epoch 29/50, Loss: 0.3159
Epoch 30/50, Loss: 0.2830
Epoch 31/50, Loss: 0.2668
Epoch 32/50, Loss: 0.2433
Epoch 33/50, Loss: 0.2349
Epoch 34/50, Loss: 0.2071
Epoch 35/50, Loss: 0.1995
Epoch 36/50, Loss: 0.1920
Epoch 37/50, Loss: 0.1832
Epoch 38/50, Loss: 0.1800
Epoch 39/50, Loss: 0.

In [None]:
# Guardar modelo entrenado
torch.save(transformer, "transformer_modelo_completo.pth")
print("Modelo guardado en 'transformer_modelo.pth'")

Shape entrada_encoder: torch.Size([1500, 100])
Shape entrada_decoder: torch.Size([1500, 100])
Primer encoder: tensor([2016,    5,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
Primer decoder: tensor([250004,   2016,      5,      2,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,  