In [1]:
import pandas as pd
import re
from collections import Counter
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformer import Transformer
import torch.nn as nn

In [2]:
'''
from peft import LoraConfig, get_peft_model
# Configurar LoRA
lora_config = LoraConfig(
    r=8,  # Rango bajo para LoRA
    lora_alpha=32,  # Factor de escala
    lora_dropout=0.1,  # Dropout
    #bias="none",  # Bias para LoRA 
    target_modules= ['W_q', 'W_k', 'W_v', 'W_o']
)
'''

'''
from accelerate import Accelerator
accelerator = Accelerator()
device = accelerator.device
print(device)
'''

'''
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
    r=8,  # Rangos bajos para LoRA
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules= ['W_q', 'W_k', 'W_v', 'W_o']
)
'''

"\nfrom peft import LoraConfig, get_peft_model\nlora_config = LoraConfig(\n    r=8,  # Rangos bajos para LoRA\n    lora_alpha=16,\n    lora_dropout=0.1,\n    target_modules= ['W_q', 'W_k', 'W_v', 'W_o']\n)\n"

In [3]:
 # "encoder.layers.*.self_attn.W_q",  
 #   "encoder.layers.*.self_attn.W_k",  
 #   "encoder.layers.*.self_attn.W_v",  
 #   "encoder.layers.*.self_attn.W_o",  
 #   "decoder.layers.*.self_attn.W_q",
 #   "decoder.layers.*.self_attn.W_k",
 #   "decoder.layers.*.self_attn.W_v",
 #   "decoder.layers.*.self_attn.W_o"

In [4]:
#ignorar (es original del video)
torch.manual_seed(23)

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device("cpu")
print(device)

mps


In [5]:
MAX_SEQ_LEN = 128

In [6]:
PATH = '../datasets/idiomas-engl-span.tsv'

with open(PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()
pairs_engl_span = [line.strip().split('\t') for line in lines if '\t' in line]
#print(pairs_engl_span[:5])

engl_sentences = [pair[1] for pair in pairs_engl_span]
span_sentences = [pair[3] for pair in pairs_engl_span]
#print(engl_sentences[:10])
#print(span_sentences[:10])

In [7]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[á]+", "a", sentence)
    sentence = re.sub(r"[é]+", "e", sentence)
    sentence = re.sub(r"[í]+", "i", sentence)
    sentence = re.sub(r"[ó]+", "o", sentence)
    sentence = re.sub(r"[ú]+", "u", sentence)
    sentence = re.sub(r"[^a-z]+", " ", sentence)
    sentence = sentence.strip()
    sentence = '<sos> ' + sentence + ' <eos>'
    return sentence
#s1 = '¿Hola @ cómo estás? 123'
#print(s1)
#print(preprocess_sentence(s1))

engl_sentences = [preprocess_sentence(sentence) for sentence in engl_sentences]
span_sentences = [preprocess_sentence(sentence) for sentence in span_sentences]
#print(engl_sentences[:10])
#print(span_sentences[:10])
print(len(engl_sentences))
print(len(span_sentences))

265486
265486


In [8]:
def build_vocab(sentences):
    words = [word for sentence in sentences for word in sentence.split()]
    word_count = Counter(words)
    sorted_word_counts = sorted(word_count.items(), key=lambda x:x[1], reverse=True)
    word2idx = {word: idx for idx, (word, _) in enumerate(sorted_word_counts, 2)}
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [9]:
eng_word2idx, eng_idx2word = build_vocab(engl_sentences)
spa_word2idx, spa_idx2word = build_vocab(span_sentences)
eng_vocab_size = len(eng_word2idx)
spa_vocab_size = len(spa_word2idx)
#print(eng_vocab_size, spa_vocab_size)
#print(eng_idx2word)
#print(spa_idx2word)

In [10]:
class EngSpaDataset(Dataset):
    def __init__(self, eng_sentences, spa_sentences, eng_word2idx, spa_word2idx):
        self.eng_sentences = eng_sentences
        self.spa_sentences = spa_sentences
        self.eng_word2idx = eng_word2idx
        self.spa_word2idx = spa_word2idx
        
    def __len__(self):
        return len(self.eng_sentences)
    
    def __getitem__(self, idx):
        eng_sentence = self.eng_sentences[idx]
        spa_sentence = self.spa_sentences[idx]
        # return tokens idxs
        eng_idxs = [self.eng_word2idx.get(word, self.eng_word2idx['<unk>']) for word in eng_sentence.split()]
        spa_idxs = [self.spa_word2idx.get(word, self.spa_word2idx['<unk>']) for word in spa_sentence.split()]
        
        return torch.tensor(eng_idxs), torch.tensor(spa_idxs)

In [11]:
def collate_fn(batch):
    eng_batch, spa_batch = zip(*batch)
    eng_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in eng_batch]
    spa_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in spa_batch]
    eng_batch = torch.nn.utils.rnn.pad_sequence(eng_batch, batch_first=True, padding_value=0)
    spa_batch = torch.nn.utils.rnn.pad_sequence(spa_batch, batch_first=True, padding_value=0)
    return eng_batch, spa_batch

In [12]:
def train(model, dataloader, loss_function, optimiser, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0 
        for i, (eng_batch, spa_batch) in enumerate(dataloader):
            eng_batch = eng_batch.to(device)
            spa_batch = spa_batch.to(device)
            # Decoder preprocessing
            target_input = spa_batch[:, :-1]
            target_output = spa_batch[:, 1:].contiguous().view(-1)
            # Zero grads
            optimiser.zero_grad()
            # run model
            output = model(eng_batch, target_input)
            output = output.view(-1, output.size(-1))
            # loss\
            loss = loss_function(output, target_output)
            # gradient and update parameters
            loss.backward()
            #accelerator.backward(loss) #para accelerate
            optimiser.step()
            total_loss += loss.item()
            
        avg_loss = total_loss/len(dataloader)
        print(f'Epoch: {epoch}/{epochs}, Loss: {avg_loss:.4f}')

In [13]:
BATCH_SIZE = 32#64
dataset = EngSpaDataset(engl_sentences, span_sentences, eng_word2idx, spa_word2idx)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [14]:
model = Transformer(d_model=512, num_heads=8, d_ff=2048, num_layers=6,
                    input_vocab_size=eng_vocab_size, target_vocab_size=spa_vocab_size,
                    max_len=MAX_SEQ_LEN, dropout=0.1)

In [15]:
model

Transformer(
  (encoder_embedding): Embedding(27593, 512)
  (decoder_embedding): Embedding(46821, 512)
  (pos_embedding): PositionalEmbedding()
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderSubLayer(
        (self_attn): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=True)
          (W_k): Linear(in_features=512, out_features=512, bias=True)
          (W_v): Linear(in_features=512, out_features=512, bias=True)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (ffn): PositionFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (droupout1): Dropout(p=0.1, inplace=False)
        (droupout2): Dropout(p=0.1, inplace=False)
   

In [16]:
'''
# Imprimir los nombres de los módulos
for name, module in model.named_modules():
    print(name)
'''

'\n# Imprimir los nombres de los módulos\nfor name, module in model.named_modules():\n    print(name)\n'

In [17]:
'''
model = get_peft_model(model, lora_config) #lora
'''

'''
model, dataloader = accelerator.prepare(model, dataloader)
'''

'\nmodel, dataloader = accelerator.prepare(model, dataloader)\n'

In [18]:
model = model.to(device) #para usar accelerate sin esto segun ejemplo
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimiser = optim.Adam(model.parameters(), lr=0.0001)

In [19]:
train(model, dataloader, loss_function, optimiser, epochs = 10)

here 1
here 2
here 3
here 4
i: 0
here 5
here 6
here 1
here 2
here 3
here 4
i: 1
here 5
here 6
here 1
here 2
here 3
here 4
i: 2
here 5
here 6
here 1
here 2
here 3
here 4
i: 3
here 5
here 6
here 1
here 2
here 3
here 4
i: 4
here 5
here 6
here 1
here 2
here 3
here 4
i: 5
here 5
here 6
here 1
here 2
here 3
here 4
i: 6
here 5
here 6
here 1
here 2
here 3
here 4
i: 7
here 5
here 6
here 1
here 2
here 3
here 4
i: 8
here 5
here 6
here 1
here 2
here 3
here 4
i: 9
here 5
here 6
here 1
here 2
here 3
here 4
i: 10
here 5
here 6
here 1
here 2
here 3
here 4
i: 11
here 5
here 6
here 1
here 2
here 3
here 4
i: 12
here 5
here 6
here 1
here 2
here 3
here 4
i: 13
here 5
here 6
here 1
here 2
here 3
here 4
i: 14
here 5
here 6
here 1
here 2
here 3
here 4
i: 15
here 5
here 6
here 1
here 2
here 3
here 4
i: 16
here 5
here 6
here 1
here 2
here 3
here 4
i: 17
here 5
here 6
here 1
here 2
here 3
here 4
i: 18
here 5
here 6
here 1
here 2
here 3
here 4
i: 19
here 5
here 6
here 1
here 2
here 3
here 4
i: 20
here 5
here 6
he

RuntimeError: MPS backend out of memory (MPS allocated: 3.35 GB, other allocations: 14.25 GB, max allowed: 18.13 GB). Tried to allocate 611.55 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
len(dataloader)

4149