# Train the Model Using the Data

In [1]:
import os
from datetime import datetime

import torch
from torch.optim import Adam
from tqdm import tqdm

In [2]:
# Pick the compute device, in order of preference: CUDA, then Apple MPS, then CPU.
# When MPS cannot be used, the reason is printed before falling back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS as device")
else:
    if torch.backends.mps.is_built():
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    device = torch.device("cpu")
    print("Using CPU as device")

# Make the chosen device the default for tensors created from here on.
torch.set_default_device(device)

Using MPS as device


In [3]:
# Training hyperparameters.

# NOTE(review): d_model, n_head and max_len are not referenced by any other
# visible cell; presumably Transformer.ipynb reads them from the shared
# notebook namespace -- confirm.
d_model = 256
n_head = 2
max_len = 128

# Passed positionally to the Transformer constructor.
ffn_hidden = 64
n_layers = 6
drop_prob = 0.1

# Optimisation settings.
epochs = 300          # number of iterations of the training loop
init_lr = 1e-3        # learning rate given to Adam
weight_decay = 5e-4   # weight decay given to Adam
clip = 5              # max gradient norm given to clip_grad_norm_

In [4]:
%run data-loader.ipynb

  from .autonotebook import tqdm as notebook_tqdm


Using MPS as device


In [5]:
%run Transformer.ipynb

In [6]:
%run BLEU-metric.ipynb

In [7]:
# Special-token ids and vocabulary sizes, read from the tokenizers that are
# expected to be in the namespace after running data-loader.ipynb.
src_pad_token = kr_tokenizer.pad_token_id
trg_pad_token = en_tokenizer.pad_token_id
# NOTE(review): the separator token id is used as the target start-of-sequence
# id -- confirm this matches how data-loader.ipynb builds target sequences.
trg_sos_token = en_tokenizer.sep_token_id
enc_voc_size = kr_tokenizer.vocab_size
dec_voc_size = en_tokenizer.vocab_size

# Echo the derived values, one per line.
print(
    f'src_pad_token: {src_pad_token}',
    f'trg_pad_token: {trg_pad_token}',
    f'trg_sos_token: {trg_sos_token}',
    f'enc_voc_size: {enc_voc_size}',
    f'dec_voc_size: {dec_voc_size}',
    sep='\n',
)

src_pad_token: 0
trg_pad_token: 0
trg_sos_token: 102
enc_voc_size: 42000
dec_voc_size: 30522


In [8]:
# Build the Transformer, move it onto the selected device and put it in
# training mode.
model = Transformer(
    src_pad_token,
    trg_pad_token,
    trg_sos_token,
    enc_voc_size,
    dec_voc_size,
    ffn_hidden,
    n_layers,
    drop_prob,
    device,
).to(device)
model.train()

def count_parameters(model):
    """Count the scalar elements of the model's trainable parameters.

    Args:
        model: Object exposing ``parameters()`` whose items have
            ``requires_grad`` and ``numel()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: Sum of ``numel()`` over every parameter with ``requires_grad``
        set; 0 when there are none.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report how many trainable parameters the freshly built model has.
print(f'model parameter #: {count_parameters(model)}')


model parameter #: 31559226


In [9]:
# Optimizer: Adam over all model parameters, using the learning rate and
# weight decay from the configuration cell.
optimizer = Adam(params=model.parameters(), lr=init_lr, weight_decay=weight_decay)

# The loss is computed against *target* token ids, so the positions to ignore
# are the ones holding the target tokenizer's pad id (not the source one).
# `torch.nn` is spelled out because this notebook imports `torch` but never
# imports `nn` itself.
loss_func = torch.nn.CrossEntropyLoss(ignore_index=trg_pad_token)


In [10]:
def train_epoch(epoch_num):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_func``,
    ``train_dataloader``, ``device`` and ``clip``.

    Args:
        epoch_num: Index of the current epoch. It is not used in the
            computation; it is accepted so the caller can pass it.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``nan`` when the dataloader yields no batches.
    """
    # evaluate() switches the model to eval mode, so training mode has to be
    # re-enabled at the start of every epoch, not only once after construction.
    model.train()

    train_epoch_loss = 0.0
    num_batches = 0

    # Wrapping the dataloader itself (rather than enumerate(...)) lets tqdm
    # pick up its length for the progress bar when one is available.
    for kr_tokenized, en_tokenized in tqdm(train_dataloader):
        optimizer.zero_grad()

        kr_tokenized = kr_tokenized.to(device)
        en_tokenized = en_tokenized.to(device)

        # The model receives the target without its last position ...
        out = model(kr_tokenized, en_tokenized[:, :-1])

        # ... and is scored against the target without its first position.
        # .long() changes only the dtype and keeps the tensor on its device,
        # whereas .type(torch.LongTensor) casts to the CPU tensor type.
        targets = en_tokenized[:, 1:].contiguous().view(-1).long()

        # Flatten every leading dimension so each row is one prediction.
        out = out.contiguous().view(-1, out.shape[-1])

        loss = loss_func(out, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        train_epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; report that explicitly instead of failing
        # on an unbound loop variable.
        return float('nan')

    train_step_loss = train_epoch_loss / num_batches
    return train_step_loss
    

# evaluate the model 
def evaluate():
    """Compute the mean batch loss of ``model`` over ``test_dataloader``.

    Uses the notebook-level ``model``, ``loss_func``, ``test_dataloader`` and
    ``device``. Gradients are disabled and the model is in eval mode for the
    duration of the call; the mode it had on entry is restored afterwards.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches,
        or ``nan`` when the dataloader yields no batches.
    """
    was_training = model.training
    model.eval()

    test_epoch_loss = 0.0
    num_batches = 0

    try:
        with torch.no_grad():
            # NOTE(review): batches are read as dicts with "kr"/"en" keys here,
            # while train_epoch unpacks its batches as (kr, en) pairs --
            # confirm that test_dataloader really yields dicts.
            for batch in tqdm(test_dataloader):
                kr_tokenized = batch["kr"].to(device)
                en_tokenized = batch["en"].to(device)

                # Input is the target without its last position; the loss is
                # taken against the target without its first position.
                out = model(kr_tokenized, en_tokenized[:, :-1])

                # .long() keeps the tensor on its current device.
                targets = en_tokenized[:, 1:].contiguous().view(-1).long()
                out = out.contiguous().view(-1, out.shape[-1])

                test_epoch_loss += loss_func(out, targets).item()
                num_batches += 1

                # TODO: calculate the BLEU score
    finally:
        # Do not leak eval mode into whatever runs next (e.g. training).
        model.train(was_training)

    if num_batches == 0:
        return float('nan')

    test_step_loss = test_epoch_loss / num_batches
    return test_step_loss

In [11]:
# Timestamp shared by every checkpoint written during this run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists before spending time on training.
os.makedirs('models', exist_ok=True)

# Best test loss seen so far. It must live outside the loop; resetting it each
# epoch would make every epoch look like an improvement.
best_vloss = float('inf')

for epoch in range(epochs):
    train_loss = train_epoch(epoch)
    test_loss = evaluate()

    print(f'Epoch {epoch}: Train Loss {train_loss}, Test Loss {test_loss}')

    # Checkpoint only when the test loss improves on the best one so far.
    if test_loss < best_vloss:
        best_vloss = test_loss
        model_path = f'models/model_{timestamp}_{epoch}'
        torch.save(model.state_dict(), model_path)

0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
73it [01:38,  1.34s/it]


KeyboardInterrupt: 