## Requisitos

* Python 3.9
* Na pasta raiz, crie e ative o ambiente:
```sh
python3 -m venv env_fase2_llms
source env_fase2_llms/bin/activate
```
* Volte para esta pasta
```sh
cd fase2/llms/aulas-1-a-4
```
* Instale as dependências
```sh
pip install -U pip
pip install -r requirements.txt
```

A lib "accelerator" serve para utilizar a GPU, especialmente quando há multiplos dispositivos do tipo disponíveis.

No Apple Silicon, tentar usar esta lib causa problemas de incompatibilidade.

In [1]:
# ! pip install datasets
# ! pip install -U 'accelerate==0.27.0'
# ! pip install -U 'transformers==4.41.2'
# ! pip install -U 'torch==2.3.0'

In [2]:
from datasets import load_dataset
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
import torch
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# from accelerate import Accelerator
from torch.utils.data import DataLoader

# Inicializar Accelerator
# accelerator = Accelerator(mixed_precision="fp16")


# Carregar o dataset - Reduzido para 10% para economizar memória
all_dataset = load_dataset("VanessaSchenkel/translation-en-pt", field="data")

dataset = (
    all_dataset["train"]
    .shuffle(seed=42)
    .select(range(int(len(all_dataset["train"]) * 0.10)))  # Reduzido de 50% para 10%
)


# Função para preparar os dados no formato correto
def preprocess_function(examples):
    # Extrair inputs e targets
    inputs = [ex["portuguese"] for ex in examples["translation"]]
    targets = [ex["english"] for ex in examples["translation"]]

    # Tokenizar inputs com max_length reduzido para economizar memória
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)

    # Tokenizar targets e process labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)["input_ids"]

    # Ensure labels match the input length and shape
    model_inputs["labels"] = labels

    return model_inputs


# Carregar o tokenizer e o modelo BERT
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

# Definir o decoder_start_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight',

In [3]:
# Tokenizar o dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 26048/26048 [00:03<00:00, 6658.38 examples/s]


In [4]:
# Criação de DataLoader para treinamento
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets, batch_size=16, shuffle=True)

In [5]:
# Configurar os parâmetros de treinamento com ajuste adicional
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
    # fp16=True,  # Habilitar mixed precision training
    gradient_accumulation_steps=2,
)


# Inicializar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets
)

Conferindo se vai rodar em Apple Metal (MPS)

In [6]:
trainer.args.device

device(type='mps')

Agora sim, executando o treinamento do modelo.

In [7]:
# Treinar o modelo
trainer.train()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss
1000,0.8913
2000,0.4799
3000,0.4056
4000,0.3617
5000,0.3176
6000,0.2944


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids

TrainOutput(global_step=6512, training_loss=0.4462749237510438, metrics={'train_runtime': 2787.3997, 'train_samples_per_second': 9.345, 'train_steps_per_second': 2.336, 'total_flos': 3996594035343360.0, 'train_loss': 0.4462749237510438, 'epoch': 1.0})

In [8]:
# Salvar o modelo
trainer.save_model("./results/model/bert-translation-en-pt-v1")
tokenizer.save_pretrained("./results/model/bert-tokenizer-translation-en-pt-v1")

('./results/model/bert-tokenizer-translation-en-pt-v1/tokenizer_config.json',
 './results/model/bert-tokenizer-translation-en-pt-v1/special_tokens_map.json',
 './results/model/bert-tokenizer-translation-en-pt-v1/vocab.txt',
 './results/model/bert-tokenizer-translation-en-pt-v1/added_tokens.json')