In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.DataFrame({"col1": ["texto a resumir"]*30,
                    "col2": ["resumen"]*30})


In [5]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = row['col1']
        target_text = row['col2']
        inputs = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        targets = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        return {'input_ids': inputs['input_ids'].squeeze(), 'attention_mask': inputs['attention_mask'].squeeze(), 'labels': targets['input_ids'].squeeze()}


In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-multilingual-cased', 'bert-base-multilingual-cased')
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

loading file vocab.txt from cache at C:\Users\josep/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\josep/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\josep/.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\fdfce55e83dbed325647a63e7e1f5de19f0382ba\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "int

Generation config file not found, using a generation config created from the model config.
Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config
Generate config GenerationConfig {
  "transformers_version": "4.26.1"
}



In [18]:
train_data, val_data = data.iloc[:int(len(data) * 0.8)], data.iloc[int(len(data) * 0.8):]  # Ajusta la proporción de división según sea necesario
max_length = 128  # Ajusta la longitud máxima de secuencia según sea necesario

train_dataset = CustomDataset(train_data, tokenizer, max_length)
val_dataset = CustomDataset(val_data, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Ajusta el número de épocas según sea necesario
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=100,
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    fp16=False,  # Omitir o cambiar a False si no se desea usar la precisión mixta (half-precision)
    warmup_steps=200,
    save_total_limit=3
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


In [21]:
trainer.train()

***** Running training *****
  Num examples = 24
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9
  Number of trainable parameters = 384194811


Epoch,Training Loss,Validation Loss
1,No log,13.807752
2,No log,11.810097
3,No log,10.204957


***** Running Evaluation *****
  Num examples = 6
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=9, training_loss=15.035142686631945, metrics={'train_runtime': 83.3262, 'train_samples_per_second': 0.864, 'train_steps_per_second': 0.108, 'total_flos': 11047096535040.0, 'train_loss': 15.035142686631945, 'epoch': 3.0})

In [23]:
def prepare_input(text, tokenizer, max_length=512):
    input_tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )
    return input_tokens

def decode_predictions(predictions, tokenizer):
    decoded_output = tokenizer.decode(predictions[0], skip_special_tokens=True)
    return decoded_output


In [32]:
input_text = "texto a resumir"

# Tokeniza el texto de entrada
input_tokens = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Genera los ID de entrada del decodificador (inicialmente configurados en None)
decoder_input_ids = torch.zeros((1, 1), dtype=torch.long)

# Realiza la predicción usando el modelo
with torch.no_grad():
    outputs = model(input_ids=input_tokens['input_ids'], attention_mask=input_tokens['attention_mask'], decoder_input_ids=decoder_input_ids)
    predictions = outputs.logits.argmax(-1)

# Decodifica las predicciones en texto legible
decoded_output = tokenizer.decode(predictions[0], skip_special_tokens=True)

print("Texto de entrada:", input_text)
print("Texto extraído:", decoded_output)


Texto de entrada: texto a resumir
Texto extraído: ##ини
