In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import pandas as pd
from datasets import load_dataset, load_metric
from sklearn.utils import shuffle
import torch 
import mlflow
import numpy as np
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
metric = load_metric('rouge')

  metric = load_metric('rouge')


In [3]:
model_name = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [4]:
train_small = pd.read_csv('data\\train_small.csv', index_col=0)
val = pd.read_csv('data\\eval.csv', index_col=0)

train_pairs = list(zip(train_small.source, train_small.target))
eval_pairs = list(zip(val.source, val.target))
print(train_small.shape[0], val.shape[0])

20480 2560


In [5]:
class SimplificationDataset(torch.utils.data.Dataset):
    def __init__(self, pairs, tokenizer, max_length):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        source_text, target_text = self.pairs[index]
        source_encoding = self.tokenizer(source_text, truncation=True, max_length=self.max_length, padding="max_length")
        target_encoding = self.tokenizer(target_text, truncation=True, max_length=self.max_length, padding="max_length")

        return {
            "input_ids": source_encoding["input_ids"],
            "attention_mask": source_encoding["attention_mask"],
            "labels": target_encoding["input_ids"],
        }

In [6]:
max_length = 512
train_dataset = SimplificationDataset(train_pairs, tokenizer, max_length=max_length)
eval_dataset = SimplificationDataset(eval_pairs, tokenizer, max_length=max_length)

In [7]:
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [8]:
def compute_rouge(pred):
    predictions, labels = pred

    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)

    res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k, v in res.items()}

In [12]:
args = Seq2SeqTrainingArguments(
    'bart', 
    learning_rate=2e-5,
    per_device_train_batch_size= 6,
    per_device_eval_batch_size= 6,
    num_train_epochs=3,
    predict_with_generate=True,
    evaluation_strategy='steps',
    logging_steps=1000,
    save_steps=10000,
    fp16=True
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_rouge
)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [13]:
experiment_name = "bart"
mlflow.set_experiment(experiment_name=experiment_name)
current_experiment = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = current_experiment['experiment_id']

In [14]:
with mlflow.start_run(experiment_id=exp_id, run_name='first_test'):
    # Логируем параметры
    mlflow.log_param('model_name', model_name)
    #mlflow.log_param('max_length', max_length)
    mlflow.log_param('train_data_size', len(train_dataset))
    mlflow.log_param('batch_size', args.per_device_train_batch_size)
    
    trainer.train()
    
    # Логируем метрики
    metrics = trainer.evaluate()
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # Сохраняем модель и логируем путь к ней
    output_dir = './bart'
    trainer.save_model(output_dir)
    mlflow.log_artifacts(output_dir, artifact_path='models')
    
mlflow.end_run()

***** Running training *****
  Num examples = 20480
  Num Epochs = 3
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 10242
  Number of trainable parameters = 406291456


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1000,0.1905,0.193545,1.7087,0.3646,1.7216,1.7342,19.9941
2000,0.1763,0.189292,1.6124,0.3437,1.6235,1.641,19.9941
3000,0.1885,0.174543,1.6782,0.2891,1.7,1.7073,19.9879
4000,0.1864,0.171757,1.6316,0.3177,1.626,1.6433,19.9926
5000,0.1788,0.167954,1.6852,0.3523,1.6756,1.6962,19.9883
6000,0.1771,0.164487,1.8609,0.3385,1.8675,1.8903,19.9922
7000,0.1728,0.162993,1.6121,0.3034,1.6099,1.6253,19.9891
8000,0.1687,0.161706,1.6207,0.2604,1.6266,1.6318,19.9914
9000,0.1667,0.159812,1.8588,0.3314,1.8679,1.8867,19.991
10000,0.1647,0.158522,1.752,0.2708,1.7636,1.7783,19.9945


***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
***** Running Evaluation *****
  Num examples = 2560
  Batch size = 6
Saving model checkpoint to bart\checkpoint-10000
Configuration saved in bart\checkpoint-10000\config.json
Model weights saved in bart\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in bart\checkpoint-10000\tokenizer_config.json
Special tokens file saved in bart\checkpoint-10000\speci

Saving model checkpoint to ./bart
Configuration saved in ./bart\config.json
Model weights saved in ./bart\pytorch_model.bin
tokenizer config file saved in ./bart\tokenizer_config.json
Special tokens file saved in ./bart\special_tokens_map.json
