In [1]:
import torch
from torch.autograd import Variable
from transformers import (
    AutoTokenizer,
    Adafactor,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, load_metric

import wandb
wandb.login()
%env WANDB_PROJECT=WebNLG_exp

%load_ext jupyter_black

# Use gpu
if torch.cuda.is_available():
    dev = torch.device("cuda:0")
    print("Running on the GPU")
else:
    dev = torch.device("cpu")
    print("Running on the CPU")

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base").to(dev)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mliux2[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=WebNLG_exp


Running on the GPU


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [2]:
dataset = load_dataset(
    "csv",
    data_files={
        "train": "../datasets/train_set.csv",
        "dev": "../datasets/dev_set.csv",
        "test": "../datasets/test_set.csv",
    },
)


def data_map(examples):
    inputs = ["WebNLG: " + eg + "</s>" for eg in examples["triple"]]
    outputs = [eg + "</s>" for eg in examples["sentence"]]
    model_inputs = tokenizer(
        inputs, max_length=tokenizer.model_max_length, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            outputs, max_length=tokenizer.model_max_length, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


dataset = dataset.map(
    data_map,
    batched=True,
    batch_size=8,
    num_proc=32,
)

100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 1235.19it/s]


In [3]:
bleu = load_metric("bleu")
ter = load_metric("ter")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    b = bleu.compute(predictions=predictions, references=labels)
    t = ter.compute(predictions=predictions, references=labels)
    return {"BLEU": b, "TER": t}

In [4]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    debug="underflow_overflow",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    report_to="all",
    eval_accumulation_steps=10,
    # weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
)
optimizer = Adafactor(
    model.parameters(),
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),
    #     compute_metrics=compute_metrics,
)

Using amp half precision backend


In [5]:
wandb.jupyter.__IFrame = wandb.jupyter.IFrame(
    opts={"height": 420, "workspace": False, "quiet": False}
)
trainer.train()
wandb.finish()

wandb.jupyter.__IFrame = None

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: triple, sentence. If triple, sentence are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 104799
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6550
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


output has nans

Detected inf/nan during batch_number=2
Last 21 forward frames:
abs min  abs max  metadata
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.0.layer.1.DenseReluDense.relu_act ReLU
6.54e-05 3.95e+02 input[0]
0.00e+00 2.38e+02 output
                  encoder.block.

ValueError: DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. Please scroll up above this traceback to see the activation values prior to this event.