In [1]:
import pandas as pd
import torch

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEP_TOKEN = "<sep>"
HL_TOKEN = "<hl>"
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")

tokenizer.add_tokens(SEP_TOKEN)
tokenizer.add_tokens(HL_TOKEN)
len(tokenizer)

36098

In [2]:
with open("../data/train.json", "r") as f:
    train_df = pd.read_json(f, orient="index")

In [3]:
train_df.columns

Index(['context', 'claim', 'verdict', 'evidence', 'domain',
       'evidence_predict'],
      dtype='object')

In [4]:
def highlight_fn(row):
    row["context"] = row["context"].replace(row["evidence_predict"], f"{HL_TOKEN} " + row["evidence_predict"] + f" {HL_TOKEN}")
    return row

In [5]:
train_df = train_df.apply(highlight_fn, axis=1)

In [13]:
def preprocess_fn(examples):
    inputs = tokenizer(
        examples["context"], max_length=2048, truncation=True, padding=True
    )

    labels = tokenizer(
        text_target=examples["verdict"], max_length=16, truncation=True, padding=True
    )
    
    return {"input_ids": inputs["input_ids"], "labels": labels["input_ids"]}

In [14]:
dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(preprocess_fn, batched=True)



In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")
model.resize_token_embeddings(len(tokenizer))
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36098, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36098, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

training_args = Seq2SeqTrainingArguments("tmp/",
                                      do_train=True,
                                      do_eval=False,
                                      num_train_epochs=30,
                                      optim="adamw_torch",
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_total_limit=3,
                                      #eval_steps=1,
                                      #evaluation_strategy="steps",
                                      # evaluation_strategy="no",
                                      fp16=True,
                                      )

In [10]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/202260 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 768.00 MiB. GPU 0 has a total capacty of 7.76 GiB of which 294.81 MiB is free. Including non-PyTorch memory, this process has 6.24 GiB memory in use. Of the allocated memory 5.61 GiB is allocated by PyTorch, and 468.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF