Based on the monoT5 fine-tuning script from PyGaggle: https://github.com/vjeronymo2/pygaggle/blob/master/pygaggle/run/finetune_monot5.py

In [1]:
import os
import json
import pickle
import numpy as np

import pandas as pd
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset
import jsonlines
import argparse

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-05-01 22:03:34.983692: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the training queries from a tab-separated file. Column names are
# supplied here, so the first row of the file is read as data.
queries = pd.read_csv(
    "../data/publish/English/Queries/train.tsv",
    sep="\t",
    names=["idx", "text"],
)

In [3]:
# NOTE: despite the "_ids" suffix, these variables hold the query *texts*
# (the `text` column), not the `idx` column.
all_ids = queries.text.to_list()

# Sequential 60% / 20% / 20% split into train / validation / test.
# np.split returns NumPy arrays, so the three results are arrays, not lists.
n_queries = len(all_ids)
split_points = [int(0.6 * n_queries), int(0.8 * n_queries)]
train_ids, validation_ids, test_ids = np.split(all_ids, split_points)

In [4]:
# ---- Experiment configuration ----

# Starting checkpoint for fine-tuning.
base_model = "t5-base"
# base_model = "castorini/monoT5-base-msmarco"

# Input triples file and output directory for intermediate checkpoints.
triples_path = "../data/passages.jsonl"
output_model_path = "../data/models/monoT5-WT/train/checkpoints/"

# Optimisation settings. Samples per optimizer step on one device:
# per_device_train_batch_size * gradient_accumulation_steps = 6 * 16 = 96.
epochs = 10
learning_rate = 3e-4  # original
per_device_train_batch_size = 6
gradient_accumulation_steps = 16

# Values passed to the training arguments as save_steps / logging_steps.
save_every_n_steps = 1000
logging_steps = 100

In [5]:
class MonoT5Dataset(Dataset):
    """Map-style dataset that renders samples as monoT5 prompts.

    Each element of ``data`` is indexed positionally: element 0 is the query,
    element 1 the document and element 2 the target label.
    """

    def __init__(self, data):
        """Store the indexable collection of samples without copying it."""
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the prompt text and target label for sample ``idx``.

        Returns:
            dict with key ``'text'`` (the formatted prompt) and key
            ``'labels'`` (element 2 of the sample, passed through unchanged).
        """
        record = self.data[idx]
        query, document = record[0], record[1]
        prompt = f'Query: {query} Document: {document} Relevant:'
        return dict(text=prompt, labels=record[2])

In [6]:
# Use the GPU when one is visible and fall back to the CPU otherwise, so that
# the `.to(device)` calls in the collate function do not fail on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed torch's global RNG. As the last expression of the cell, the returned
# Generator object is what the notebook displays.
torch.manual_seed(123)

<torch._C.Generator at 0x7f01491f30d0>

In [7]:
# The tokenizer is always the stock 't5-base' one, independent of `base_model`.
# NOTE(review): presumably deliberate so that alternative checkpoints reuse the
# T5 vocabulary — confirm before pointing this at `base_model`.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Sequence-to-sequence model initialised from the configured checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(base_model)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# Build (query, passage, label) training samples from the triples file.
# Every non-blank line is parsed as JSON and indexed positionally: element 0 is
# the query, element 1 the passage labelled 'true' and element 2 the passage
# labelled 'false'.

# Set membership is O(1). Testing `in` against the NumPy array produced by
# np.split would rescan every training query for every line of the file.
train_query_set = set(map(str, train_ids))

# Upper bound on the line index read from the triples file.
max_triples = 6.4e5 * epochs

train_samples = []
with open(triples_path, 'r', encoding="utf-8") as fIn:
    for num, line in enumerate(fIn):
        if num > max_triples:
            break
        # Skip blank lines, including whitespace-only ones that json.loads
        # would reject.
        if not line.strip():
            continue
        triple = json.loads(line)
        query = triple[0]

        # limit to train queries
        if query not in train_query_set:
            continue

        train_samples.append((query, triple[1], 'true'))
        train_samples.append((query, triple[2], 'false'))
        

In [9]:
# Number of (query, passage, label) samples collected, duplicates included.
len(train_samples)

28260

In [10]:
# Number of distinct samples; a value below len(train_samples) means the
# collected list contains duplicates.
len(set(train_samples))

27649

In [11]:
# Drop duplicate samples while keeping first-seen order. dict keys preserve
# insertion order, whereas list(set(...)) yields an order that depends on the
# per-process string hash seed and therefore changes between kernel restarts,
# which makes the dataset order (and thus training batches) irreproducible.
train_samples = list(dict.fromkeys(train_samples))

In [12]:
# Sample count after de-duplication.
len(train_samples)

27649

In [13]:
def smart_batching_collate_text_only(batch):
    """Tokenize a batch of dataset items into model-ready tensors.

    Args:
        batch: sequence of dicts, each with a ``'text'`` prompt and a
            ``'labels'`` target string.

    Returns:
        The tokenizer output for the prompts (padded to the longest prompt in
        the batch, truncated to 512 tokens) with an added ``'labels'`` entry
        holding the token ids of the targets. Every entry is moved to the
        module-level ``device``.
    """
    prompts = []
    for item in batch:
        prompts.append(item['text'])
    encoded = tokenizer(
        prompts,
        max_length=512,
        truncation='longest_first',
        padding=True,
        return_tensors='pt',
    )

    # Targets are tokenized without padding.
    # NOTE(review): this presumably relies on every target in the batch
    # tokenizing to the same length — confirm.
    targets = []
    for item in batch:
        targets.append(item['labels'])
    target_encoding = tokenizer(targets, return_tensors='pt')
    encoded['labels'] = target_encoding['input_ids']

    # Move every tensor in the batch onto the configured device.
    for key in encoded.keys():
        encoded[key] = encoded[key].to(device)

    return encoded

In [14]:
# Wrap the de-duplicated samples in the Dataset handed to the trainer.
dataset_train = MonoT5Dataset(train_samples)

In [15]:
# Keyword arguments for the training run, grouped by concern.
training_config = {
    # Output location and checkpoint / logging cadence.
    "output_dir": output_model_path,
    "do_train": True,
    "save_strategy": "steps",
    "save_steps": save_every_n_steps,
    "logging_steps": logging_steps,
    # Batch geometry.
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    # Optimisation. A single pass over the dataset is made here; the `epochs`
    # setting is only used to cap the number of lines read from the triples file.
    "learning_rate": learning_rate,
    "weight_decay": 5e-5,
    "num_train_epochs": 1,
    "warmup_steps": 0,  # alternative tried: warmup_steps=1000
    "adafactor": True,
    # NOTE(review): this differs from the earlier torch.manual_seed(123) and
    # presumably supersedes it once the trainer is created — confirm.
    "seed": 1,
    "disable_tqdm": False,
    "load_best_model_at_end": False,
    "predict_with_generate": True,
    # NOTE(review): pinning is presumably disabled because the collate function
    # already returns tensors on `device` — confirm.
    "dataloader_pin_memory": False,
    "remove_unused_columns": False,
}

train_args = Seq2SeqTrainingArguments(**training_config)



In [16]:
# Assemble the trainer from the model, the training arguments, the training
# dataset and the custom collate function that tokenizes raw text per batch.
trainer = Seq2SeqTrainer(
    args=train_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    data_collator=smart_batching_collate_text_only,
)

In [17]:
# The final model goes into a sibling directory of the checkpoints folder:
# the "checkpoints" path component is swapped for "monoT5-WT".
final_model_dir = output_model_path.replace("checkpoints", "monoT5-WT")

# Run fine-tuning, then persist the final weights and the trainer state.
trainer.train()
trainer.save_model(final_model_dir)
trainer.save_state()

Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.
 35%|███▍      | 100/288 [02:42<04:47,  1.53s/it]

{'loss': 0.6259, 'learning_rate': 0.00019583333333333331, 'epoch': 0.35}


 69%|██████▉   | 200/288 [05:28<02:11,  1.50s/it]

{'loss': 0.3517, 'learning_rate': 9.166666666666667e-05, 'epoch': 0.69}


100%|██████████| 288/288 [08:02<00:00,  1.67s/it]


{'train_runtime': 482.7091, 'train_samples_per_second': 57.279, 'train_steps_per_second': 0.597, 'train_loss': 0.44365525907940334, 'epoch': 1.0}
