In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/paperspace/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


## Loading models

### Loading BART
* https://huggingface.co/facebook/bart-large

In [4]:
from transformers import BartTokenizer, AutoModelForSeq2SeqLM

model_name = "bart-large"
model_repo = f"facebook"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
print(type(model))

model_name: bart-large,model_id: facebook/bart-large,model_path: facebook_bart-large
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


## Loading dataset

In [19]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

# train_ratio = 0.001
# test_ratio = 0.0001
train_ratio = 0.1
test_ratio = 0.01
# train_ratio = 0.9
# test_ratio = 0.1

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
# dataset = dataset.map(
#     lambda item: {
#         "input": item["src"],
#         "reference": item["tgt"],
#         "references": [item["tgt"]],
#     },
#     remove_columns=["src", "tgt", "_id"],
# )
print(dataset)
print(dataset["train"][0])

Map: 100%|██████████| 7076/7076 [00:00<00:00, 11590.98 examples/s]


train set {'gec', 'paraphrase', 'coherence', 'neutralize', 'clarity', 'simplification'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 7076
})


Map: 100%|██████████| 711/711 [00:00<00:00, 10218.72 examples/s]

test set {'gec', 'paraphrase', 'coherence', 'neutralize', 'clarity', 'simplification'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 711
})
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 7076
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 711
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.', 'references': ['For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean wate




In [20]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length train: {max_input_length}")
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max_input_length test: {max_input_length}")

max_input_length train: 136
max_input_length test: 241


In [21]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, list in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(list)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/gec: 2030
train/paraphrase: 1589
train/coherence: 1061
train/neutralize: 1127
train/clarity: 125
train/simplification: 1144


In [22]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, list in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(list)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

test/gec: 204
test/paraphrase: 159
test/coherence: 107
test/neutralize: 113
test/clarity: 13
test/simplification: 115


In [23]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

print(f"pad_token_id: {tokenizer.pad_token_id}")
print(f"padding_side: {tokenizer.padding_side}")


def process_dataset(batch):
    model_inputs = tokenizer(batch["input"], max_length=1024, padding=True)
    labels = tokenizer(text_target=batch["reference"], max_length=1024, padding=True).input_ids
    # model_inputs = tokenizer(batch["input"], padding=True)
    # labels = tokenizer(text_target=batch["reference"], padding=True).input_ids
    # model_inputs = tokenizer(batch["input"])
    # labels = tokenizer(text_target=batch["reference"]).input_ids
    model_inputs["labels"] = labels
    return model_inputs


# processed_dataset = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10, remove_columns=train_dataset.column_names)
# processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10)
print(processed_dataset)
print(len(processed_dataset["train"]["input_ids"][0]), processed_dataset["train"]["input_ids"][0])
print(len(processed_dataset["train"]["attention_mask"][0]), processed_dataset["train"]["attention_mask"][0])
print(len(processed_dataset["train"]["labels"][0]), processed_dataset["train"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", model=model)
# data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

# dataloader = DataLoader(processed_dataset, batch_size=5, collate_fn=data_collator)
# for batch in dataloader:
#     print(batch)
#     break

pad_token_id: 2
padding_side: right


Map: 100%|██████████| 7076/7076 [00:03<00:00, 2268.29 examples/s]
Map: 100%|██████████| 711/711 [00:00<00:00, 1343.43 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7076
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 711
    })
})
49 [0, 47583, 70, 25187, 45816, 9126, 31, 42, 2788, 35, 286, 1246, 6, 749, 19, 10, 319, 9, 38905, 64, 8470, 763, 3899, 49, 10348, 7, 712, 49, 40752, 1212, 8, 634, 20500, 7, 694, 2382, 514, 7, 5, 10348, 4, 2, 2, 2, 2, 2, 2, 2, 2]
49 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
40 [0, 2709, 1246, 6, 749, 19, 10, 319, 9, 38905, 64, 7891, 49, 10348, 7, 712, 49, 40752, 1212, 8, 304, 20500, 7, 694, 2382, 514, 7, 5, 10348, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


## Pre-test inference

In [24]:
max_batch = 2
max_length = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(input.input_ids, max_length=max_length)
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {processed}")


>> gec
input: ["Fix grammaticality: How many times do we turn before the television and we watch different commercial ad beyond different enterprises that are permanently appealing the youngster to have a new toy or a new doll and put a pressure on the parent's to they will have, most over the time, to spending money on something that will bring to the child a momentaneous happiness.", 'Fix grammaticality of the sentence: I thinking you should go on holidays for your friend before you can has more fun without upon your parents In fact with your friends you can did what you are keen on: swimming, going shopping and going to the disco, instead of having a boring holiday with your parent, who want you to did the same thing they do I hope you will choose what is the best by you']
result: ['"It\'s not about the money. <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

## Training

In [25]:
%reload_ext autoreload

training_model_repo = f"iliazlobin"
training_model_name: str = f"{model_name}-coedit"
training_model_id = f"{training_model_repo}/{training_model_name}"
training_model_checkpoint = f"{training_model_id}"
training_model_path = f"{training_model_repo}_{training_model_name}"
print(
    f"training_model_name: {training_model_name}, "
    f"training_model_id: {training_model_id}, "
    f"training_model_path: {training_model_path}"
)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.95
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

training_model_name: bart-large-coedit, training_model_id: iliazlobin/bart-large-coedit, training_model_path: iliazlobin_bart-large-coedit
total/used/cuda/res/ram (Gb): 79.33/3.34/1.52/2.01/5.45
total/used/available memory (Gb): 79.33/3.34/75.99
recommended/actual fraction: 0.96/0.95


In [31]:
training_model = model
print(type(training_model))
print(training_model.config)

total_params = sum(p.numel() for p in training_model.parameters())
total_trainable_params = sum(p.numel() for p in training_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")


rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    sacreblue_score = sacreblue_metric.compute(predictions=decoded_preds, references=decoded_labels)
    # sari_score = sari_metric.compute(
    #     sources=processed_samples["input"],
    #     predictions=processed_samples["processed"],
    #     references=processed_samples["references"],
    # )
    em_score = em_metric.compute(predictions=decoded_preds, references=decoded_labels)

    report = {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "rougeLsum": rouge_score["rougeLsum"],
        "sacreblue": sacreblue_score["score"],
        # "sari.sari": 0,
        "em": em_score["exact_match"],
    }

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    report["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in report.items()}

<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "facebook/bart-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
   

In [16]:
%%capture
%load_ext tensorboard
%tensorboard --logdir "model-{training_model_path}-train/runs"

In [29]:
train_path = f"model-{training_model_path}-train"
print(f"train_path: {train_path}")

train_size = len(processed_dataset["train"])
batch_size = 250
gradient_accumulation_steps = 4

per_epoch_steps = train_size / (batch_size * gradient_accumulation_steps)

max_steps = 10
epochs = max_steps / per_epoch_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, max_steps: {max_steps}, epochs: {epochs}")

# if max_steps < per_epoch_steps:
#     raise Exception(f"Training doesn't cover the entire training dataset with {train_size} samples")

epochs = 2
epoch_total_steps = epochs * per_epoch_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}")


train_path: model-iliazlobin_bart-large-coedit-train
train_size: 7076, batch_size: 250, per_epoch_steps: 7.076, max_steps: 10, epochs: 1.4132278123233466
train_size: 7076, batch_size: 250, per_epoch_steps: 7.076, epochs: 2, epoch_total_steps: 14.152


In [30]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    output_dir=train_path,
    learning_rate=2e-5,
    weight_decay=0.01,
    # warmup_ratio=0.05,
    # optim="paged_adamw_8bit",
    fp16=True,
    predict_with_generate=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=epochs,
    # max_steps=max_steps,
    # warmup_steps=2,
    logging_steps=5,
    save_strategy="steps",
    save_steps=5,
    evaluation_strategy="steps",
    eval_steps=5,
    report_to="tensorboard",
)


trainer = Seq2SeqTrainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=args,
)
print(type(trainer))

model.config.use_cache = False
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<class 'transformers.trainer_seq2seq.Seq2SeqTrainer'>




Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Sacreblue,Em,Gen Len
5,1.5798,0.624166,0.4945,0.3664,0.4554,0.4561,17.6643,0.0,18.0
10,0.6914,0.54254,0.5306,0.4056,0.4907,0.4907,21.4514,0.0,18.0




> <transformers.trainer_utils.EvalPrediction object at 0x7f5601612bf0>


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


> <transformers.trainer_utils.EvalPrediction object at 0x7f55fbe01a80>


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=14, training_loss=0.981952394757952, metrics={'train_runtime': 102.5809, 'train_samples_per_second': 137.959, 'train_steps_per_second': 0.136, 'total_flos': 3426154282303488.0, 'train_loss': 0.981952394757952, 'epoch': 1.93})

In [15]:
print(type(trainer))
print(type(trainer.model))

trainer.save_model(f"model-{training_model_path}")
# trainer.push_to_hub(training_model_id, token=os.getenv("HUGGING_FACE_TOKEN"))
trained_model = trainer.model

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


<class 'transformers.trainer_seq2seq.Seq2SeqTrainer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


## Test inference

In [33]:
model = trainer.model

max_batch = 2
max_length = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(input.input_ids, max_length=max_length)
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {processed}")


>> gec
input: ["Fix grammaticality: How many times do we turn before the television and we watch different commercial ad beyond different enterprises that are permanently appealing the youngster to have a new toy or a new doll and put a pressure on the parent's to they will have, most over the time, to spending money on something that will bring to the child a momentaneous happiness.", 'Fix grammaticality of the sentence: I thinking you should go on holidays for your friend before you can has more fun without upon your parents In fact with your friends you can did what you are keen on: swimming, going shopping and going to the disco, instead of having a boring holiday with your parent, who want you to did the same thing they do I hope you will choose what is the best by you']




result: ['"If you have a child, you will have to have him. <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>', 'In fact with your friends you can did what you are keen on: swimming, going shopping and going to the disco, instead of having a boring holiday with your parent, who want you to did the same thing.']

>> paraphrase
input: ['Rewrite this sentence: Researchers and national competent authorities have developed a wide range of collections and related databases on genetic resources, breeds and plant varieties.', 'Write a paraphrased version of the sentence: The Commission report will be accompanied, if appropriate, by a legislative proposal.']
result: ['Researchers and national competent authorities have developed a wide range of collections and related databases on genetic resources, breeds and plant varieties.', 'The report will be accompanied, <pad> <pad> 